In [1]:
import os
import pandas as pd

In [2]:
# --- Step 1: Load full dataset ---
file_path = 'C:/Users/anhuy/CS-5806/Project/data/raw'
os.chdir(file_path)

df_full = pd.read_csv('final_uhie_ulti.csv', low_memory=False)

In [3]:
# --- Step 2: Preserve lat/lon for post-processing (mapping) ---
lat_lon = df_full[['lat', 'lon']]

# --- Step 3: Define columns to drop ---
cols_to_drop = ['GEOID', 'lat', 'lon', 'area', 'geometry']

# --- Step 4: Add original index for spatial traceability ---
df_full['original_index'] = df_full.index

# --- Step 5: Drop unnecessary columns ---
df = df_full.drop(columns=cols_to_drop)

print(f"Initial full dataset shape: {df.shape}")

# --- Step 6: Replace bad entries with NaN ---
df = df.replace('-', pd.NA)

# --- Step 7: Convert everything to numeric ---
df = df.apply(pd.to_numeric, errors='coerce')

# --- Step 8: Rolling Window Imputation (on full dataset) ---
rolling_window_size = 10  # 5 before + center + 5 after
df_filled = df.copy()

for col in df_filled.columns:
    df_filled[col] = df_filled[col].fillna(
        df_filled[col].rolling(window=rolling_window_size, center=True, min_periods=1).mean()
    )

print(f"After rolling window fill, dataset shape: {df_filled.shape}")

# --- Step 9: Full Cleaned Dataset (drop NaNs) ---
num_remaining_nan_full = df_filled.isnull().any(axis=1).sum()
print(f"Number of rows with missing values after rolling fill (full df_filled): {num_remaining_nan_full}")

df_full_cleaned = df_filled.dropna()
lat_lon_full_cleaned = lat_lon.loc[df_full_cleaned.index]
original_index_full_cleaned = df_full.loc[df_full_cleaned.index, 'original_index']

print(f"Shape of full cleaned dataset after dropping NaNs: {df_full_cleaned.shape}")

# Separate features and target
y_full_cleaned = df_full_cleaned['UHII']
X_full_cleaned = df_full_cleaned.drop(columns=['UHII', 'original_index'])

# --- Step 10: Subsampling from df_filled ---
subsample_rate = 10
df_subsampled = df_filled.iloc[::subsample_rate, :]
lat_lon_subsampled = lat_lon.loc[df_subsampled.index]
original_index_subsampled = df_full.loc[df_subsampled.index, 'original_index']

print(f"After subsampling from df_filled, dataset shape: {df_subsampled.shape}")

# --- Step 11: Drop any NaNs after subsampling ---
num_remaining_nan_subsampled = df_subsampled.isnull().any(axis=1).sum()
print(f"Number of rows with missing values after subsampling: {num_remaining_nan_subsampled}")

if num_remaining_nan_subsampled > 0:
    df_subsampled = df_subsampled.dropna()
    lat_lon_subsampled = lat_lon_subsampled.loc[df_subsampled.index]
    original_index_subsampled = original_index_subsampled.loc[df_subsampled.index]
    print(f"After dropping NaNs from subsampled dataset, new shape: {df_subsampled.shape}")
else:
    print("No NaN rows to drop after subsampling.")

# Separate features and target
y_subsampled = df_subsampled['UHII']
X_subsampled = df_subsampled.drop(columns=['UHII', 'original_index'])

print(f"Final feature matrix after subsampling shape: {X_subsampled.shape}")
print(f"Final target vector after subsampling shape: {y_subsampled.shape}")

# --- Step 12: Save datasets ---
save_path = 'C:/Users/anhuy/CS-5806/Project/data/processed'
os.makedirs(save_path, exist_ok=True)

# Save full cleaned
X_full_cleaned.to_csv(os.path.join(save_path, 'X_full_cleaned.csv'), index=False)
y_full_cleaned.to_csv(os.path.join(save_path, 'y_full_cleaned.csv'), index=False)
lat_lon_full_cleaned.to_csv(os.path.join(save_path, 'lat_lon_full_cleaned.csv'), index=False)
original_index_full_cleaned.to_csv(os.path.join(save_path, 'original_index_full_cleaned.csv'), index=False)

# Save subsampled cleaned
X_subsampled.to_csv(os.path.join(save_path, 'X_subsampled.csv'), index=False)
y_subsampled.to_csv(os.path.join(save_path, 'y_subsampled.csv'), index=False)
lat_lon_subsampled.to_csv(os.path.join(save_path, 'lat_lon_subsampled.csv'), index=False)
original_index_subsampled.to_csv(os.path.join(save_path, 'original_index_subsampled.csv'), index=False)

print("Saved all files: X, y, lat_lon, and original_index for both full cleaned and subsampled datasets!")

Initial full dataset shape: (232651, 32)
After rolling window fill, dataset shape: (232651, 32)
Number of rows with missing values after rolling fill (full df_filled): 6333
Shape of full cleaned dataset after dropping NaNs: (226318, 32)
After subsampling from df_filled, dataset shape: (23266, 32)
Number of rows with missing values after subsampling: 628
After dropping NaNs from subsampled dataset, new shape: (22638, 32)
Final feature matrix after subsampling shape: (22638, 30)
Final target vector after subsampling shape: (22638,)
Saved all files: X, y, lat_lon, and original_index for both full cleaned and subsampled datasets!
