In [1]:
import pandas as pd
import numpy as np

In [3]:
# Loading tract-level environmental data
tract_df = pd.read_csv(r"C:\Users\Elias\Final Project\Cleaned output data files\final_avg_EJ_LA_2018_2022.csv", dtype={"ID": str})

# Loading crosswalk file
crosswalk = pd.read_csv(r"C:\Users\Elias\Final Project\Crosswalk data files\ZIP_TRACT_122022.csv", dtype={"TRACT": str, "ZIP": str, "RES_RATIO": float})

crosswalk.shape

(172154, 8)

In [None]:
# Format tract IDs to 11 digits (removing last digit for block group → tract)
tract_df["ID"] = tract_df["ID"].astype(str).str.strip()
tract_df['TRACT'] = tract_df['ID'].str[:11]  # Keep first 11 digits

# Format crosswalk
crosswalk['TRACT'] = crosswalk['TRACT'].astype(str).str.strip()
crosswalk['ZIP'] = crosswalk['ZIP'].astype(str).str.strip()

print(f"Tract-level environmental data: {len(tract_df)} rows")
print(f"Unique tracts: {tract_df['TRACT'].nunique()}")
print(f"Crosswalk data: {len(crosswalk)} rows")

In [19]:
# Filter crosswalk to LA County only (FIPS 06037)
crosswalk_la = crosswalk[crosswalk['TRACT'].str.startswith('06037')].copy()

print(f"Crosswalk after LA filter: {len(crosswalk_la)} rows")
print(f"Unique tracts in crosswalk: {crosswalk_la['TRACT'].nunique()}")
print(f"Unique ZIP codes: {crosswalk_la['ZIP'].nunique()}")

Crosswalk after LA filter: 4052 rows
Unique tracts in crosswalk: 2342
Unique ZIP codes: 482


In [20]:
# Merge tract environmental data with crosswalk
merged = tract_df.merge(crosswalk_la, on="TRACT", how="inner")

print(f"\nAfter merging tract data with crosswalk:")
print(f"Total rows: {len(merged)}")
print(f"Unique tracts: {merged['TRACT'].nunique()}")
print(f"Unique ZIP codes: {merged['ZIP'].nunique()}")

print(f"\nColumns in merged data: {list(merged.columns)}")


After merging tract data with crosswalk:
Total rows: 11288
Unique tracts: 2341
Unique ZIP codes: 482

Columns in merged data: ['ID', 'avg_traffic_pct', 'avg_diesel_pm', 'avg_cancer_risk', 'avg_resp_hazard', 'avg_ej_index', 'TRACT', 'ZIP', 'USPS_ZIP_PREF_CITY', 'USPS_ZIP_PREF_STATE', 'RES_RATIO', 'BUS_RATIO', 'OTH_RATIO', 'TOT_RATIO']


In [21]:
# Environmental variables to aggregate
env_vars = ["avg_traffic_pct","avg_diesel_pm","avg_cancer_risk","avg_resp_hazard","avg_ej_index"]

# Create weighted versions
for col in env_vars:
    merged[col + "_weighted"] = merged[col] * merged["RES_RATIO"]

# Aggregate to ZIP level using weighted averages
weighted_sum = merged.groupby("ZIP")[[col + "_weighted" for col in env_vars]].sum()
weight_sum = merged.groupby("ZIP")["RES_RATIO"].sum()

In [22]:
# Environmental variables to aggregate
env_vars = ["avg_traffic_pct","avg_diesel_pm","avg_cancer_risk","avg_resp_hazard","avg_ej_index"]

# For each ZIP code, calculate: SUM(variable × res_ratio) / SUM(res_ratio)
zip_level_data = []

for zip_code in merged['ZIP'].unique():
    zip_data = merged[merged['ZIP'] == zip_code]
    
    zip_result = {'ZIP_CODE': zip_code}
    
    for var in env_vars:
        # Calculate weighted average: SUM(value × weight) / SUM(weight)
        weighted_sum = (zip_data[var] * zip_data['RES_RATIO']).sum()
        total_weight = zip_data['RES_RATIO'].sum()
        
        zip_result[var] = weighted_sum / total_weight if total_weight > 0 else np.nan
    
    zip_level_data.append(zip_result)

# Convert to DataFrame
zip_results = pd.DataFrame(zip_level_data)

print(f"Total ZIP codes: {len(zip_results)}")
print(f"\nFirst 5 ZIP codes:")
print(zip_results.head())

Calculating ZIP-level weighted averages...

✓ ZIP-level aggregation complete!
Total ZIP codes: 482

First 5 ZIP codes:
  ZIP_CODE  avg_traffic_pct  avg_diesel_pm  avg_cancer_risk  avg_resp_hazard  \
0    91042        65.539718       0.388687        35.871372         0.740438   
1    91043        74.585335       0.447717        37.362315         0.834184   
2    91214        74.068027       0.387477        34.997439         0.690094   
3    91040        71.159041       0.386752        34.895258         0.676467   
4    91352        83.744344       0.658342        40.595179         0.996814   

   avg_ej_index  
0      0.353111  
1      0.575068  
2      0.282986  
3      0.322207  
4      0.614218  


In [23]:
# KNN Imputation for missing values

from sklearn.impute import KNNImputer

# Check missing values before imputation
print("Missing values before imputation:")
missing_before = zip_results[env_vars].isnull().sum()
missing_pct = (missing_before / len(zip_results)) * 100
for var, count, pct in zip(env_vars, missing_before, missing_pct):
    if count > 0:
        print(f"  {var}: {count} ({pct:.1f}%)")

if missing_before.sum() > 0:
    # Apply KNN imputation
    print("\nApplying KNN imputation with 10 neighbors...")
    imputer = KNNImputer(n_neighbors=10, weights='distance')
    zip_results[env_vars] = imputer.fit_transform(zip_results[env_vars])
    
    print("Missing values after imputation:", zip_results[env_vars].isnull().sum().sum())
else:
    print("No missing values found")

Missing values before imputation:
  avg_traffic_pct: 45 (9.3%)
  avg_diesel_pm: 45 (9.3%)
  avg_cancer_risk: 45 (9.3%)
  avg_resp_hazard: 45 (9.3%)
  avg_ej_index: 45 (9.3%)

Applying KNN imputation with 10 neighbors...
Missing values after imputation: 0


In [25]:
# Save final ZIP-level environmental data
output_file = r"C:\Users\Elias\Final Project\Cleaned output data files\final_ZIP_level_EJ_weighted.csv"
zip_results.to_csv(output_file, index=False)