In [11]:
import glob
import os
import re

import numpy as np
import pandas as pd

In [12]:
# Folder that holds the EJScreen CSV downloads. Set the EJSCREEN_DATA_DIR
# environment variable to point the notebook at a different location; when it
# is unset the original local path is used.
csv_folder = os.environ.get(
    "EJSCREEN_DATA_DIR",
    r"C:\Users\Elias\Final Project\EJ_screen_data_files",
)

In [13]:
# Source column -> analysis name for every indicator taken from the files.
# The trailing labels are the notebook author's descriptions of each field.
rename_map = {
    "P_PTRAF": "traffic_pct",    # Traffic proximity percentile
    "DSLPM": "diesel_pm",        # Diesel PM
    "CANCER": "cancer_risk",     # Cancer risk
    "RESP": "resp_hazard",       # Respiratory hazard
    "VULEOPCT": "ej_index",      # EJ index (NOTE(review): confirm against the EJScreen data dictionary)
}

# Columns kept from each file: the identifier plus every indicator above.
cols_needed = ["ID", *rename_map]

# Renamed indicator columns; these are the ones converted from string to float.
numeric_cols = list(rename_map.values())

In [14]:
def load_and_combine_files(folder=None):
    """Load every EJScreen CSV in a folder, keep the LA County rows, and stack them.

    For each file the header is inspected first; files without an ``ID``
    column, or with none of the indicator columns from ``cols_needed``, are
    skipped with a message. Only the needed columns are then read (as strings),
    rows whose ``ID`` starts with ``"06037"`` are kept, columns are renamed via
    ``rename_map`` and a ``YEAR`` column parsed from the file name is added.

    Parameters
    ----------
    folder : str, optional
        Directory scanned for ``*.csv`` files. Defaults to the notebook-level
        ``csv_folder``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file frames. Indicator values are still
        strings at this point.

    Raises
    ------
    FileNotFoundError
        If the folder contains no CSV files.
    ValueError
        If every file was skipped.
    """
    print("Step 1: Loading EJScreen Files")

    if folder is None:
        folder = csv_folder

    # glob returns files in filesystem order; sort so every run processes
    # (and reports) them in the same sequence.
    csv_files = sorted(glob.glob(os.path.join(folder, "*.csv")))
    print(f"\nFound {len(csv_files)} files:")
    for f in csv_files:
        print(f"  - {os.path.basename(f)}")

    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder}")

    df_list = []

    for file in csv_files:
        file_name = os.path.basename(file)
        print(f"\nProcessing: {file_name}")

        # Read only the header row first so the column checks below do not
        # require loading the full file.
        header = pd.read_csv(file, dtype=str, nrows=0).columns
        print(f"  Available columns: {list(header[:10])}...")  # Show first 10 columns

        if "ID" not in header:
            print(" 'ID' column not found, skipping this file")
            continue

        # Keep only needed columns that exist in this particular file
        existing_cols = [c for c in cols_needed if c in header]
        print(f"  Columns found: {existing_cols}")

        if len(existing_cols) < 2:  # At least ID + 1 data column
            print(" Not enough required columns found, skipping this file")
            continue

        # Take the first standalone 4-digit year (19xx/20xx) in the file name,
        # e.g. "2018" from "EJSCREEN_Full_USPR_2018.csv". Matching a whole
        # token avoids picking up unrelated digits elsewhere in the name.
        year_match = re.search(r"(?<!\d)(?:19|20)\d{2}(?!\d)", file_name)
        if year_match:
            year = year_match.group(0)
        else:
            year = "unknown"
            print("  Could not find a 4-digit year in the file name; YEAR set to 'unknown'")

        # Read as strings to avoid mixed-type inference and to preserve any
        # leading zeros in ID; usecols limits the read to the needed columns.
        df = pd.read_csv(file, dtype=str, usecols=existing_cols, low_memory=False)

        # Filter for LA County (ID starts with the FIPS prefix 06037)
        in_la_county = df["ID"].str.startswith("06037", na=False)

        # .loc/.rename/.assign each return a new frame, so adding YEAR never
        # writes into a view of the raw data.
        df = (
            df.loc[in_la_county, existing_cols]
            .rename(columns=rename_map)
            .assign(YEAR=year)
        )

        print(f"  Tracts found: {len(df)}")
        df_list.append(df)

    if not df_list:
        raise ValueError("No valid data files were processed")

    # Combine all years
    combined_df = pd.concat(df_list, ignore_index=True)
    print(f"\n Combined dataset: {combined_df.shape[0]} rows, {combined_df.shape[1]} columns")
    print(f"\nYears included:")
    print(combined_df["YEAR"].value_counts().sort_index())

    return combined_df

def convert_to_numeric(df):
    """Cast the indicator columns listed in ``numeric_cols`` to numbers.

    Columns that are absent from ``df`` are ignored. Values that cannot be
    parsed become NaN, and the number of NaNs introduced is reported per
    column. The frame is modified in place and also returned.
    """
    print("Step 2: Converting to numeric")

    columns_present = [name for name in numeric_cols if name in df.columns]

    for name in columns_present:
        na_before = df[name].isna().sum()
        df[name] = pd.to_numeric(df[name], errors="coerce")
        introduced = df[name].isna().sum() - na_before

        # Only report columns where coercion actually produced new NaNs.
        if introduced > 0:
            print(f"\n{name}:")
            print(f"  New NaN values from conversion: {introduced}")

    print("\n All numeric columns converted")
    return df

In [18]:
def calculate_tract_averages(df, value_cols=None):
    """Average each indicator per ``ID`` across all rows (i.e. across years).

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with an ``ID`` column and numeric indicator columns.
    value_cols : list of str, optional
        Columns to average. Defaults to the notebook-level ``numeric_cols``.
        Columns that are not in ``df`` are skipped with a message, matching
        how the loading and conversion steps tolerate absent indicators.

    Returns
    -------
    pandas.DataFrame
        One row per ``ID`` with each averaged column renamed to ``avg_<name>``.

    Raises
    ------
    KeyError
        If none of the requested columns are present in ``df``.
    """
    print("Step 3: Calculating Tract Averages (2018-2022)")

    if value_cols is None:
        value_cols = numeric_cols

    present = [c for c in value_cols if c in df.columns]
    missing = [c for c in value_cols if c not in df.columns]
    if missing:
        print(f" Columns not in data, skipped: {missing}")
    if not present:
        raise KeyError(f"None of the columns to average are present: {list(value_cols)}")

    # mean() skips NaN, so an ID missing a value in some years is averaged
    # over the years in which it does have one.
    avg_df = df.groupby("ID")[present].mean().reset_index()

    print(f" Unique tracts: {len(avg_df)}")

    # Renaming to indicate these are averages
    avg_df = avg_df.rename(columns={c: f"avg_{c}" for c in present})

    return avg_df

def format_tract_ids(df, width=12):
    """Return a copy of ``df`` with ``ID`` as a zero-padded string, sorted by ``ID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ID`` column. It is not modified.
    width : int, default 12
        Minimum length of the padded ID; shorter IDs are left-padded with
        zeros.

    Returns
    -------
    pandas.DataFrame
        New frame, sorted by ``ID`` with a fresh 0..n-1 index.
    """
    print("\n" + "="*70)
    print("Step 4: Formating Tract IDs")
    print("="*70)

    print(f"\nBefore formatting - ID examples:")
    print(df["ID"].head(3).tolist())

    # Convert to string and pad with zeros. assign() builds a new frame so the
    # caller's ID column is left as it was.
    formatted = df.assign(ID=df["ID"].astype(str).str.zfill(width))

    print(f"\nAfter formatting - ID examples:")
    print(formatted["ID"].head(3).tolist())

    # Sort by ID; later steps that look at neighbouring rows depend on this order.
    formatted = formatted.sort_values("ID").reset_index(drop=True)

    print(f"\n All tract IDs formatted and sorted")
    return formatted

In [16]:
def handle_missing_values(df):
    """Report and impute missing values in the averaged indicator columns.

    ``avg_traffic_pct`` is filled with ``fill_with_neighbors`` (which uses the
    values in adjacent rows, so ``df`` should already be sorted by ID);
    ``avg_diesel_pm``, ``avg_cancer_risk`` and ``avg_resp_hazard`` are filled
    with their column median. Any of these columns that is absent from ``df``
    is skipped with a message.

    NOTE(review): ``avg_ej_index`` is included in the before/after report but
    is not imputed by either method, so "Missing values remain" is printed
    whenever it has gaps. Confirm whether that is intentional.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-ID averages. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the imputed values.
    """
    print("Step 5: Handling Missing Values")
    expected_cols = ["avg_traffic_pct", "avg_diesel_pm", "avg_cancer_risk",
                     "avg_resp_hazard", "avg_ej_index"]

    # Work on a copy so re-running the cell starts from the same input.
    df = df.copy()

    avg_cols = [c for c in expected_cols if c in df.columns]
    absent = [c for c in expected_cols if c not in df.columns]
    if absent:
        print(f"\nColumns not in data, skipped: {absent}")

    print("\nMissing values before imputation:")
    for col in avg_cols:
        n_missing = df[col].isna().sum()
        pct_missing = (n_missing / len(df)) * 100
        print(f"  {col}: {n_missing} ({pct_missing:.2f}%)")

    # Neighbour-based imputation for missing traffic values
    if "avg_traffic_pct" in df.columns:
        print("\nApplying spatial nearest-neighbor imputation for avg_traffic_pct...")
        df["avg_traffic_pct"] = fill_with_neighbors(df["avg_traffic_pct"])

    # County-wide median for other variables
    print("\nApplying county-wide median imputation for other variables...")
    median_cols = ["avg_diesel_pm", "avg_cancer_risk", "avg_resp_hazard"]

    for col in median_cols:
        if col not in df.columns:
            continue
        median_val = df[col].median()
        n_filled = df[col].isna().sum()
        df[col] = df[col].fillna(median_val)
        print(f"  {col}: filled {n_filled} values with median = {median_val:.2f}")

    print("\nMissing values after imputation:")
    for col in avg_cols:
        n_missing = df[col].isna().sum()
        print(f"  {col}: {n_missing}")

    if df[avg_cols].isna().any().any():
        print("\nMissing values remain")
    else:
        print("\n No missing values")

    return df

def fill_with_neighbors(series):
    """Fill each missing value with the mean of its nearest observed neighbours.

    "Neighbour" is positional: the closest non-missing value before and the
    closest non-missing value after the gap in the Series' current order. It is
    not a geographic adjacency test, so the caller decides what "adjacent"
    means by how it sorts the data.

    Only originally observed values are used as donors. A gap with an observed
    value on just one side takes that value; a Series with no observed values
    is returned unchanged. Filling from observed values only keeps the result
    independent of processing order (a previously imputed value is never
    reused as if it had been observed).

    Parameters
    ----------
    series : pandas.Series
        Numeric values with NaN marking the gaps. It is not modified.

    Returns
    -------
    pandas.Series
        New Series with the same index and the gaps filled.
    """
    previous_observed = series.ffill()
    next_observed = series.bfill()

    # Row-wise mean skips NaN, so one-sided gaps fall back to the single
    # available neighbour and all-missing input stays NaN.
    neighbour_mean = pd.concat([previous_observed, next_observed], axis=1).mean(axis=1)

    return series.fillna(neighbour_mean)

In [17]:
# Execute pipeline: load -> numeric conversion -> per-ID averages ->
# ID formatting/sorting -> missing-value imputation.
combined_df = load_and_combine_files().pipe(convert_to_numeric)
avg_df = (
    combined_df
    .pipe(calculate_tract_averages)
    .pipe(format_tract_ids)
    .pipe(handle_missing_values)
)

# Save final output
avg_df.to_csv("final_avg_EJ_LA_2018_2022.csv", index=False)
print(avg_df.shape)

Step 1: Loading EJScreen Files

Found 5 files:
  - EJSCREEN_2019_USPR.csv
  - EJSCREEN_2020_USPR.csv
  - EJSCREEN_2021_USPR.csv
  - EJSCREEN_2022_Supplemental_with_AS_CNMI_GU_VI.csv
  - EJSCREEN_Full_USPR_2018.csv

Processing: EJSCREEN_2019_USPR.csv
  Available columns: ['OBJECTID', 'ID', 'ACSTOTPOP', 'ACSIPOVBAS', 'ACSEDUCBAS', 'ACSTOTHH', 'ACSTOTHU', 'MINORPOP', 'MINORPCT', 'LOWINCOME']...
  Columns found: ['ID', 'P_PTRAF', 'DSLPM', 'CANCER', 'RESP', 'VULEOPCT']
  Tracts found: 6425

Processing: EJSCREEN_2020_USPR.csv
  Available columns: ['OBJECTID', 'ID', 'ACSTOTPOP', 'ACSIPOVBAS', 'ACSEDUCBAS', 'ACSTOTHH', 'ACSTOTHU', 'MINORPOP', 'MINORPCT', 'LOWINCOME']...
  Columns found: ['ID', 'P_PTRAF', 'DSLPM', 'CANCER', 'RESP', 'VULEOPCT']
  Tracts found: 6425

Processing: EJSCREEN_2021_USPR.csv
  Available columns: ['OBJECTID', 'ID', 'ACSTOTPOP', 'ACSIPOVBAS', 'ACSEDUCBAS', 'ACSTOTHH', 'ACSTOTHU', 'ACSUNEMPBAS', 'MINORPOP', 'MINORPCT']...
  Columns found: ['ID', 'P_PTRAF', 'DSLPM', 'CANCER

In [None]:
# `shape` is an attribute holding a (rows, columns) tuple, not a method;
# calling it raises "TypeError: 'tuple' object is not callable".
combined_df.shape