In [57]:
import glob
import os
import re

import pandas as pd

In [58]:
# Folder holding the raw EJSCREEN CSV downloads.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook elsewhere.
folder = r"C:\Users\Elias\OneDrive\Desktop\EJscreen data"

# Find all CSV files in the folder. glob returns matches in no guaranteed
# order, so sort them to make the processing order reproducible.
files = sorted(glob.glob(os.path.join(folder, "*.csv")))

files


['C:\\Users\\Elias\\OneDrive\\Desktop\\EJscreen data\\EJSCREEN_2019_USPR.csv',
 'C:\\Users\\Elias\\OneDrive\\Desktop\\EJscreen data\\EJSCREEN_2020_USPR.csv',
 'C:\\Users\\Elias\\OneDrive\\Desktop\\EJscreen data\\EJSCREEN_2021_USPR.csv',
 'C:\\Users\\Elias\\OneDrive\\Desktop\\EJscreen data\\EJSCREEN_2022_Supplemental_with_AS_CNMI_GU_VI.csv',
 'C:\\Users\\Elias\\OneDrive\\Desktop\\EJscreen data\\EJSCREEN_Full_USPR_2018.csv']

In [59]:
# Source column -> analysis-friendly name for every indicator we keep.
# NOTE(review): EJSCREEN's data dictionary appears to describe VULEOPCT as the
# Demographic Index rather than an EJ index — confirm the "ej_index" label.
rename_map = {
    "P_PTRAF": "traffic_pct",    # Traffic proximity percentile
    "DSLPM": "diesel_pm",        # Diesel PM
    "CANCER": "cancer_risk",     # Cancer risk
    "RESP": "resp_hazard",       # Respiratory hazard
    "VULEOPCT": "ej_index",      # Labelled "EJ index" in this notebook
}

# Columns to keep from each file: the ID plus every indicator listed above,
# in the same order as the mapping.
cols_needed = ["ID", *rename_map]


In [60]:
# Build one LA County extract per yearly EJSCREEN file and collect them in
# df_list for concatenation in the next cell.

# A standalone four-digit year (19xx / 20xx) that is not part of a longer run
# of digits. Taking "the first four digits in the name" breaks as soon as a
# file name carries any other digit before the year (e.g. "v2_2022").
year_pattern = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")

df_list = []

for file in files:
    file_name = os.path.basename(file)
    print("Processing:", file_name)

    # YEAR comes from the file name; fail loudly instead of tagging rows with
    # an empty or garbled year.
    year_match = year_pattern.search(file_name)
    if year_match is None:
        raise ValueError(f"Cannot determine the year from file name: {file_name}")
    year = year_match.group()

    # Read everything as text (keeps the leading zero of the ID) and load only
    # the columns we need instead of every column in the national file.
    df = pd.read_csv(
        file,
        dtype=str,
        usecols=lambda col: col in cols_needed,
        low_memory=False,
    )

    # Ensure ID is string (missing IDs become "nan" and are filtered out below)
    df["ID"] = df["ID"].astype(str)

    # Keep Los Angeles County rows (IDs starting with "06037").
    # NOTE(review): the IDs are padded to 12 characters later in the notebook,
    # which looks like block-group rather than tract identifiers — confirm.
    df_la = df[df["ID"].str.startswith("06037")]

    # Keep only the needed columns that exist in this year's file, in the
    # order given by cols_needed, and report any that are absent.
    existing_cols = [c for c in cols_needed if c in df_la.columns]
    missing_cols = [c for c in cols_needed if c not in df_la.columns]
    if missing_cols:
        print("  Columns not found in this file:", missing_cols)

    # Rename to analysis-friendly names and tag every row with its year.
    # .assign returns a new frame, so nothing is written onto a filtered slice.
    df_la = (
        df_la[existing_cols]
        .rename(columns=rename_map)
        .assign(YEAR=year)
    )

    df_list.append(df_la)

Processing: EJSCREEN_2019_USPR.csv
Processing: EJSCREEN_2020_USPR.csv
Processing: EJSCREEN_2021_USPR.csv
Processing: EJSCREEN_2022_Supplemental_with_AS_CNMI_GU_VI.csv
Processing: EJSCREEN_Full_USPR_2018.csv


In [61]:
# Stack the per-file extracts into one table (fresh 0..n-1 index) and order
# the rows by YEAR, ascending.
final_df_la = pd.concat(df_list, ignore_index=True).sort_values("YEAR")

# Sanity checks: overall size, rows per year, and a preview of the first rows
print(final_df_la.shape)
print(final_df_la["YEAR"].value_counts())
final_df_la.head(10)


(32291, 7)
YEAR
2022    6591
2018    6425
2019    6425
2020    6425
2021    6425
Name: count, dtype: int64


Unnamed: 0,ID,traffic_pct,diesel_pm,cancer_risk,resp_hazard,ej_index,YEAR
32290,60379903000,2.62398958718,,,,0.0,2018
28001,60372671003,94.5230563579,1.7646188352,62.2659313342,3.63433626283,0.38050432819,2018
28002,60372671004,93.7239369808,1.7646188352,62.2659313342,3.63433626283,0.0690376569038,2018
28003,60372672001,97.1720457754,1.9700322375,62.6903990256,3.72178337069,0.240613777342,2018
28004,60372672002,95.628985014,1.9700322375,62.6903990256,3.72178337069,0.180047225502,2018
28005,60372672003,98.4694066636,1.9700322375,62.6903990256,3.72178337069,0.270300333704,2018
28006,60372673001,99.8683056816,2.2297021761,67.8412992137,4.10184593457,0.543951042285,2018
28007,60372673002,97.8820068657,2.2297021761,67.8412992137,4.10184593457,0.403755127801,2018
28008,60372674021,94.2021246971,1.9556723774,66.683872333,3.79813530248,0.305177111717,2018
28009,60372674022,95.7785272505,1.9556723774,66.683872333,3.79813530248,0.33912793579,2018


In [62]:
# Save the combined LA County table to a CSV in the current working directory.
# index=False keeps the DataFrame index out of the file.
final_df_la.to_csv("combined_EJ_LA_2018_2022.csv", index=False)


In [63]:
# Reload the combined dataset from disk so the cells below work from the
# saved file rather than from in-memory state (avoids data loss).
import pandas as pd

# Load the combined LA dataset saved above. ID is read as text: letting pandas
# infer an integer dtype would strip the leading zero of the "06037..." IDs.
df = pd.read_csv("combined_EJ_LA_2018_2022.csv", dtype={"ID": str})


In [64]:
# Indicator columns that must be numeric before averaging
num_cols = ["traffic_pct", "diesel_pm", "cancer_risk", "resp_hazard", "ej_index"]

# Convert each indicator to numeric; values that cannot be parsed become NaN
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")


In [65]:
# Average every indicator over all rows that share an ID; ID stays a regular
# column instead of becoming the index.
avg_df = df.groupby("ID", as_index=False)[num_cols].mean()

In [66]:
# Tidy the per-ID averages in one pass:
#   1. ID as a 12-character, zero-padded string (restores a leading zero that
#      is lost whenever ID has been parsed as a number),
#   2. indicator columns prefixed with "avg_",
#   3. rows in ascending ID order.
# The former mid-cell preview of the ID column was removed: an expression that
# is not the last statement of a cell is never displayed.
avg_df = (
    avg_df
    .assign(ID=avg_df["ID"].astype(str).str.zfill(12))
    .rename(columns={
        "traffic_pct": "avg_traffic_pct",
        "diesel_pm": "avg_diesel_pm",
        "cancer_risk": "avg_cancer_risk",
        "resp_hazard": "avg_resp_hazard",
        "ej_index": "avg_ej_index",
    })
    .sort_values("ID")
)

In [67]:
# Count missing values for each column of the per-ID averages
avg_df.isna().sum()


ID                  0
avg_traffic_pct     4
avg_diesel_pm      14
avg_cancer_risk    14
avg_resp_hazard    14
avg_ej_index        0
dtype: int64

In [68]:
# Display the rows that have at least one missing value
rows_with_missing = avg_df.isna().any(axis=1)
avg_df[rows_with_missing]

Unnamed: 0,ID,avg_traffic_pct,avg_diesel_pm,avg_cancer_risk,avg_resp_hazard,avg_ej_index
2570,60372653012,,0.404676,30.0,0.4,0.0
3316,60373200001,3.446239,,,,0.0
7680,60379800011,6.964159,,,,0.0
7681,60379800021,35.297493,,,,0.0
7684,60379800051,34.630826,,,,0.0
7685,60379800061,32.297493,,,,0.0
7686,60379800071,26.297493,,,,0.0
7691,60379800121,,0.53489,30.0,0.4,0.0
7697,60379800181,99.204236,,,,0.0
7699,60379800201,16.297493,,,,0.0


In [69]:
# Missing-value strategy used in this cell:
#   * avg_traffic_pct      -> mean of the adjacent rows (previous / next)
#   * columns listed below -> county-wide median

# Columns whose gaps are filled with the county-wide median
median_cols = [
    "avg_diesel_pm",
    "avg_cancer_risk",
    "avg_resp_hazard",
]

# Neighbor-based imputation for avg_traffic_pct. "Neighbors" are the adjacent
# rows of the series (rows are ordered by ID in this notebook), not
# geographically nearest areas.

def fill_with_neighbors(series):
    """Fill missing values with the mean of the adjacent non-missing values.

    The series is walked by position, so the result does not depend on the
    index labels (the index may be non-sequential or non-numeric).

    Parameters
    ----------
    series : pandas.Series
        Values to impute. The input is not modified.

    Returns
    -------
    pandas.Series
        A copy of ``series`` with the same index. Each missing entry is
        replaced by the mean of its previous and next entries, using whichever
        of the two is available. An entry with no usable neighbor is left
        missing. Values are filled front to back, so an entry filled earlier
        can serve as the "previous" neighbor of the next missing entry.
    """
    filled = series.copy()
    last_pos = len(filled) - 1

    for pos in range(len(filled)):
        if not pd.isna(filled.iloc[pos]):
            continue

        neighbors = []

        # previous neighbor (may itself have been imputed on an earlier step)
        if pos > 0 and not pd.isna(filled.iloc[pos - 1]):
            neighbors.append(filled.iloc[pos - 1])

        # next neighbor
        if pos < last_pos and not pd.isna(filled.iloc[pos + 1]):
            neighbors.append(filled.iloc[pos + 1])

        # fill only if at least one neighbor exists
        if neighbors:
            filled.iloc[pos] = sum(neighbors) / len(neighbors)

    return filled

# Impute missing traffic percentiles from the adjacent rows
avg_df["avg_traffic_pct"] = fill_with_neighbors(avg_df["avg_traffic_pct"])

# Impute the remaining indicators with their county-wide medians, computed
# per column over all rows
column_medians = avg_df[median_cols].median()
avg_df[median_cols] = avg_df[median_cols].fillna(column_medians)


In [70]:
# Check remaining missing values after imputation (per-column counts)
avg_df.isna().sum()


ID                 0
avg_traffic_pct    0
avg_diesel_pm      0
avg_cancer_risk    0
avg_resp_hazard    0
avg_ej_index       0
dtype: int64

In [71]:
# Write the imputed per-ID averages to a CSV in the current working directory
# (index=False leaves the DataFrame index out), then report (rows, columns).
avg_df.to_csv("final_avg_EJ_LA_2018_2022.csv", index=False)
print(avg_df.shape)

(7719, 6)
