In [17]:
import pandas as pd
import numpy as np

In [18]:
aggregated_birth_data = pd.read_csv(r"C:\Users\Elias\Final Project\Cleaned output data files\cleaned_birth_data_aggregated.csv")
final_ZIP_level_EJ_weighted = pd.read_csv(r"C:\Users\Elias\Final Project\Cleaned output data files\final_ZIP_level_EJ_weighted.csv")
merged_svi_acs_data = pd.read_csv(r"C:\Users\Elias\Final Project\Cleaned output data files\merged_svi_acs_data.csv")

print(f"Birth data (aggregated): {aggregated_birth_data.shape}")
print(f"EJ environmental data: {final_ZIP_level_EJ_weighted.shape}")
print(f"ACS + SVI data: {merged_svi_acs_data.shape}")

Birth data (aggregated): (267, 4)
EJ environmental data: (482, 6)
ACS + SVI data: (1803, 19)


In [19]:
# Renaming all ZIP  code columns to the same name
final_ZIP_level_EJ_weighted = final_ZIP_level_EJ_weighted.rename(columns={'ZIP_CODE': 'ZIP_Code_of_Residence'})
merged_svi_acs_data = merged_svi_acs_data.rename(columns={'FIPS': 'ZIP_Code_of_Residence'})

# Merging
master_data = aggregated_birth_data.merge(final_ZIP_level_EJ_weighted, on='ZIP_Code_of_Residence', how='inner')\
                       .merge(merged_svi_acs_data, on='ZIP_Code_of_Residence', how='inner')

In [20]:
# Check ZIP code overlap
birth_zips = set(aggregated_birth_data['ZIP_Code_of_Residence'])
ejscreen_zips = set(final_ZIP_level_EJ_weighted['ZIP_Code_of_Residence'])
common_zips = birth_zips.intersection(ejscreen_zips)

print(f"Birth ZIPs: {len(birth_zips)}")
print(f"EJScreen ZIPs: {len(ejscreen_zips)}")
print(f"Common ZIPs: {len(common_zips)}")

# Check SVI/ACS ZIP coverage
birth_zips = set(aggregated_birth_data['ZIP_Code_of_Residence'])
svi_acs_zips = set(merged_svi_acs_data['ZIP_Code_of_Residence'])  
common_with_svi = birth_zips.intersection(svi_acs_zips)

print(f"Birth data ZIPs: {len(birth_zips)}")
print(f"SVI/ACS data ZIPs: {len(svi_acs_zips)}")
print(f"Common ZIPs: {len(common_with_svi)}")

Birth ZIPs: 267
EJScreen ZIPs: 482
Common ZIPs: 264
Birth data ZIPs: 267
SVI/ACS data ZIPs: 1803
Common ZIPs: 253


In [21]:
print("MASTER DATA INFO")

# 1. Basic Information
print("\n1. BASIC INFORMATION")
print(f"Shape: {master_data.shape} (rows, columns)")

# 2. First few rows
print("\n2. FIRST 5 ROWS")
print(master_data.head())

# 3. Column names and data types
print("\n3. COLUMN NAMES AND DATA TYPES")
print(master_data.dtypes)

# 4. Duplicate rows check
print("\n4. DUPLICATE ROWS CHECK")
duplicate_count = master_data.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")
if duplicate_count > 0:
    print("Warning: Duplicates found!")
else:
    print(" No duplicates")
# 5. Merge success check
print("\n5. MERGE SUCCESS CHECK")
print(f"Original birth data rows: {len(aggregated_birth_data)}")
print(f"Original EJ data rows: {len(final_ZIP_level_EJ_weighted)}")
print(f"Original SVI/ACS data rows: {len(merged_svi_acs_data)}")
print(f"Final master data rows: {len(master_data)}")
print(f"Rows lost in merge: {len(aggregated_birth_data) - len(master_data)}")


MASTER DATA INFO

1. BASIC INFORMATION
Shape: (252, 27) (rows, columns)

2. FIRST 5 ROWS
   ZIP_Code_of_Residence  Total_Births_2018_2022  Total_LBW_Count_2018_2022  \
0                  90001                  3936.0                      308.0   
1                  90002                  3839.5                      324.5   
2                  90003                  5583.5                      439.5   
3                  90004                  2642.0                      211.0   
4                  90005                  1537.0                      140.0   

   LBW_Rate  avg_traffic_pct  avg_diesel_pm  avg_cancer_risk  avg_resp_hazard  \
0  7.825203        79.588893       1.016899        45.873711         1.351807   
1  8.451621        84.287899       1.010436        44.793717         1.358213   
2  7.871407        91.147885       1.117200        45.927417         1.416787   
3  7.986374        87.255322       0.921646        47.512351         1.215364   
4  9.108653        83.062001   

In [22]:
# 1. Missing ZIP codes check
print("\n1. Missing ZIP CODES Check")
print(f"   Total ZIP codes in master data: {len(master_data)}")
print(f"   Unique ZIP codes: {master_data['ZIP_Code_of_Residence'].nunique()}")
print(f"   One row per ZIP: {len(master_data) == master_data['ZIP_Code_of_Residence'].nunique()}")

# 2. Missing values analysis
print("\n2. Missing Values Analysis")
missing_summary = pd.DataFrame({
    'Variable': master_data.columns,
    'Missing_Count': master_data.isnull().sum(),
    'Missing_Percent': (master_data.isnull().sum() / len(master_data) * 100).round(2),
    'Data_Type': master_data.dtypes
})
missing_summary = missing_summary[missing_summary['Missing_Count'] > 0].sort_values('Missing_Percent', ascending=False)

if len(missing_summary) > 0:
    print(missing_summary.to_string(index=False))
    high_missing = missing_summary[missing_summary['Missing_Percent'] > 40]
    if len(high_missing) > 0:
        print(high_missing[['Variable', 'Missing_Percent']].to_string(index=False))
    else:
        print("   None")
else:
    print(" No missing values")

# 3. Checking data ranges
print("\n3. Data Ranges checking")

# Traffic exposure (should be 0-100)
if 'avg_traffic_pct' in master_data.columns:
    traffic_min = master_data['avg_traffic_pct'].min()
    traffic_max = master_data['avg_traffic_pct'].max()
    traffic_ok = (traffic_min >= 0) and (traffic_max <= 100)
    print(f"   Traffic exposure: {traffic_min:.2f} - {traffic_max:.2f}")
    print(f"   {'OK' if traffic_ok else 'WARNING'} Range is {'valid' if traffic_ok else 'INVALID (should be 0-100)'}")

# LBW rate (should be 0-100)
if 'LBW_Rate' in master_data.columns:
    lbw_min = master_data['LBW_Rate'].min()
    lbw_max = master_data['LBW_Rate'].max()
    lbw_ok = (lbw_min >= 0) and (lbw_max <= 100)
    print(f"   LBW Rate: {lbw_min:.2f}% - {lbw_max:.2f}%")
    print(f"   {'OK' if lbw_ok else 'WARNING'} Range is {'valid' if lbw_ok else 'INVALID (should be 0-100%)'}")
    print(f"   Note: Typical LBW rate in US is 8-9%")

# Poverty rate (should be 0-100 or count)
if 'poverty_rate' in master_data.columns:
    pov_min = master_data['poverty_rate'].min()
    pov_max = master_data['poverty_rate'].max()
    print(f"   Poverty rate: {pov_min:.2f} - {pov_max:.2f}")
    if pov_max > 100:
        print(f"   Larger than 100; not percentage")
    else:
        print(f"   Range looks like percentage")

# SVI score (should be 0-1)
if 'svi_score' in master_data.columns:
    svi_min = master_data['svi_score'].min()
    svi_max = master_data['svi_score'].max()
    svi_ok = (svi_min >= 0) and (svi_max <= 1)
    print(f"   SVI Score: {svi_min:.4f} - {svi_max:.4f}")
    print(f"   {'OK' if svi_ok else 'WARNING'} Range is {'valid' if svi_ok else 'INVALID (should be 0-1)'}")


1. Missing ZIP CODES Check
   Total ZIP codes in master data: 252
   Unique ZIP codes: 252
   One row per ZIP: True

2. Missing Values Analysis
               Variable  Missing_Count  Missing_Percent Data_Type
median_household_income             14             5.56   float64
     homeownership_rate              8             3.17   float64
 educational_attainment              4             1.59   float64
               LBW_Rate              3             1.19   float64
   None

3. Data Ranges checking
   Traffic exposure: 7.59 - 99.29
   OK Range is valid
   LBW Rate: 0.00% - 100.00%
   OK Range is valid
   Note: Typical LBW rate in US is 8-9%
   Poverty rate: 0.00 - 28613.00
   Larger than 100; not percentage
   SVI Score: -999.0000 - 0.9977


In [23]:
# Calculate poverty percentage
master_data['poverty_rate_pct'] = (master_data['poverty_rate'] / master_data['total_race_population']) * 100
    
# Handle division by zero or invalid values
master_data['poverty_rate_pct'] = master_data['poverty_rate_pct'].replace([np.inf, -np.inf], np.nan)
print(f"\nCreated poverty_rate_pct:")
print(f"  Range: {master_data['poverty_rate_pct'].min():.2f}% - {master_data['poverty_rate_pct'].max():.2f}%")
print(f"  Mean: {master_data['poverty_rate_pct'].mean():.2f}%")
print(f"  Missing: {master_data['poverty_rate_pct'].isna().sum()}")
# Drop the old count variable, keep the percentage
master_data = master_data.drop(columns=['poverty_rate'])
master_data = master_data.rename(columns={'poverty_rate_pct': 'poverty_rate'})
print("\nReplaced poverty_rate (count) with poverty_rate (percentage)")


Created poverty_rate_pct:
  Range: 0.00% - 85.94%
  Mean: 12.91%
  Missing: 4

Replaced poverty_rate (count) with poverty_rate (percentage)


In [24]:
# Remove rows where svi_score is -999
print(f"Before removing invalid SVI: {len(master_data)} rows")
master_data = master_data[master_data['svi_score'] != -999]
print(f"After removing invalid SVI: {len(master_data)} rows")
print(f"Removed: {len(master_data[master_data['svi_score'] == -999])} rows")

# Drop rows with missing LBW_Rate (outcome variable)
print(f"\nBefore removing missing LBW_Rate: {len(master_data)} rows")
master_data = master_data.dropna(subset=['LBW_Rate'])
print(f"After removing missing LBW_Rate: {len(master_data)} rows")

# Imputing missing values
vars_to_impute = ['median_household_income', 'homeownership_rate', 'educational_attainment']

for var in vars_to_impute:
    if var in master_data.columns:
        missing_before = master_data[var].isna().sum()
        
        if missing_before > 0:
            median_value = master_data[var].median()
            master_data[var] = master_data[var].fillna(median_value)
            missing_after = master_data[var].isna().sum()
            
            print(f"\n{var}:")
            print(f"  Missing before: {missing_before}")
            print(f"  Imputed with median: {median_value:.2f}")
            print(f"  Missing after: {missing_after}")
        else:
            print(f"\n{var}: No missing values")

# Impute missing poverty_rate with median
missing_poverty = master_data['poverty_rate'].isna().sum()
print(f"\nPoverty rate missing: {missing_poverty}")

if missing_poverty > 0:
    poverty_median = master_data['poverty_rate'].median()
    master_data['poverty_rate'] = master_data['poverty_rate'].fillna(poverty_median)
    print(f"Imputed with median: {poverty_median:.2f}")
    print(f"Missing after imputation: {master_data['poverty_rate'].isna().sum()}")


# Final Data quality check

print("Final data quality check")

print(f"\nFinal dataset:")
print(f"  Total ZIP codes: {len(master_data)}")
print(f"  Total variables: {len(master_data.columns)}")

# Check for remaining missing values
missing_summary = pd.DataFrame({
    'Variable': master_data.columns,
    'Missing_Count': master_data.isnull().sum(),
    'Missing_Percent': (master_data.isnull().sum() / len(master_data) * 100).round(2)
})
missing_summary = missing_summary[missing_summary['Missing_Count'] > 0].sort_values('Missing_Percent', ascending=False)

print(f"\nRemaining missing values:")
if len(missing_summary) > 0:
    print(missing_summary.to_string(index=False))
else:
    print(" No missing values")

# Check for duplicates
duplicates = master_data.duplicated(subset=['ZIP_Code_of_Residence']).sum()
print(f"\nDuplicate ZIP codes: {duplicates}")
if duplicates == 0:
    print("  No duplicates")

Before removing invalid SVI: 252 rows
After removing invalid SVI: 244 rows
Removed: 0 rows

Before removing missing LBW_Rate: 244 rows
After removing missing LBW_Rate: 243 rows

median_household_income:
  Missing before: 5
  Imputed with median: 89110.00
  Missing after: 0

homeownership_rate: No missing values

educational_attainment: No missing values

Poverty rate missing: 0
Final data quality check

Final dataset:
  Total ZIP codes: 243
  Total variables: 27

Remaining missing values:
 No missing values

Duplicate ZIP codes: 0
  No duplicates


In [25]:
master_data.to_csv("master_data.csv", index=False)