In [41]:
import pandas as pd
import numpy as np

In [42]:
# Load the three input tables that are merged into the master data set below.
# The files are read in the order listed and bound to the names on the left.
cleaned_birth_data_summary, final_ZIP_level_EJ_weighted, merged_svi_acs_data = [
    pd.read_csv(csv_name)
    for csv_name in (
        'cleaned_birth_data_summary.csv',
        'final_ZIP_level_EJ_weighted.csv',
        'merged_svi_acs_data.csv',
    )
]

In [43]:
# Give the EJ and SVI/ACS tables the same key column name as the birth data
# ('ZIP_Code_of_Residence') so that all three can be joined on it.
final_ZIP_level_EJ_weighted = final_ZIP_level_EJ_weighted.rename(
    {'ZIP_CODE': 'ZIP_Code_of_Residence'}, axis='columns'
)
merged_svi_acs_data = merged_svi_acs_data.rename(
    {'FIPS': 'ZIP_Code_of_Residence'}, axis='columns'
)

# Two successive inner joins on the shared key: birth data with EJ data first,
# then that result with SVI/ACS data. Only keys present in all three survive.
birth_with_ej = pd.merge(
    cleaned_birth_data_summary,
    final_ZIP_level_EJ_weighted,
    on='ZIP_Code_of_Residence',
    how='inner',
)
master_data = pd.merge(
    birth_with_ej,
    merged_svi_acs_data,
    on='ZIP_Code_of_Residence',
    how='inner',
)

In [44]:
# Distinct key values in each table (every table uses 'ZIP_Code_of_Residence').
birth_zips = set(cleaned_birth_data_summary['ZIP_Code_of_Residence'])
ejscreen_zips = set(final_ZIP_level_EJ_weighted['ZIP_Code_of_Residence'])
svi_acs_zips = set(merged_svi_acs_data['ZIP_Code_of_Residence'])

# Keys the birth data shares with each of the other two tables.
common_zips = birth_zips & ejscreen_zips
common_with_svi = birth_zips & svi_acs_zips

# Birth vs. EJScreen coverage
print(
    f"Birth ZIPs: {len(birth_zips)}",
    f"EJScreen ZIPs: {len(ejscreen_zips)}",
    f"Common ZIPs: {len(common_zips)}",
    sep="\n",
)

# Birth vs. SVI/ACS coverage
print(
    f"Birth data ZIPs: {len(birth_zips)}",
    f"SVI/ACS data ZIPs: {len(svi_acs_zips)}",
    f"Common ZIPs: {len(common_with_svi)}",
    sep="\n",
)

Birth ZIPs: 234
EJScreen ZIPs: 482
Common ZIPs: 234
Birth data ZIPs: 234
SVI/ACS data ZIPs: 1803
Common ZIPs: 234


In [45]:
# Text report on the merged `master_data` frame: shape, preview, dtypes,
# missing values, duplicates, empty columns and a row-count comparison
# against the three input tables.
print("="*80)
print("MASTER DATA OVERVIEW")
print("="*80)

# 1. Basic Information
print("\n1. BASIC INFORMATION")
print(f"Shape: {master_data.shape} (rows, columns)")

# 2. First few rows
print("\n2. FIRST 5 ROWS")
print(master_data.head())

# 3. Column names and data types
print("\n3. COLUMN NAMES AND DATA TYPES")
print(master_data.dtypes)

# 4. Missing values analysis
print("\n4. MISSING VALUES ANALYSIS")
# Per-column null counts, computed once and reused for count and percentage.
null_counts = master_data.isnull().sum()
missing_data = pd.DataFrame({
    'Column': master_data.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(master_data) * 100).round(2),
    'Data_Type': master_data.dtypes
})
# Keep only columns that actually have nulls, worst first.
missing_data = missing_data[missing_data['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

# Print the table only when there is something to show; an empty frame would
# otherwise render as an "Empty DataFrame" dump ahead of the success message.
if missing_data.empty:
    print("✓ No missing values found!")
else:
    print(missing_data.to_string(index=False))

# 5. Duplicate rows check
print("\n5. DUPLICATE ROWS CHECK")
duplicate_count = master_data.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")
if duplicate_count > 0:
    print("Warning: Duplicates found!")
else:
    print("✓ No duplicates found!")

# 6. Check for any completely empty columns
print("\n6. COMPLETELY EMPTY COLUMNS CHECK")
empty_cols = master_data.columns[master_data.isnull().all()].tolist()
if empty_cols:
    print(f"Warning: These columns are completely empty: {empty_cols}")
else:
    print("✓ No completely empty columns!")

# 7. Data quality summary
print("\n7. DATA QUALITY SUMMARY")
print(f"Total columns: {len(master_data.columns)}")
print(f"Columns with missing values: {len(missing_data)}")
print(f"Complete rows (no missing values): {master_data.notna().all(axis=1).sum()}")
print(f"Rows with at least one missing value: {master_data.isnull().any(axis=1).sum()}")

# 8. Merge success check
print("\n8. MERGE SUCCESS CHECK")
print(f"Original birth data rows: {len(cleaned_birth_data_summary)}")
# The EJ row count comes from final_ZIP_level_EJ_weighted, the frame that was
# actually merged into master_data.
print(f"Original EJ data rows: {len(final_ZIP_level_EJ_weighted)}")
print(f"Original SVI/ACS data rows: {len(merged_svi_acs_data)}")
print(f"Final master data rows: {len(master_data)}")
print(f"Rows lost in merge: {len(cleaned_birth_data_summary) - len(master_data)}")


MASTER DATA OVERVIEW

1. BASIC INFORMATION
Shape: (1170, 29) (rows, columns)

2. FIRST 5 ROWS
   ZIP_Code_of_Residence  Year_of_Birth  Total_Births_All  LBW_Count  \
0                  90001           2018             913.0       77.0   
1                  90001           2019             821.0       71.0   
2                  90001           2020             762.5       48.5   
3                  90001           2021             729.5       57.5   
4                  90001           2022             710.0       54.0   

    LBW_Pct  High_Risk  avg_traffic_pct  avg_diesel_pm  avg_cancer_risk  \
0  8.433735          1        73.045085       0.929436         44.82829   
1  8.647990          1        73.045085       0.929436         44.82829   
2  6.360656          0        73.045085       0.929436         44.82829   
3  7.882111          0        73.045085       0.929436         44.82829   
4  7.605634          0        73.045085       0.929436         44.82829   

   avg_resp_hazard  ..

In [46]:
# Persist the merged table to disk; the row index is not written to the file.
master_data.to_csv(
    path_or_buf="master_data.csv",
    index=False,
)