In [25]:
import os
import glob
import pandas as pd

csv_folder = r"C:\Users\Elias\Final Project\Birth_Data_files"

# finds all CSV files
csv_files = glob.glob(os.path.join(csv_folder, "*.csv"))
print("Found files:", csv_files)


dfs = [pd.read_csv(file) for file in csv_files]

# Merge into one DataFrame if any files were found
if dfs:
    merged_df = pd.concat(dfs, ignore_index=True)
    print("Merged shape:", merged_df.shape)
    print(merged_df.head())
else:
    print("No CSV files found in", csv_folder)

Found files: ['C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (10).csv', 'C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (11).csv', 'C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (1a).csv', 'C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (2a).csv', 'C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (3a).csv', 'C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (4a).csv', 'C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (5a).csv', 'C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (6a).csv', 'C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (7a).csv', 'C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (8a).csv', 'C:\\Users\\Elias\\Final Project\\Birth_Data_files\\Cal-ViDa_Birth_11122025 (9a).csv', 'C:\\Users\\Elias\\Final Proj

In [26]:
#filtering rows with values = 'Unreliable/Unknown' and '<11' for Birthweight_Grams and Total_Births respectively
filtered = merged_df[
    (merged_df["Birthweight_Grams"].str.contains("Unreliable|Unknown", case=False, na=False))
    | (merged_df["Total_Births"] == "<11")
    | (merged_df["Total_Births"] == "0")
]

print(filtered)
# dropping unnecessary columns
merged_df = merged_df.drop(columns=["Type_of_Event", "Residence_or_Place_of_Birth"])

      Type_of_Event  Year_of_Birth Residence_or_Place_of_Birth  \
0             Birth           2022          Place of Residence   
1             Birth           2022          Place of Residence   
2             Birth           2022          Place of Residence   
3             Birth           2022          Place of Residence   
8             Birth           2022          Place of Residence   
...             ...            ...                         ...   
11449         Birth           2018          Place of Residence   
11450         Birth           2018          Place of Residence   
11451         Birth           2018          Place of Residence   
11452         Birth           2018          Place of Residence   
11459         Birth           2018          Place of Residence   

       ZIP_Code_of_Residence   Birthweight_Grams Total_Births  \
0                      91101     500 - 999 grams            0   
1                      91101   1000 - 1499 grams          <11   
2           

In [27]:
# removing unreliable/unknown values
merged_df = merged_df[
    ~merged_df["Birthweight_Grams"].str.contains("Unreliable|Unknown", case=False, na=False)
]
# removing last_data_refresh column because it has no use
merged_df = merged_df.drop(columns=["Last_Data_Refresh"])

#replacing values <11 with 5.5
merged_df["Total_Births"] = merged_df["Total_Births"].replace("<11", "5.5")

# replacing values 0 with "0"
merged_df["Total_Births"] = merged_df["Total_Births"].replace("0", "0")

# converting from string to numeric
merged_df["Total_Births"] = pd.to_numeric(merged_df["Total_Births"], errors="coerce")

In [28]:
print("Remaining Unreliable:", 
      merged_df["Birthweight_Grams"].str.contains("Unreliable|Unknown", na=False).sum())

print("Remaining '<11' or '0':",
      merged_df["Total_Births"].isin(["<11", "0"]).sum())

Remaining Unreliable: 0
Remaining '<11' or '0': 0


In [29]:
# Total Births Per Zip Code for each year
total_births = (
    merged_df.groupby(["ZIP_Code_of_Residence", "Year_of_Birth"])["Total_Births"]
    .sum()
    .reset_index(name="Annual_Births_By_Zip")
)


In [30]:
# Calculating Low-Birth Weight Count (<2500 g) for each Zip code
lbw_categories = ["500 - 999 grams","1000 - 1499 grams", "1500 - 1999 grams", "2000 - 2499 grams"]

lbw_df = merged_df[ merged_df["Birthweight_Grams"].isin(lbw_categories) ]

lbw_count = (lbw_df.groupby(["ZIP_Code_of_Residence", "Year_of_Birth"])["Total_Births"].sum().reset_index(name="LBW_Count"))


In [31]:
# Calculating LBW rate
final = total_births.merge(lbw_count, on=["ZIP_Code_of_Residence", "Year_of_Birth"], how="left")
final["Annual_Births_By_Zip"] = pd.to_numeric(final["Annual_Births_By_Zip"], errors="coerce")
final["LBW_rate"] = (final["LBW_Count"] / final["Annual_Births_By_Zip"]) * 100

In [32]:
# Aggregate to one row per zip code (sum across 2018-2022)
aggregated_birth_data = final.groupby('ZIP_Code_of_Residence').agg({
    'Annual_Births_By_Zip': 'sum',      # Total births across all years
    'LBW_Count': 'sum'                   # Total LBW births across all years
}).reset_index()

# Rename for clarity
aggregated_birth_data = aggregated_birth_data.rename(columns={
    'Annual_Births_By_Zip': 'Total_Births_2018_2022',
    'LBW_Count': 'Total_LBW_Count_2018_2022'
})

# Recalculate LBW rate
aggregated_birth_data['LBW_Rate'] = (
    aggregated_birth_data['Total_LBW_Count_2018_2022'] / 
    aggregated_birth_data['Total_Births_2018_2022']
) * 100

print(f"Aggregated to {len(aggregated_birth_data)} ZIP codes")
print(aggregated_birth_data.head(10))

Aggregated to 267 ZIP codes
   ZIP_Code_of_Residence  Total_Births_2018_2022  Total_LBW_Count_2018_2022  \
0                  90001                  3936.0                      308.0   
1                  90002                  3839.5                      324.5   
2                  90003                  5583.5                      439.5   
3                  90004                  2642.0                      211.0   
4                  90005                  1537.0                      140.0   
5                  90006                  2755.5                      211.5   
6                  90007                  1275.5                      123.5   
7                  90008                  1625.5                      168.5   
8                  90009                    11.0                        0.0   
9                  90010                   159.5                       27.5   

    LBW_Rate  
0   7.825203  
1   8.451621  
2   7.871407  
3   7.986374  
4   9.108653  
5   7.675558

In [33]:
print("Rows:", merged_df.shape[0])
print("Columns:", merged_df.shape[1])
print("\nColumn Names:\n", merged_df.columns.tolist())

Rows: 10195
Columns: 4

Column Names:
 ['Year_of_Birth', 'ZIP_Code_of_Residence', 'Birthweight_Grams', 'Total_Births']


In [34]:
# Save yearly data for reference
final.to_csv("cleaned_birth_data_summary.csv", index=False)

# Save aggregated data for modelling
aggregated_birth_data.to_csv("cleaned_birth_data_aggregated.csv", index=False)