In [33]:
import pandas as pd

In [34]:
import glob

# Load every yearly birth CSV from the data folder and stack them into one frame.
# NOTE(review): this is an absolute, machine-specific path; consider moving it to
# a configurable data directory so the notebook runs on other machines.
path = r"C:\Users\Elias\OneDrive\Desktop\Birth data\*.csv"

# glob returns matches in arbitrary order; sorting keeps the row order of the
# concatenated frame (and therefore its regenerated index) reproducible.
csv_files = sorted(glob.glob(path))

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not csv_files:
    raise FileNotFoundError(f"No CSV files matched pattern: {path}")

dfs = [pd.read_csv(f) for f in csv_files]

# ignore_index=True discards each file's own index and renumbers rows 0..n-1.
merged_df = pd.concat(dfs, ignore_index=True)

print(merged_df)


      Type_of_Event  Year_of_Birth Residence_or_Place_of_Birth  \
0             Birth           2022          Place of Residence   
1             Birth           2022          Place of Residence   
2             Birth           2022          Place of Residence   
3             Birth           2022          Place of Residence   
4             Birth           2022          Place of Residence   
...             ...            ...                         ...   
11455         Birth           2018          Place of Residence   
11456         Birth           2018          Place of Residence   
11457         Birth           2018          Place of Residence   
11458         Birth           2018          Place of Residence   
11459         Birth           2018          Place of Residence   

       ZIP_Code_of_Residence   Birthweight_Grams Total_Births  \
0                      91101     500 - 999 grams            0   
1                      91101   1000 - 1499 grams          <11   
2           

In [35]:
# Preview the rows that need cleaning: an unreliable/unknown birthweight
# category, or a Total_Births value of "<11" or "0" (both still strings here).
has_unreliable_weight = merged_df["Birthweight_Grams"].str.contains(
    "Unreliable|Unknown", case=False, na=False
)
has_flagged_count = merged_df["Total_Births"].isin(["<11", "0"])

filtered = merged_df[has_unreliable_weight | has_flagged_count]

print(filtered)


      Type_of_Event  Year_of_Birth Residence_or_Place_of_Birth  \
0             Birth           2022          Place of Residence   
1             Birth           2022          Place of Residence   
2             Birth           2022          Place of Residence   
3             Birth           2022          Place of Residence   
8             Birth           2022          Place of Residence   
...             ...            ...                         ...   
11449         Birth           2018          Place of Residence   
11450         Birth           2018          Place of Residence   
11451         Birth           2018          Place of Residence   
11452         Birth           2018          Place of Residence   
11459         Birth           2018          Place of Residence   

       ZIP_Code_of_Residence   Birthweight_Grams Total_Births  \
0                      91101     500 - 999 grams            0   
1                      91101   1000 - 1499 grams          <11   
2           

In [36]:
# Remove rows whose birthweight category is unreliable/unknown.
# .copy() makes the filtered result an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
merged_df = merged_df[
    ~merged_df["Birthweight_Grams"].str.contains("Unreliable|Unknown", case=False, na=False)
].copy()

# Last_Data_Refresh is not used by the analysis. errors="ignore" keeps this
# cell re-runnable after the column has already been dropped.
merged_df = merged_df.drop(columns=["Last_Data_Refresh"], errors="ignore")

# Replace the "<11" marker with 5.5, then convert the column from string to
# numeric. NOTE(review): 5.5 is presumably the midpoint of a suppressed 1-10
# count -- confirm this imputation is appropriate for the analysis.
# "0" needs no special handling: to_numeric converts it directly.
# errors="coerce" turns any other non-numeric value into NaN.
merged_df["Total_Births"] = pd.to_numeric(
    merged_df["Total_Births"].replace("<11", "5.5"),
    errors="coerce",
)




In [37]:
# Sanity checks on the cleaned frame.
# Case-insensitive match, consistent with the filter used during cleaning.
print("Remaining Unreliable:",
      merged_df["Birthweight_Grams"].str.contains("Unreliable|Unknown", case=False, na=False).sum())

# Once Total_Births is numeric, comparing it against the strings "<11"/"0" can
# never match, so that check always reports 0. Count instead the values that
# are not valid numbers; this works whether the column is still text (a
# leftover "<11" is counted) or already numeric (NaN from coercion is counted).
print("Non-numeric Total_Births values:",
      pd.to_numeric(merged_df["Total_Births"], errors="coerce").isna().sum())



Remaining Unreliable: 0
Remaining '<11' or '0': 0


In [38]:
# Sum Total_Births over all birthweight categories for each
# (ZIP code, birth year) pair; the summed column is named Total_Births_All.
births_by_zip_year = merged_df.groupby(
    ["ZIP_Code_of_Residence", "Year_of_Birth"]
)["Total_Births"]

total_births = births_by_zip_year.sum().reset_index(name="Total_Births_All")


In [39]:
# Birthweight_Grams labels counted as low birth weight (<2500 g).
# Each label covers a 500 g band, so it is generated from the band's lower
# bound; the resulting list is:
#   "500 - 999 grams", "1000 - 1499 grams",
#   "1500 - 1999 grams", "2000 - 2499 grams"
# NOTE(review): no band below 500 g is listed -- confirm whether the data
# contains one that should also count as low birth weight.
lbw_categories = [
    f"{lower} - {lower + 499} grams"
    for lower in (500, 1000, 1500, 2000)
]

# Keep only the rows whose birthweight label is one of the low-birth-weight
# categories, then sum their births per (ZIP code, birth year).
is_low_birth_weight = merged_df["Birthweight_Grams"].isin(lbw_categories)
lbw_df = merged_df[is_low_birth_weight]

lbw_count = (
    lbw_df.groupby(["ZIP_Code_of_Residence", "Year_of_Birth"])["Total_Births"]
    .sum()
    .rename("LBW_Count")
    .reset_index()
)


In [40]:
# Calculating %LBW & High-risk classification

# A (ZIP code, year) pair is flagged high risk when its low-birth-weight share
# exceeds this percentage.
LBW_HIGH_RISK_THRESHOLD_PCT = 8

# Both inputs hold one row per (ZIP code, year); validate= makes the merge fail
# loudly if that ever stops being true instead of silently duplicating rows.
final = total_births.merge(
    lbw_count,
    on=["ZIP_Code_of_Residence", "Year_of_Birth"],
    how="left",
    validate="one_to_one",
)
# Pairs with no low-birth-weight rows are absent from lbw_count, so the left
# merge leaves NaN there; treat that as a count of zero.
final["LBW_Count"] = final["LBW_Count"].fillna(0)

# Mask non-positive totals so the percentage is explicitly undefined (NaN)
# rather than the by-product of a 0/0 division.
births_denominator = final["Total_Births_All"].where(final["Total_Births_All"] > 0)
final["Pct_LBW"] = (final["LBW_Count"] / births_denominator) * 100

# NaN compares as False, so pairs with an undefined percentage get High_Risk = 0.
final["High_Risk"] = (final["Pct_LBW"] > LBW_HIGH_RISK_THRESHOLD_PCT).astype(int)


In [41]:
# Report the dimensions and column names of the cleaned frame.
row_count, column_count = merged_df.shape
print("Rows:", row_count)
print("Columns:", column_count)
print("\nColumn Names:\n", list(merged_df.columns))

Rows: 10195
Columns: 6

Column Names:
 ['Type_of_Event', 'Year_of_Birth', 'Residence_or_Place_of_Birth', 'ZIP_Code_of_Residence', 'Birthweight_Grams', 'Total_Births']


In [42]:
# Write both results to the working directory without the row index:
# the detailed cleaned rows first, then the per-ZIP/year summary with the
# LBW calculations.
for csv_name, frame in (
    ("cleaned_birth_data_detailed.csv", merged_df),
    ("cleaned_birth_data_summary.csv", final),
):
    frame.to_csv(csv_name, index=False)