<h2>Generate Natality 7yr Test Data</h2>

This notebook generates the first dataset we used for the initial EDA, the DAG-generation notebooks, and the Variable Selection pipeline. The final dataset combines the most recent 7 years of natality data and does so by finding the set of columns that are common to all of them (intersection).

In [None]:
from pathlib import Path
import os
import re
import pandas as pd

# Absolute path of the parent of the current working directory; the cells
# below build every input and output path relative to it.
BASE_DIR = Path().resolve().parent

In [None]:
# Regular expressions selecting the natality columns kept in the output dataset.
# They are OR-ed together into `combined_regex` and applied with search
# semantics, so any pattern without a `^`/`$` anchor matches anywhere in a
# column name.
# NOTE(review): r'ab_', r'ca_' and r'me_' are unanchored while their sibling
# prefixes use `^` — confirm that matching mid-name is intended.
regex_patterns = [
    # maternal morbidity factors
    r'^mm_', r'no_mmorb',
    # date of birth, type of facility of birth, attendant at birth
    r'^dob_', r'bfacil$', r'attend$',
    # risk factors, infections present, labor and delivery, abnormal
    # conditions, congenital anomalies, method of delivery
    r'^rf_', r'^ip_', r'^ld_', r'ab_', r'ca_', r'me_',
    # mother's demographics
    r'mager$', r'mracehisp', r'^mar_p', r'dmar', r'meduc',
    # mother's health factors
    r'^cig_', r'wtgain$', r'bmi$', r'pwgt_r', r'dwgt_r',
    # father's demographics
    r'fagecomb', r'fracehisp', r'feduc',
    # baby health factors
    r'dplural', r'sex$', r'combgest', r'dbwt',
    # prior births living, dead, and terminated timeline
    r'^prior', r'illb_r$', r'ilop_r$', r'ilp_r',
    # pregnancy care
    r'previs$', r'precare$',
    # apgar scores, can be either 5 or 10 mins
    # (the trailing comma matters: without it Python concatenates this literal
    # with the next one into the single pattern 'apgarwic')
    r'apgar',
    # funding
    r'wic', r'pay$',
]

combined_regex = '|'.join(regex_patterns)

In [None]:
# Compare the headers of the natality CSVs whose file name carries a year in
# 2001-2017. After each file, print the columns shared by every file seen so
# far and the columns of the current file that are not in that shared set.
# Columns whose name starts with `f_` are ignored.
common_columns = []
unique_colums = []
seen_first_file = False  # tracks the first file that passes the year filter

# Sorted so the printed progression is reproducible; glob order otherwise
# depends on the filesystem.
for file in sorted(BASE_DIR.glob("natality_data/*.csv")):
  year_match = re.search(r"(\d{4})", file.name)
  if year_match is None:
    # No four-digit year in the file name: nothing to filter on, skip it.
    continue
  year = int(year_match.group(1))
  if not (2000 < year < 2018):
    continue

  print(f'Loading data from {file}')
  # A single data row is enough: only the header is needed.
  df = pd.read_csv(file, nrows=1)
  col_drop = df.filter(regex='^f_').columns
  df = df.drop(columns=col_drop)
  df_cols = list(df.columns)

  if not seen_first_file:
    # Seed from the first *matching* file. Keying this on the glob index
    # would leave the running intersection empty whenever the first globbed
    # file falls outside the year range.
    common_columns = df_cols
    unique_colums = df_cols
    seen_first_file = True
  else:
    current_cols = set(df_cols)
    # List comprehensions keep the original column order (set operations
    # would scramble it between runs).
    common_columns = [c for c in common_columns if c in current_cols]
    shared_cols = set(common_columns)
    unique_colums = [c for c in df_cols if c not in shared_cols]

  print(f"Common columns {common_columns}")
  print(f"Unshared columns {unique_colums}")

In [None]:
# Compare the headers of the natality CSVs whose file name carries a year
# after 2016. After each file, print the columns shared by every file seen so
# far and the columns of the current file that are not in that shared set.
# Columns whose name starts with `f_` are ignored.
common_columns = []
unique_colums = []
seen_first_file = False  # tracks the first file that passes the year filter

# Sorted so the printed progression is reproducible; glob order otherwise
# depends on the filesystem.
for file in sorted(BASE_DIR.glob("natality_data/*.csv")):
  year_match = re.search(r"(\d{4})", file.name)
  if year_match is None:
    # No four-digit year in the file name: nothing to filter on, skip it.
    continue
  year = int(year_match.group(1))
  if year <= 2016:
    continue

  print(f'Loading data from {file}')
  # A single data row is enough: only the header is needed.
  df = pd.read_csv(file, nrows=1)
  col_drop = df.filter(regex='^f_').columns
  df = df.drop(columns=col_drop)
  df_cols = list(df.columns)

  if not seen_first_file:
    # Seed from the first *matching* file. Keying this on the glob index
    # would leave the running intersection empty whenever the first globbed
    # file falls outside the year range.
    common_columns = df_cols
    unique_colums = df_cols
    seen_first_file = True
  else:
    current_cols = set(df_cols)
    # List comprehensions keep the original column order (set operations
    # would scramble it between runs).
    common_columns = [c for c in common_columns if c in current_cols]
    shared_cols = set(common_columns)
    unique_colums = [c for c in df_cols if c not in shared_cols]

  print(f"Common columns {common_columns}")
  print(f"Unshared columns {unique_colums}")


In [None]:
# Read only the first data row of the 2024 file; its header provides the
# reference column list used when the yearly files are processed.
natality_2024_path = BASE_DIR / "data_main" / "raw_natality_data" / "natality2024us.csv"
df_2024 = pd.read_csv(natality_2024_path, nrows=1)
df_2024_cols = list(df_2024.columns)

In [None]:
# Build data_main/natality_7yr_test_data.csv from the natality CSVs whose file
# name carries a year after 2017. Each file is streamed in chunks, aligned to
# the 2024 column layout, reduced to the columns selected by `combined_regex`,
# and has its single-letter flag codes replaced by integers.
CHUNK_SIZE = 10000
# Only the first few chunks of each file are written (this is a test-sized
# sample, not the full data). Raise this value to export more rows per file.
MAX_CHUNKS_PER_FILE = 3

# Flag codes -> integers.
# NOTE(review): 'U' and 'P' both map to 2 — confirm this is intentional.
mapping = {'Y':1, 'N':0,'U':2,'X':3, "P":2}

out_file_path = BASE_DIR / "data_main" / "natality_7yr_test_data.csv"

# Output columns: the 2024 columns, minus the `f_` columns, that match one of
# the selection patterns. Computed once because it is the same for every
# chunk; reindexing each chunk to this list gives every year the same columns
# in the same order, so appended rows line up with the single header.
keep_cols = [
    col for col in df_2024_cols
    if not re.search(r'^f_', col) and re.search(combined_regex, col)
]

# The first chunk written in this run overwrites any previous output and emits
# the header; later chunks append. Re-running the cell therefore regenerates
# the file instead of stacking duplicate rows onto it.
header_written = False

# Sorted so the row order of the output is reproducible across machines.
for file in sorted(BASE_DIR.glob("natality_data/*.csv")):
  year_match = re.search(r"(\d{4})", file.name)
  if year_match is None:
    # No four-digit year in the file name: nothing to filter on, skip it.
    continue
  year = int(year_match.group(1))
  if year <= 2017:
    continue

  print(f'Loading data from {file}')
  for i, chunk in enumerate(pd.read_csv(file, chunksize=CHUNK_SIZE, low_memory=False)):
    # Columns absent from a given year are created and filled with NaN.
    processed_chunk = chunk.reindex(columns=keep_cols).replace(mapping)

    processed_chunk.to_csv(
        out_file_path,
        mode='a' if header_written else 'w',
        header=not header_written,
        index=False,
        encoding='utf-8',
    )
    header_written = True

    if i + 1 >= MAX_CHUNKS_PER_FILE:
      break
    if i % 100 == 0:
      print(f'Processed {i * CHUNK_SIZE} rows')

In [None]:
# Load the generated dataset back in 1000-row pieces and stitch them into a
# single frame for inspection.
test_data_path = BASE_DIR / "data_main" / "natality_7yr_test_data.csv"
chunks = pd.read_csv(test_data_path, chunksize=1000)
df = pd.concat(chunks)

df

In [None]:
# For every object-dtype column, print the entries that are Python strings.
# Works on a copy so `df` itself is never touched.
test_df = df.copy()

for col in test_df.columns:
    if test_df[col].dtype != 'object':
      continue

    column_values = test_df[col]
    string_mask = column_values.apply(lambda value: isinstance(value, str))
    string_values_in_mixed_col = column_values[string_mask]

    print(f"\nString values in '{col}':")
    print(string_values_in_mixed_col)

In [None]:
# Show the columns that have more than 2000 missing values.
null_counts = df.isna().sum()
exceeds_threshold = null_counts > 2000
columns_with_many_nulls = null_counts[exceeds_threshold]
column_names = list(columns_with_many_nulls.index)
df[column_names]