In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the train and test feature tables (saved before any preprocessing)
# and show the training columns.
df_train, df_test = map(
    pd.read_csv,
    (
        'preprocessing_data/train_data_final_feat_no_preprocessing.csv',
        'preprocessing_data/test_data_final_feat_no_preprocessing.csv',
    ),
)
print(df_train.columns)

Index(['PUMA', 'AGEP', 'CIT', 'ENG', 'LANX', 'MAR', 'MIG', 'SCHL', 'SEX',
       'WKHP', 'WKL', 'WRK', 'DIS', 'ESR', 'HICOV', 'LANP', 'MSP', 'NATIVITY',
       'OCCP', 'POBP', 'POVPIP', 'PRIVCOV', 'PUBCOV', 'RAC1P', 'year',
       'poverty_risk_score', 'CA_Region'],
      dtype='object')


In [3]:
def check_nulls(data):
    """Return the per-column null counts, keeping only columns that have at least one null."""
    missing_per_column = data.isna().sum()
    has_missing = missing_per_column.gt(0)
    return missing_per_column.loc[has_missing]

# Report only the training columns that contain missing values, with their null counts.
print("Nulls in Training Set:\n", check_nulls(df_train))


Nulls in Training Set:
 ENG     846170
WKHP    516033
WRK     168637
LANP    846170
OCCP    376902
dtype: int64


In [4]:
# Count training rows per value of the target column `poverty_risk_score`
# (the printed label refers to it as 'poverty_class').
unique_poverty_class = (
    df_train
    .loc[:, 'poverty_risk_score']
    .value_counts()
)
print(f"Unique values in 'poverty_class' column: {unique_poverty_class}")

Unique values in 'poverty_class' column: poverty_risk_score
0.0    1114746
1.0     196583
3.0      79445
2.0      78995
Name: count, dtype: int64


In [5]:

# !!! FOR THE BASELINE MODEL ONLY !!!
# The logistic-regression baseline requires some model-specific feature engineering,
# e.g. dropping features that the multicollinearity study found to be nearly identical.
# Keeping near-duplicate features adds mathematical noise to a logistic regression,
# because they carry almost the same information as each other.
# These steps will be re-implemented in the model-specific training piece.

def preprocess_acs_data_baseline(df, top_10_occp=None):
    """Apply the baseline (logistic-regression) feature engineering to one data split.

    The input frame is copied first, so the caller's frame is never modified and
    the function can safely be re-run on the same raw input.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Must contain the columns PRIVCOV, PUBCOV, HICOV, DIS, SEX, MAR,
        WKHP, CIT, ESR, LANX, MIG, POBP, RAC1P, SCHL and OCCP.
    top_10_occp : collection of OCCP codes, optional
        Occupation codes that keep their own category in ``OCCP_grouped``; every
        other non-missing code becomes ``'Other'``. Pass the codes derived from
        the training split for both train and test. When ``None``, no
        ``OCCP_grouped`` column is created.

    Returns
    -------
    pandas.DataFrame
        A new frame with the recoded columns plus ``Born_in_CA``, ``SCHL_Tier``
        and, when ``top_10_occp`` is given, ``OCCP_grouped``.
    """
    # Work on a copy: mutating the caller's frame makes notebook cells non-idempotent.
    df = df.copy()

    # Binarize simple 1/2-coded columns (1 -> 1, 2 -> 0; any other code becomes NaN).
    mapping_12 = {1: 1, 2: 0}
    for column in ('PRIVCOV', 'PUBCOV', 'DIS', 'SEX'):
        df[column] = df[column].map(mapping_12)
    # HICOV is flipped relative to the others: 2 (usually 'No insurance') -> 1.
    df['HICOV'] = df['HICOV'].map({1: 0, 2: 1})
    df['MAR'] = (df['MAR'] == 1).astype(int)

    # Missing values: missing hours worked are treated as zero hours.
    df['WKHP'] = df['WKHP'].fillna(0)

    # Feature engineering: collapse multi-level codes into 0/1 indicators.
    df['CIT'] = (df['CIT'] < 5).astype(int)
    df['ESR'] = df['ESR'].isin([1, 2]).astype(int)
    # LANX - binary: speaks another language.
    df['LANX'] = (df['LANX'] == 1.0).astype(int)
    # MIG - binary: moved in the last year.
    df['MIG'] = df['MIG'].isin([2, 3]).astype(int)
    df['Born_in_CA'] = (df['POBP'] == 6).astype(int)
    # Cast race codes to strings on the frame being processed. The previous
    # version wrote to the global `df_train` here, so the test split was skipped.
    df['RAC1P'] = df['RAC1P'].astype(str)

    # Education tiers, vectorized; conditions are checked in order, first match wins.
    schl = df['SCHL']
    tier_conditions = [
        (schl.isna() | (schl <= 15)).to_numpy(),  # 0: No HS Diploma (or missing)
        (schl <= 17).to_numpy(),                  # 1: HS Diploma / GED
        (schl <= 20).to_numpy(),                  # 2: Some College / Associate
    ]
    # Anything above 20 falls through to tier 3: Bachelor's or Higher.
    df['SCHL_Tier'] = np.select(tier_conditions, [0, 1, 2], default=3).astype('int64')

    # Occupation grouping (use the training top 10 for both splits).
    df['OCCP'] = df['OCCP'].fillna('NILF')
    if top_10_occp is not None:
        kept_codes = set(top_10_occp) | {'NILF'}
        df['OCCP_grouped'] = df['OCCP'].apply(
            lambda code: code if code in kept_codes else 'Other'
        )

    return df

In [6]:
# Rank occupation codes on the TRAIN split only, so the test split cannot leak
# into the grouping.
top_10 = df_train['OCCP'].value_counts().nlargest(10).index

# Run the same preprocessing over both splits (train first, then test),
# reusing the train-derived occupation codes.
df_train, df_test = (
    preprocess_acs_data_baseline(split, top_10_occp=top_10)
    for split in (df_train, df_test)
)

# Feature columns identified for the baseline model.
final_features = [
    'AGEP', 'WKHP', 'SEX', 'DIS', 'CIT', 'Born_in_CA',
    'SCHL_Tier', 'OCCP_grouped', 'CA_Region', 'RAC1P', 'year',
]

# Integer-typed target vectors, one per split.
y_train_full, y_test = (
    split['poverty_risk_score'].astype(int)
    for split in (df_train, df_test)
)

In [7]:
# Split the training rows by target: score 0 is the 'Stable' group,
# any positive score is 'At Risk'.
df_stable = df_train.loc[df_train['poverty_risk_score'].eq(0)]
df_at_risk = df_train.loc[df_train['poverty_risk_score'].gt(0)]

# Undersample the 'Stable' rows (without replacement, fixed seed) down to the
# combined number of 'At Risk' rows.
df_stable_downsampled = resample(
    df_stable,
    n_samples=len(df_at_risk),
    replace=False,
    random_state=42,
)

# Stack the two groups back into a single training frame.
train_balanced = pd.concat([df_stable_downsampled, df_at_risk])

print("New Class Distribution:")
print(train_balanced['poverty_risk_score'].value_counts())

New Class Distribution:
poverty_risk_score
0.0    355023
1.0    196583
3.0     79445
2.0     78995
Name: count, dtype: int64


In [8]:
# Save the preprocessed data for the model notebook: the balanced training frame
# and the preprocessed test frame are written as CSV without the index column.
train_balanced.to_csv('preprocessing_data/train_balanced_baseline.csv', index=False)
df_test.to_csv('preprocessing_data/test_preprocessed_baseline.csv', index=False)
# Print each saved frame's (rows, columns) shape as a quick sanity check.
print(f'Saved train_balanced: {train_balanced.shape}')
print(f'Saved df_test: {df_test.shape}')

Saved train_balanced: (710046, 30)
Saved df_test: (304368, 30)
