# Random Forest (Expanded Features) — Preprocessing

In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


## 1) Load data

In [2]:
# Locations of the train/test feature extracts (not yet preprocessed)
train_path = "preprocessing_data/train_data_final_feat_no_preprocessing.csv"
test_path = "preprocessing_data/test_data_final_feat_no_preprocessing.csv"

df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report the size of each split
for label, frame in (("Train shape:", df_train), ("Test shape :", df_test)):
    print(label, frame.shape)

# Class balance of the target in the training split (NaN counted as its own level)
target_counts = df_train["poverty_risk_score"].value_counts(dropna=False)
print("\nTrain target distribution:")
print(target_counts)


Train shape: (1469769, 27)
Test shape : (304368, 27)

Train target distribution:
poverty_risk_score
0.0    1114746
1.0     196583
3.0      79445
2.0      78995
Name: count, dtype: int64


In [3]:
# Percentage of missing values in each listed column, largest share first
cols = [
    "WKHP", "WKL", "WRK", "ENG", "LANX", "MIG", "PUMA", "POBP", "OCCP",
    "SCHL", "ESR", "CIT", "MAR", "MSP", "NATIVITY", "RAC1P", "year",
]
df_train[cols].isna().mean().sort_values(ascending=False).mul(100).round(2)

ENG         57.57
WKHP        35.11
OCCP        25.64
WRK         11.47
LANX         0.00
MIG          0.00
PUMA         0.00
POBP         0.00
WKL          0.00
SCHL         0.00
ESR          0.00
CIT          0.00
MAR          0.00
MSP          0.00
NATIVITY     0.00
RAC1P        0.00
year         0.00
dtype: float64

In [4]:
# Distribution of the English-speaking variable, with NaN counted as its own level
df_train.loc[:, "ENG"].value_counts(dropna=False)

ENG
NaN    846170
1.0    347613
2.0    132583
3.0     99738
4.0     43665
Name: count, dtype: int64

In [5]:
def fix_acs_nas(df):
    """Recode "not applicable" codes as NaN and add missingness indicators.

    Works on a copy; the input frame is never modified.

    Recoding performed:
      * WKHP, WKL, MIG, ESR: 0 -> NaN.
      * WRK, LANX: 1 -> 1, 2 -> 0, anything else (including 0) -> NaN.
      * ENG: rows where ENG is missing and the recoded LANX is 0 get ENG = 0
        (per the notebook's note, "speaks only English").
      * MAR, MSP: set to NaN where AGEP < 15 (not applicable under 15).

    For each of WKHP, WKL, ENG, LANX, MIG, WRK that is present, a
    ``<col>_missing`` 0/1 indicator column is added *after* the recoding.

    The function is idempotent: a frame that already carries the
    ``WRK_missing`` and ``LANX_missing`` indicators is returned as an
    unchanged copy. Without this guard a second pass (e.g. re-running a cell
    that rebinds ``df = fix_acs_nas(df)``) would push the already-recoded
    WRK/LANX values through the 1/2/0 mapping again and turn every 0 ("no")
    into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain WKHP, WKL, WRK, LANX, ENG, MIG, ESR, AGEP, MAR and MSP.

    Returns
    -------
    pandas.DataFrame
        Recoded copy of ``df`` with the extra ``*_missing`` columns.
    """
    df = df.copy()

    # Already processed by a previous call: recoding again would corrupt WRK/LANX.
    if "WRK_missing" in df.columns and "LANX_missing" in df.columns:
        return df

    df["WKHP"] = df["WKHP"].replace({0: np.nan})
    df["WKL"] = df["WKL"].replace({0: np.nan})
    df["WRK"] = df["WRK"].map({1: 1, 2: 0, 0: np.nan})
    df["LANX"] = df["LANX"].map({1: 1, 2: 0, 0: np.nan})

    # ENG is intentionally NOT blanket-recoded 0 -> NaN: a missing ENG combined
    # with LANX == 0 is given the explicit level 0 instead.
    df.loc[(df["ENG"].isna()) & (df["LANX"] == 0), "ENG"] = 0

    df["MIG"] = df["MIG"].replace({0: np.nan})
    df["ESR"] = df["ESR"].replace({0: np.nan})

    # MAR / MSP: not applicable under 15. `mask` upcasts integer columns to
    # float as needed instead of assigning NaN into an integer column in place.
    under_15 = df["AGEP"] < 15
    df["MAR"] = df["MAR"].mask(under_15)
    df["MSP"] = df["MSP"].mask(under_15)

    # Missingness indicators, computed after the recoding above
    for c in ["WKHP", "WKL", "ENG", "LANX", "MIG", "WRK"]:
        if c in df.columns:
            df[f"{c}_missing"] = df[c].isna().astype(int)

    return df


In [6]:

# Sanity-check the ENG recode on a separate copy; fix_acs_nas works on a copy,
# so df_train is not modified by this cell.
df_fixed = fix_acs_nas(df_train)

# A notebook cell only echoes its final expression, so each diagnostic is
# printed explicitly; otherwise these values are computed and thrown away.
print(df_fixed["ENG_missing"].value_counts(dropna=False))
print("Share of ENG still NaN:", df_fixed["ENG"].isna().mean())
print(df_fixed["ENG"].value_counts(dropna=False).head(10))

'''ENG = 0 → speaks only English
ENG = 1–4 → proficiency
ENG_missing now represents true unexpected missingness'''


'ENG = 0 → speaks only English\nENG = 1–4 → proficiency\nENG_missing now represents true unexpected missingness'

In [7]:
# Apply the ACS not-applicable recoding to both splits, rebinding the same names
df_train, df_test = (fix_acs_nas(split) for split in (df_train, df_test))

for label, split in (("Train shape:", df_train), ("Test shape :", df_test)):
    print(label, split.shape)

# The target distribution should be unchanged by the recoding
train_target_counts = df_train["poverty_risk_score"].value_counts(dropna=False)
print("\nTrain target distribution:")
print(train_target_counts)


Train shape: (1469769, 33)
Test shape : (304368, 33)

Train target distribution:
poverty_risk_score
0.0    1114746
1.0     196583
3.0      79445
2.0      78995
Name: count, dtype: int64


## 2) Preprocessing function (expanded features)

In [8]:
def preprocess_acs_data_rf(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess ACS-like features for Random Forest.

    Design choices:
    - Do NOT fill binary columns with 0 (0 can be meaningful).
    - Map {1,2} survey binaries -> {1,0} while leaving NaN as NaN.
    - Keep full OCCP (no top-10 grouping).
    - Keep correlated features (RF doesn't require multicollinearity pruning).

    Every step is skipped when its source column is absent. The input frame
    is not modified; a recoded copy is returned.

    Columns recoded in place: PRIVCOV, PUBCOV, DIS, SEX, HICOV, MAR, CIT,
    WRK, LANX, plus the categoricals cast to pandas "string" dtype.
    Columns added: ESR_emp, MIG_recent, ForeignBorn, Born_in_CA, SCHL_Tier.

    Notes
    -----
    * WRK may arrive either in raw survey coding (1/2) or already recoded to
      1/0. Raw coding is recognised by the presence of the value 2; an
      already-binary column is passed through, so that its 0 ("did not work")
      values are not wiped to NaN by a second 1/2 mapping.
    * PUMA / OCCP / POBP codes that were parsed as floats (e.g. "10.0"
      because the column contains NaN) have the ".0" suffix removed before
      zero-padding, so leading zeros are actually restored ("0010").
      Other categoricals keep pandas' default string rendering.
    """
    df = df.copy()

    mapping_12 = {1: 1, 2: 0}  # 1=yes, 2=no

    # Insurance, disability, sex
    for col in ["PRIVCOV", "PUBCOV", "DIS", "SEX"]:
        if col in df.columns:
            df[col] = df[col].map(mapping_12)

    # HICOV: {1: 0, 2: 1} (2 indicates "no insurance")
    if "HICOV" in df.columns:
        df["HICOV"] = df["HICOV"].map({1: 0, 2: 1})

    # MAR: only explicit 1 is "married"; preserve NaN
    if "MAR" in df.columns:
        df["MAR"] = np.where(df["MAR"].isna(), np.nan, (df["MAR"] == 1).astype(int))

    # Citizenship binary flag; preserve NaN
    if "CIT" in df.columns:
        df["CIT"] = np.where(df["CIT"].isna(), np.nan, (df["CIT"] < 5).astype(int))

    # Employment status flag from ESR
    if "ESR" in df.columns:
        df["ESR_emp"] = np.where(df["ESR"].isna(), np.nan, df["ESR"].isin([1, 2]).astype(int))

    # Worked last week. Raw coding is 1=yes, 2=no; a column without any 2 is
    # treated as already recoded to 1/0 and only sanitised, because mapping it
    # again would send every 0 to NaN.
    if "WRK" in df.columns:
        if df["WRK"].eq(2).any():
            df["WRK"] = df["WRK"].map(mapping_12)
        else:
            df["WRK"] = df["WRK"].where(df["WRK"].isin([0, 1]))

    # MIG recent move flag
    if "MIG" in df.columns:
        df["MIG_recent"] = np.where(df["MIG"].isna(), np.nan, df["MIG"].isin([2, 3]).astype(int))

    # NATIVITY: 1=native, 2=foreign born
    if "NATIVITY" in df.columns:
        df["ForeignBorn"] = df["NATIVITY"].map({1: 0, 2: 1})

    # Place of birth: keep POBP and engineer Born_in_CA.
    # Must run before POBP is cast to string below (numeric comparison with 6).
    if "POBP" in df.columns:
        df["Born_in_CA"] = np.where(df["POBP"].isna(), np.nan, (df["POBP"] == 6).astype(int))

    # Language: 1 -> 1, any other non-missing value -> 0; preserve NaN
    if "LANX" in df.columns:
        df["LANX"] = np.where(df["LANX"].isna(), np.nan, (df["LANX"] == 1).astype(int))

    # Education tiers (vectorised): NaN/0 -> NaN (0 = N/A, <3 years old),
    # <=15 -> 0, 16-17 -> 1, 18-20 -> 2, above 20 -> 3.
    # np.select takes the first matching condition, mirroring an if-chain.
    if "SCHL" in df.columns:
        schl = df["SCHL"].to_numpy(dtype="float64", na_value=np.nan)
        df["SCHL_Tier"] = np.select(
            [np.isnan(schl) | (schl == 0), schl <= 15, schl <= 17, schl <= 20],
            [np.nan, 0.0, 1.0, 2.0],
            default=3.0,
        )

    # Cast selected categoricals to string (later coerced to object+None for sklearn)
    for cat_col in ["OCCP", "CA_Region", "RAC1P", "year", "MSP", "WKL", "ENG", "LANP", "PUMA", "POBP"]:
        if cat_col in df.columns:
            df[cat_col] = df[cat_col].astype("string")

    # Codes read as floats render as "10.0"; strip the suffix so the
    # zero-padding below sees plain digit strings.
    for code_col in ("PUMA", "OCCP", "POBP"):
        if code_col in df.columns:
            df[code_col] = df[code_col].str.replace(r"^(\d+)\.0$", r"\1", regex=True)

    # Preserve leading zeros for categoricals
    if "PUMA" in df.columns:
        df["PUMA"] = df["PUMA"].str.zfill(5)

    if "OCCP" in df.columns:
        # na=False keeps the mask strictly boolean; missing codes stay missing
        mask = df["OCCP"].str.fullmatch(r"\d+", na=False)
        df.loc[mask, "OCCP"] = df.loc[mask, "OCCP"].str.zfill(4)

    if "POBP" in df.columns:
        df["POBP"] = df["POBP"].str.zfill(3)

    return df


In [9]:
# Run the Random Forest feature recoding on each split
df_train_pp, df_test_pp = (
    preprocess_acs_data_rf(split) for split in (df_train, df_test)
)

print("Done preprocessing.")


Done preprocessing.


In [10]:
# CHECK #1: rows with LANX == 0 are expected to carry ENG == "0.0".
# ENG is a string column at this point, so the comparison is against text.
lanx_is_zero = df_train_pp["LANX"] == 0
sub = df_train_pp.loc[lanx_is_zero, "ENG"].astype("string")

share_zero = (sub == "0.0").mean()
top_values = sub.value_counts(dropna=False).head(10)

print("ENG dtype:", sub.dtype)
print("Share ENG == '0.0' among LANX == 0:", share_zero)
print("\nValue counts:")
print(top_values)


ENG dtype: string
Share ENG == '0.0' among LANX == 0: 1.0

Value counts:
ENG
0.0    846170
Name: count, dtype: Int64


## 3) Feature set (expanded)

In [11]:
# Column groups, split the way the ColumnTransformer consumes them
num_features = ["AGEP", "WKHP", "SCHL_Tier"]  # numeric-ish
bin_features = [
    "SEX", "DIS", "CIT", "MAR", "WRK",
    "Born_in_CA", "LANX", "ForeignBorn", "ESR_emp", "MIG_recent",
]
cat_features = [
    "MSP", "WKL", "ENG", "LANP",
    "OCCP", "RAC1P", "CA_Region",
    "PUMA", "POBP",
    "year",
]

# Full feature list: numeric-ish, then binaries, then categoricals
features = [*num_features, *bin_features, *cat_features]

target_col = "poverty_risk_score"

# Warn (without stopping) about any expected column absent from the training frame
required = [*features, target_col]
missing = [name for name in required if name not in df_train_pp.columns]
if missing:
    print("WARNING missing columns:", missing)

# Feature matrices are copies so later dtype fixes do not touch the *_pp frames
X_train, X_test = (split[features].copy() for split in (df_train_pp, df_test_pp))
y_train, y_test = (split[target_col].astype(int) for split in (df_train_pp, df_test_pp))

print("X_train raw:", X_train.shape, "X_test raw:", X_test.shape)


X_train raw: (1469769, 23) X_test raw: (304368, 23)


## 4) Fix pandas nullable missing values + build sparse preprocessor

In [12]:
# Normalise dtypes on both feature matrices:
#   - numeric and binary columns become real numerics (unparseable values -> NaN)
#   - categorical columns become plain object columns with None for missing
for frame in (X_train, X_test):
    for c in num_features + bin_features:
        frame[c] = pd.to_numeric(frame[c], errors="coerce")
    for c in cat_features:
        as_object = frame[c].astype("object")
        frame[c] = as_object.where(pd.notna(as_object), None)

# Bucket rare categories into an 'infrequent' bin to keep feature space manageable.
MIN_FREQ = 25  # can alter this value for runtime: 5/25/50

# Categorical branch: impute with the most frequent level, then one-hot encode
# into a sparse matrix; levels unseen at fit time are ignored at transform time.
cat_encoder = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=True,
    min_frequency=MIN_FREQ,
)
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", cat_encoder),
])

# Numeric columns get median imputation, binary columns most-frequent imputation;
# any column not listed in a branch is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_features),
        ("bin", SimpleImputer(strategy="most_frequent"), bin_features),
        ("cat", cat_pipeline, cat_features),
    ],
    remainder="drop",
)

# Fit on train only, then apply the fitted transformer to test
X_train_sparse = preprocess.fit_transform(X_train)
X_test_sparse = preprocess.transform(X_test)

for label, matrix in (("X_train_sparse:", X_train_sparse), ("X_test_sparse :", X_test_sparse)):
    print(label, matrix.shape, type(matrix))


X_train_sparse: (1469769, 1266) <class 'scipy.sparse._csr.csr_matrix'>
X_test_sparse : (304368, 1266) <class 'scipy.sparse._csr.csr_matrix'>


## 5) Save sparse matrices, labels, and preprocessor

In [14]:
# Persist everything the modelling notebook needs: the encoded feature
# matrices, the integer labels, and the fitted ColumnTransformer.
# (joblib, scipy.sparse and Path are imported in the notebook's import cell.)

# Ensure directory exists; parents=True also creates missing parent
# directories if the path is ever changed to a nested one.
output_dir = Path("preprocessing_data")
output_dir.mkdir(parents=True, exist_ok=True)

# Save sparse matrices
sparse.save_npz(output_dir / "X_train_sparse_rf.npz", X_train_sparse)
sparse.save_npz(output_dir / "X_test_sparse_rf.npz", X_test_sparse)

# Save labels (single column, no index)
y_train.to_csv(output_dir / "y_train_rf.csv", index=False)
y_test.to_csv(output_dir / "y_test_rf.csv", index=False)

# Save fitted preprocessor
joblib.dump(preprocess, output_dir / "rf_preprocessor.joblib")

print("Preprocessing complete. Files saved to preprocessing_data/")

Preprocessing complete. Files saved to preprocessing_data/
