In [None]:
'''
-- correcting class imablance in RF --class_weight tuning "balance_subsample"?
-- explore features outside of LR model
-- variable mapping
-- verify null handling for each
-- RC related child variable might be all NaN bc it is N/A to our people. 
'''

# Random Forest Model (ACS Poverty Risk) — Preprocessing + Full OCCP

This notebook mirrors some of the baseline logistic-regression preprocessing, but adapts it for a tree model:

- **No scaling**
- **No top-10 OCCP restriction** (uses all occupation codes via one-hot encoding)
- **Avoids treating valid `0` values as missing**
- Uses **imputation** to handle true missing values
- Trains a **RandomForestClassifier** with `class_weight="balanced"`


In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt


## 1) Load train/test data

In [17]:
#paths taken from baseline model
train_path = "preprocessing_data/train_data_final_feat_no_preprocessing.csv"
test_path  = "preprocessing_data/test_data_final_feat_no_preprocessing.csv"

df_train = pd.read_csv(train_path)
df_test  = pd.read_csv(test_path)

print("Train shape:", df_train.shape)
print("Test shape :", df_test.shape)
print("\nTarget distribution (train):")
print(df_train["poverty_risk_score"].value_counts(dropna=False))


Train shape: (1857626, 26)
Test shape : (378571, 103)

Target distribution (train):
poverty_risk_score
0.0    1375161
1.0     268696
2.0     109241
3.0     104528
Name: count, dtype: int64


## 2) Preprocessing for Random Forest

In [18]:
def preprocess_acs_data_rf(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess ACS-like features for Random Forest.

    Key choices:
    - Do NOT fill binary columns with 0 (0 can be meaningful).
    - Map {1,2} survey binaries -> {1,0} while leaving NaN as NaN.
    - Keep full OCCP (no top-10 grouping).
    - Keep correlated features (RF doesn't require multicollinearity pruning).
    """
    df = df.copy()

    # --- Binary recoding (leave NaN as NaN) ---
    mapping_12 = {1: 1, 2: 0}  # 1=yes, 2=no

    # insurance + disability + sex
    for col in ["PRIVCOV", "PUBCOV", "DIS", "SEX"]:
        if col in df.columns:
            df[col] = df[col].map(mapping_12)

    # HICOV: {1:0, 2:1} where 2 indicates "No insurance"
    if "HICOV" in df.columns:
        df["HICOV"] = df["HICOV"].map({1: 0, 2: 1})

    # MAR: treat only explicit "1" as married; keep NaNs
    if "MAR" in df.columns:
        df["MAR"] = np.where(df["MAR"].isna(), np.nan, (df["MAR"] == 1).astype(int))

    # --- Feature engineering / recodes ---
    if "CIT" in df.columns:
        df["CIT"] = np.where(df["CIT"].isna(), np.nan, (df["CIT"] < 5).astype(int))

    # Employment indicators (keep as binaries rather than collapsing everything)
    if "ESR" in df.columns:
        df["ESR_emp"] = np.where(df["ESR"].isna(), np.nan, df["ESR"].isin([1, 2]).astype(int))

    if "WRK" in df.columns:
        # ACS often uses 1=yes 2=no (confirm in your codebook)
        df["WRK"] = df["WRK"].map(mapping_12)

    if "WKL" in df.columns:
        # WKL is typically categorical/ordinal ("when last worked") – keep as category later
        # Don't recode here unless you have a specific mapping
        pass

    if "MIG" in df.columns:
        df["MIG_recent"] = np.where(df["MIG"].isna(), np.nan, df["MIG"].isin([2, 3]).astype(int))

    # Nativity: many ACS extracts use 1=native, 2=foreign born (confirm); map to binary foreign-born flag
    if "NATIVITY" in df.columns:
        df["ForeignBorn"] = df["NATIVITY"].map({1: 0, 2: 1})

    # Place of birth: keep original and engineer Born_in_CA
    if "POBP" in df.columns:
        df["Born_in_CA"] = np.where(df["POBP"].isna(), np.nan, (df["POBP"] == 6).astype(int))

    # Language features
    if "LANX" in df.columns:
        df["LANX"] = np.where(df["LANX"].isna(), np.nan, (df["LANX"] == 1).astype(int))

    if "ENG" in df.columns:
        # ENG is usually ordinal (ability to speak English). Keep raw; you can bucket later if you want.
        pass

    if "LANP" in df.columns:
        # LANP is language code; keep raw categorical
        pass

    # Education tiers
    if "SCHL" in df.columns:
        def recode_education(val):
            if pd.isna(val) or val <= 15: return 0
            if val <= 17: return 1
            if val <= 20: return 2
            return 3
        df["SCHL_Tier"] = df["SCHL"].apply(recode_education)

    # --- Treat as categorical for one-hot encoding (later) ---
    for cat_col in ["OCCP", "CA_Region", "RAC1P", "year", "MSP", "WKL", "ENG", "LANP", "PUMA", "POBP"]:
        if cat_col in df.columns:
            df[cat_col] = df[cat_col].astype("string")

    return df


In [19]:
print("Unique OCCP codes:", df_train["OCCP"].nunique())

print("\nTop 10 most common:")
print(df_train["OCCP"].value_counts().head(10))

print("\nHow many OCCP values appear only once?")
print((df_train["OCCP"].value_counts() == 1).sum())


Unique OCCP codes: 530

Top 10 most common:
OCCP
440.0     32169
4720.0    23449
4760.0    22976
9130.0    21427
2310.0    20675
1021.0    19929
3255.0    19870
3602.0    19829
5240.0    18304
4700.0    17148
Name: count, dtype: int64

How many OCCP values appear only once?
0


In [20]:
df_train_pp = preprocess_acs_data_rf(df_train)
df_test_pp  = preprocess_acs_data_rf(df_test)

print("Done preprocessing.")


Done preprocessing.


In [21]:
for col in ["OCCP", "RAC1P", "CA_Region", "year"]:
    print(col, "unique values:", df_train_pp[col].nunique())


OCCP unique values: 530
RAC1P unique values: 9
CA_Region unique values: 7
year unique values: 5


## 3) Choose features + one-hot encode

In [22]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Features aligned to your baseline notebook, with OCCP (full, ungrouped)
features = [
    "AGEP", "WKHP", "SEX", "DIS", "CIT", "Born_in_CA",
    "SCHL_Tier", "OCCP", "CA_Region", "RAC1P", "year"
]

missing_feats = [c for c in features if c not in df_train_pp.columns]
if missing_feats:
    print("WARNING: These features are missing from train:", missing_feats)

# split your features
num_features = ["AGEP", "WKHP", "SCHL_Tier"]
bin_features = ["SEX", "DIS", "CIT", "Born_in_CA"]   # already 0/1 but may have NaNs
cat_features = ["OCCP", "CA_Region", "RAC1P", "year"]

# Buckets ultra-rare categories into an "infrequent" bin (helps if any category is very high-cardinality)
# With your nunique counts, this won't change much, but it's safe.
MIN_FREQ = 5

preprocess = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_features),
        ("bin", SimpleImputer(strategy="most_frequent"), bin_features),
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(
                handle_unknown="ignore",
                sparse_output=True,
                min_frequency=MIN_FREQ
            ))
        ]), cat_features),
    ],
    remainder="drop"
)

# --- Build X/y ---
X_train = df_train_pp[features].copy()
X_test  = df_test_pp[features].copy()

y_train = df_train_pp["poverty_risk_score"].astype(int)
y_test  = df_test_pp["poverty_risk_score"].astype(int)

# --- IMPORTANT FIX: remove pandas pd.NA ambiguity for sklearn ---
# 1) Ensure numeric-ish columns are real numeric with np.nan (not pd.NA)
for c in num_features + bin_features:
    X_train[c] = pd.to_numeric(X_train[c], errors="coerce")
    X_test[c]  = pd.to_numeric(X_test[c], errors="coerce")

# 2) Ensure categorical columns are plain Python objects, and missing is None (not pd.NA)
for c in cat_features:
    X_train[c] = X_train[c].astype("object")
    X_test[c]  = X_test[c].astype("object")
    X_train[c] = X_train[c].where(pd.notna(X_train[c]), None)
    X_test[c]  = X_test[c].where(pd.notna(X_test[c]), None)

# --- Fit/transform train, transform test ---
X_train_sparse = preprocess.fit_transform(X_train)
X_test_sparse  = preprocess.transform(X_test)

print("X_train_sparse shape:", X_train_sparse.shape)
print("X_test_sparse shape :", X_test_sparse.shape)
print("Sparse matrix type:", type(X_train_sparse))



X_train_sparse shape: (1857626, 559)
X_test_sparse shape : (378571, 559)
Sparse matrix type: <class 'scipy.sparse._csr.csr_matrix'>


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier(
    n_estimators=250,
    class_weight="balanced_subsample",
    min_samples_leaf=10,
    min_samples_split=20,
    random_state=42,
    n_jobs=-1
)


rf.fit(X_train_sparse, y_train)
y_pred = rf.predict(X_test_sparse)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.88      0.61      0.72    285368
           1       0.22      0.33      0.26     51329
           2       0.12      0.28      0.16     20679
           3       0.15      0.36      0.21     21195

    accuracy                           0.54    378571
   macro avg       0.34      0.40      0.34    378571
weighted avg       0.71      0.54      0.60    378571



In [24]:


print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion matrix:
[[175045  52064  29163  29096]
 [ 14041  17074  11158   9056]
 [  4657   5655   5854   4513]
 [  4926   4292   4396   7581]]


## 6) Feature importances (top 25)

In [25]:
# Feature importance can be noisy with correlated / high-cardinality OHE.
# Still useful for quick sanity checks.
importances = pd.Series(rf.feature_importances_, index=df_train_pp.columns).sort_values(ascending=False)

display(importances.head(25))

# Optional quick plot
top_n = 25
top = importances.head(top_n).sort_values()
plt.figure(figsize=(8, 10))
plt.barh(top.index, top.values)
plt.title(f"Top {top_n} Feature Importances (Random Forest)")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()


ValueError: Length of values (559) does not match length of index (28)

## 7) Optional: Quick train/validation split for tuning

In [None]:
# If you want a fast local check without full CV:
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_df, y_train, test_size=0.2, random_state=42, stratify=y_train
)

X_tr_i = imputer.fit_transform(X_tr)
X_val_i = imputer.transform(X_val)

rf_quick = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf_quick.fit(X_tr_i, y_tr)
val_pred = rf_quick.predict(X_val_i)

print("=== Quick Validation Performance ===")
print(classification_report(y_val, val_pred))
