In [None]:
!pip install folktables
!pip install numpy pandas scikit-learn

import numpy as np
import pandas as pd
from folktables import ACSDataSource, BasicProblem, adult_filter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix



In [None]:
# Shared configuration for every experiment in this notebook.

# Reproducibility / evaluation split
RANDOM_STATE = 42  # default seed used by split_and_standardize
TEST_SIZE = 0.2    # default held-out fraction (20% of the rows)

# Arguments forwarded to ACSDataSource in load_data()
SURVEY_YEAR = "2018"
HORIZON = "1-Year"

In [None]:
def load_data(states=("IL", "IA", "NY")):
    """Fetch ACS person-survey records for the requested states.

    The data source is configured from the module-level ``SURVEY_YEAR`` and
    ``HORIZON`` constants, and ``download=True`` is passed to ``get_data``.

    Args:
        states: Iterable of state abbreviations, or a single abbreviation
            string. Defaults to Illinois, Iowa and New York, which is the
            set this notebook originally hard-coded.

    Returns:
        Whatever ``ACSDataSource.get_data`` returns for those states.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(states, str):
        states = [states]

    data_source = ACSDataSource(survey_year=SURVEY_YEAR, horizon=HORIZON, survey='person')
    return data_source.get_data(states=list(states), download=True)


def define_problem(threshold):
    """Build a folktables ``BasicProblem`` for binary income classification.

    Args:
        threshold: Income cutoff. A row's label is True when its ``PINCP``
            value is strictly greater than this number.

    Returns:
        A ``BasicProblem`` using ten ACS person-record columns as features,
        ``PINCP`` as the target, ``RAC1P`` as the group column,
        ``adult_filter`` as the preprocessing step, and a postprocessing
        step that replaces NaN feature values with -1.
    """
    return BasicProblem(
        features=['AGEP', 'COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RELP', 'WKHP', 'SEX', 'RAC1P'],
        target='PINCP',
        target_transform=lambda x: x > threshold,  # Convert income to binary classification
        group='RAC1P',
        preprocess=adult_filter,
        # `nan` must be passed by keyword: the second positional parameter of
        # np.nan_to_num is `copy`, so `np.nan_to_num(x, -1)` would fill NaN
        # with 0.0 rather than the intended -1 sentinel.
        postprocess=lambda x: np.nan_to_num(x, nan=-1)
    )


def preprocess_data(problem, raw_data):
    """Convert raw survey data into a feature array and a label array.

    Args:
        problem: Object exposing ``df_to_numpy(raw_data)`` that returns a
            3-tuple (features, labels, group).
        raw_data: Input forwarded unchanged to ``problem.df_to_numpy``.

    Returns:
        ``(features, labels)``; the third element (group) is discarded.
    """
    features, labels, _group = problem.df_to_numpy(raw_data)
    return features, labels


def split_and_standardize(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE):
    """Split into train/test partitions and standardize the features.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where both feature matrices
        have been transformed by a ``StandardScaler`` fitted on the training
        partition only.
    """
    partitions = train_test_split(X, y, test_size=test_size, random_state=random_state)
    train_features, test_features, y_train, y_test = partitions

    # Scaling statistics come from the training rows alone, so nothing about
    # the held-out rows influences the transform.
    scaler = StandardScaler().fit(train_features)
    X_train = scaler.transform(train_features)
    X_test = scaler.transform(test_features)

    return X_train, X_test, y_train, y_test

In [None]:
def train_logistic_regression(X_train, y_train):
    """Fit a ``LogisticRegression`` built with no arguments and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return LogisticRegression().fit(X_train, y_train)


def evaluate_model(model, X_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True binary labels (booleans or 0/1) for ``X_test``.

    Returns:
        dict with keys ``"accuracy"``, ``"true_positive_rate"`` and
        ``"false_positive_rate"``. A rate whose denominator is zero is
        reported as 0.0.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # split in which only one class appears yields a 1x1 matrix and the
    # four-way unpack raises ValueError before the zero guards can apply.
    # Boolean labels compare equal to 0/1, so this covers both encodings.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    positives = tp + fn
    negatives = fp + tn
    true_positive_rate = tp / positives if positives > 0 else 0.0
    false_positive_rate = fp / negatives if negatives > 0 else 0.0

    return {
        "accuracy": accuracy,
        "true_positive_rate": true_positive_rate,
        "false_positive_rate": false_positive_rate
    }


In [None]:
# === Main Execution ===
# Train and score one logistic-regression model per income threshold.
thresholds = [50000, 10000, 20000, 75000, 100000]

# The raw table is loaded once; only the label definition changes per threshold.
raw_data = load_data()

for threshold in thresholds:
    # Build features and labels for this cutoff.
    problem = define_problem(threshold)
    X, y = preprocess_data(problem, raw_data)

    # Split, scale, fit and score.
    X_train, X_test, y_train, y_test = split_and_standardize(X, y)
    model = train_logistic_regression(X_train, y_train)
    metrics = evaluate_model(model, X_test, y_test)

    # One print call with a newline separator emits the same four lines.
    report_lines = (
        f"\n=== Model Evaluation for threshold {threshold} ===",
        f"Accuracy: {metrics['accuracy']:.4f}",
        f"True Positive Rate (Recall): {metrics['true_positive_rate']:.4f}",
        f"False Positive Rate: {metrics['false_positive_rate']:.4f}",
    )
    print(*report_lines, sep="\n")

Downloading data for 2018 1-Year person survey for IL...

=== Model Evaluation for threshold 50000 ===
Accuracy: 0.7675
True Positive Rate (Recall): 0.6673
False Positive Rate: 0.1670

=== Model Evaluation for threshold 10000 ===
Accuracy: 0.9102
True Positive Rate (Recall): 0.9758
False Positive Rate: 0.4882

=== Model Evaluation for threshold 20000 ===
Accuracy: 0.8495
True Positive Rate (Recall): 0.9471
False Positive Rate: 0.4299

=== Model Evaluation for threshold 75000 ===
Accuracy: 0.8235
True Positive Rate (Recall): 0.4381
False Positive Rate: 0.0631

=== Model Evaluation for threshold 100000 ===
Accuracy: 0.8851
True Positive Rate (Recall): 0.2742
False Positive Rate: 0.0257


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted baseline at the 50,000 income threshold.
# `raw_data` is reused from the main-execution cell, so that cell must have
# been run first.
problem = define_problem(50000)
X, y = preprocess_data(problem, raw_data)

X_train, X_test, y_train, y_test = split_and_standardize(X, y)

# Seed from the shared RANDOM_STATE constant instead of a repeated literal.
model = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Score with the same helper used for logistic regression, which guards the
# rate computations against zero denominators, instead of a second copy of
# the metric code.
xgb_metrics = evaluate_model(model, X_test, y_test)
accuracy = xgb_metrics["accuracy"]
tpr = xgb_metrics["true_positive_rate"]
fpr = xgb_metrics["false_positive_rate"]

print("\n=== Model Evaluation (XGBoost) ===")
print(f"Accuracy: {accuracy:.4f}")
print(f"True Positive Rate (Recall): {tpr:.4f}")
print(f"False Positive Rate: {fpr:.4f}")


=== Model Evaluation (XGBoost) ===
Accuracy: 0.8059
True Positive Rate (Recall): 0.7434
False Positive Rate: 0.1533
