# Assignment 1
Authors: Kamil Kojs, János Máté and Jorge del Pozo Lérida

### Dataset
US Census data from https://github.com/zykls/folktables. 

We use data of individuals from the state of California in 2018. 
More details on the dataset can be found in the accompanying
paper at https://arxiv.org/pdf/2108.04884.pdf.

In [2]:
from folktables.acs import adult_filter
from folktables import ACSDataSource
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# 2018 1-Year ACS person-level survey, restricted to California ("CA").
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)
acs_data = data_source.get_data(states=["CA"], download=True)

# Columns used as model inputs.
feature_names = [
    'AGEP',   # Age
    "CIT",    # Citizenship status
    'COW',    # Class of worker
    "ENG",    # Ability to speak English
    'SCHL',   # Educational attainment
    'MAR',    # Marital status
    "HINS1",  # Insurance through a current or former employer or union
    "HINS2",  # Insurance purchased directly from an insurance company
    "HINS4",  # Medicaid
    "RAC1P",  # Recoded detailed race code
    'SEX',    # Sex
]

# Column from which the binary label is derived.
target_name = "PINCP"  # Total person's income

def data_processing(data, features, target_name: str, threshold: float = 35000):
    """Filter the raw rows, derive a binary label and one-hot encode features.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records. Must contain the columns ``SEX``, ``RAC1P``, ``AGEP``,
        ``PINCP``, ``WKHP`` and ``PWGTP`` (used for filtering), every name in
        ``features``, and ``target_name``. The frame is not modified.
    features : list of str
        Columns to keep as model inputs.
    target_name : str
        Numeric column that is compared against ``threshold``.
    threshold : float, default 35000
        A row is labelled ``True`` when ``data[target_name] > threshold``.

    Returns
    -------
    df_processed : pandas.DataFrame
        ``features`` with the categorical columns one-hot encoded
        (first level dropped; ``ENG`` additionally gets a NaN indicator).
    df : pandas.DataFrame
        Un-encoded ``features`` plus ``"target"`` and ``target_name``, kept
        before one-hot encoding for the bias analysis.
    target : numpy.ndarray
        Boolean label per kept row.
    sex : numpy.ndarray
        Value of ``SEX`` per kept row (the group of interest).
    """
    features = list(features)

    ### Adult Filter (from folktables), preceded by two null checks.
    # All conditions are combined into a single mask so the wide input frame
    # is sliced once instead of being copied after every condition.
    keep_rows = (
        data["SEX"].notnull()
        & data["RAC1P"].notnull()
        & (data["AGEP"] > 16)
        & (data["PINCP"] > 100)
        & (data["WKHP"] > 0)
        & (data["PWGTP"] >= 1)
    )

    # Explicit copy of only the needed columns: adding "target" below then
    # neither warns (SettingWithCopyWarning) nor touches the caller's frame.
    needed = list(dict.fromkeys([*features, target_name, "SEX"]))
    df = data.loc[keep_rows, needed].copy()

    ### Groups of interest
    sex = df["SEX"].values

    ### Target
    df["target"] = df[target_name] > threshold
    target = df["target"].values

    # Keep the frame as it is before one-hot encoding for the bias analysis.
    df = df[features + ["target", target_name]]

    df_processed = df[features].copy()

    # Only encode categorical columns that were actually requested, so the
    # function also works for a subset of the default feature list.
    categorical = ["HINS1", "HINS2", "HINS4", "CIT", "COW", "SCHL", "MAR", "SEX", "RAC1P"]
    cols = [name for name in categorical if name in features]
    if cols:
        df_processed = pd.get_dummies(df_processed, prefix=None, prefix_sep='_', dummy_na=False, columns=cols, drop_first=True)
    if "ENG" in features:
        # dummy_na=True: missing ENG values get their own indicator column.
        df_processed = pd.get_dummies(df_processed, prefix=None, prefix_sep='_', dummy_na=True, columns=["ENG"], drop_first=True)
    return df_processed, df, target, sex

# Pre-process the raw survey rows; `group` receives the fourth return value
# of data_processing (the SEX value of every kept row).
data, data_original, target, group = data_processing(
    data=acs_data,
    features=feature_names,
    target_name=target_name,
)

# Hold out 20% of the rows for testing; features, labels and group membership
# are split together so they stay aligned. The seed is fixed.
(X_train, X_test,
 y_train, y_test,
 group_train, group_test) = train_test_split(
    data, target, group, test_size=0.2, random_state=0
)

In [3]:
# Summary statistics of the one-hot encoded feature frame returned by data_processing.
data.describe()

Unnamed: 0,AGEP,HINS1_2,HINS2_2,HINS4_2,CIT_2,CIT_3,CIT_4,CIT_5,COW_2.0,COW_3.0,...,RAC1P_4,RAC1P_5,RAC1P_6,RAC1P_7,RAC1P_8,RAC1P_9,ENG_2.0,ENG_3.0,ENG_4.0,ENG_nan
count,195665.0,195665.0,195665.0,195665.0,195665.0,195665.0,195665.0,195665.0,195665.0,195665.0,...,195665.0,195665.0,195665.0,195665.0,195665.0,195665.0,195665.0,195665.0,195665.0,195665.0
mean,42.734914,0.348913,0.865377,0.856689,0.002009,0.012981,0.177078,0.135262,0.0706,0.080571,...,6.6e-05,0.0023,0.167168,0.003256,0.11649,0.041939,0.087231,0.056183,0.019227,0.577916
std,14.884622,0.476628,0.341321,0.350391,0.044772,0.113194,0.381736,0.342004,0.256157,0.272176,...,0.008151,0.047902,0.373127,0.056965,0.320812,0.20045,0.282174,0.230275,0.137321,0.493893
min,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,42.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,55.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,94.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Task1: Classifiers and fairness considerations

### Black-box model

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics  

# Scale training and test data.
# The scaler is fitted on the training split only and then applied to the
# test split, so the test data does not influence the scaling parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Registry of fitted models: name -> [estimator, predictions on the test split]
models = {}

# RandomForest
# random_state is fixed so that re-running the cell reproduces the same
# forest, matching the other seeded models in this notebook.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)
models['RandomForest'] = [rf_model, y_pred_rf]

# GradientBoostingClassifier
learning_rate = 0.5
gb_model = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
gb_model.fit(X_train_scaled, y_train)
y_pred_gb = gb_model.predict(X_test_scaled)
models['GradientBoostingClassifier'] = [gb_model, y_pred_gb]

# # Support Vector Machine (disabled: training takes too long)
# svm_model = SVC()
# svm_model.fit(X_train_scaled, y_train)
# y_pred_svm = svm_model.predict(X_test_scaled)
# models['SVM'] = [svm_model, y_pred_svm]

# TO DO:
# - Apply one-hot encoding
# - Extra/different scaling required?
# - Feature engineering required?
# - Add prediction of y_pred
# - SVM is taking too long


### White-box model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# LogisticRegression
logreg_model = LogisticRegression(random_state=16)
logreg_model.fit(X_train_scaled, y_train)
y_pred_logreg = logreg_model.predict(X_test_scaled)
models['LogisticRegression'] = [logreg_model, y_pred_logreg]

# Decision Tree
# Fit and predict in the same (scaled) feature space. Fitting on the raw
# X_train while predicting on X_test_scaled would apply split thresholds
# learned on raw values to standardised inputs and give meaningless predictions.
# random_state is fixed so the fitted tree is reproducible across runs.
dt_model = DecisionTreeClassifier(random_state=0)
dt_model.fit(X_train_scaled, y_train)
y_pred_dt = dt_model.predict(X_test_scaled)
models['DecisionTree'] = [dt_model, y_pred_dt]





### Fairness metrics

In [25]:
def get_fairnessmetrics(model):
    """Placeholder for computing the fairness metrics of one model.

    Not implemented yet: ``model`` is ignored and a tuple of three ``None``
    values is returned.
    """
    # TODO: code for calculating fairness metrics
    placeholder = (None,) * 3
    return placeholder
def plot_fairnessmetrics(model):
    """Placeholder for plotting the fairness results of one model.

    Not implemented yet: ``model`` is ignored and ``None`` is returned.
    """
    # TODO: code for plotting results of fairness data
    result = None
    return result

In [26]:
# One accumulator per fairness metric, filled in the iteration order of `models`.
statisticalparities = []
equalized_odds = []
equalityofoutcomes = []

for model_name, model in models.items():
    print(model_name)
    # `model` is the value stored in `models`: [estimator, test-set predictions].
    statisticalparity, equalizedodd, eq_of_outcome = get_fairnessmetrics(model)

    # TODO: Make plots and save into object

    # Append metrics so they are available after the loop instead of being
    # overwritten on every iteration.
    statisticalparities.append(statisticalparity)
    equalized_odds.append(equalizedodd)
    equalityofoutcomes.append(eq_of_outcome)

RandomForest
GradientBoostingClassifier
LogisticRegression
DecisionTree


### Countermeasures for fairness

In [27]:
# Change classification pipeline to fulfill ONE fairness criterion

# Task2: Explaining white-box models

# Task3: Model-agnostic explanations

# Task4: Reflection