In [1]:
import pandas as pd
from schema import INDEX_COL, PREDICTOR_COLS, PROTECTED_GROUP_COL, RETAINED_COL
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


In [2]:
# Columns to load: the row identifier, the two target columns, and every predictor.
predictor_cols = list(PREDICTOR_COLS)
target_cols = [PROTECTED_GROUP_COL, RETAINED_COL]
usecols = [INDEX_COL] + target_cols + predictor_cols

# The index column is taken from INDEX_COL (the same constant that feeds
# `usecols`) instead of a repeated string literal, so the two cannot drift apart.
# A lone space in the CSV is parsed as a missing value.
# Rows missing either target are dropped; missing predictors are kept.
train = pd.read_csv(
    "../data/train.csv",
    index_col=INDEX_COL,
    usecols=usecols,
    na_values=" ",
).dropna(subset=target_cols)
train.head()


Unnamed: 0_level_0,Protected_Group,Retained,SJ_Most_1,SJ_Least_1,SJ_Time_1,SJ_Most_2,SJ_Least_2,SJ_Time_2,SJ_Most_3,SJ_Least_3,...,PScale11_Q4,PScale12_Q1,PScale12_Q2,PScale12_Q3,PScale12_Q4,PScale13_Q1,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5
UNIQUE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
245021089,0.0,1,3.0,4.0,95.0,3.0,2.0,101.0,1.0,4.0,...,1.0,1.0,1.0,3.0,4.0,1.0,2.0,3.0,2.0,1.0
245181465,1.0,0,3.0,4.0,53.0,3.0,2.0,57.0,1.0,4.0,...,1.0,1.0,1.0,4.0,4.0,2.0,1.0,4.0,4.0,4.0
229682665,1.0,0,2.0,4.0,89.0,1.0,4.0,73.0,1.0,3.0,...,4.0,1.0,1.0,4.0,4.0,1.0,1.0,4.0,4.0,4.0
245174982,0.0,1,2.0,1.0,110.0,1.0,2.0,86.0,1.0,4.0,...,2.0,1.0,1.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0
244979030,0.0,1,3.0,2.0,35.0,3.0,2.0,21.0,1.0,4.0,...,4.0,2.0,1.0,4.0,4.0,2.0,1.0,4.0,3.0,2.0


In [3]:
# Separate predictors from targets, then hold out 20% of the rows for evaluation.
X, y = train[predictor_cols], train[target_cols]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,  # fixed seed so the split is reproducible across runs
)


In [4]:
# Base learner: gradient-boosted trees (1000 boosting rounds, 0.05 learning
# rate, 4 worker threads).
estimator = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
)
# Wrap the base learner so a separate copy can be fitted per target.
model = OneVsRestClassifier(estimator=estimator)


In [5]:
# Chain preprocessing and the classifier so that imputation and scaling
# statistics are learned from the training split only.
pipeline = Pipeline([
    ('imputer', SimpleImputer()),   # fill missing predictor values (default strategy)
    ('scaler', StandardScaler()),   # standardise each predictor
    ('model', model),
])
# `fit` returns the fitted pipeline, so prediction on the held-out rows can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


In [6]:
# Hold-out accuracy for the protected-group target.
# The prediction column is looked up by name in `target_cols` instead of a
# hard-coded position, so it stays aligned with the column order of `y`;
# the true labels are passed as a 1-D Series rather than a one-column frame.
accuracy_score(
    y_test[PROTECTED_GROUP_COL],
    y_pred[:, target_cols.index(PROTECTED_GROUP_COL)],
)


0.7363646671958272

In [7]:
# Hold-out accuracy for the retention target.
# The prediction column is looked up by name in `target_cols` instead of a
# hard-coded position, so it stays aligned with the column order of `y`;
# the true labels are passed as a 1-D Series rather than a one-column frame.
accuracy_score(
    y_test[RETAINED_COL],
    y_pred[:, target_cols.index(RETAINED_COL)],
)


0.5568658578070076