## Binary Classification Income Prediction

Columns | Definition
---|---------
`age` | Age of Persons
`workclass` | Describe work type 
`fnlwgt` | Financial Weight
`education` | Person's education level
`martial status` | Person's martial status
`occupation` | Person's usual or principal work or business
`sex` | Gender of Person
`race` | Person's race
`capital gain` | Person's capital gain
`capital loss` | Person's capital loss
`hours per hour` | Earn per hour
`native country` | Persons native country
`income` | Whether <50k or not

In [124]:
import numpy as np
import pandas as pd

In [125]:
data = pd.read_csv("archive/train.csv")

In [126]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income_>50K'],
      dtype='object')

#### Removing spaces

In [127]:
data.columns = [cols.replace(' ', '') for cols in data.columns]
data["education"] = [cols.replace(' ', '') for cols in data["education"]]
data["marital-status"] = [cols.replace(' ', '') for cols in data["marital-status"]]
data["relationship"] = [cols.replace(' ', '') for cols in data["relationship"]]
data["race"] = [cols.replace(' ', '') for cols in data["race"]]
data["gender"] = [cols.replace(' ', '') for cols in data["gender"]]

#### Handling missing values

In [128]:
# already checked it's safe to remove
data = data.replace('?', np.nan)
original = len(data)
data.dropna(inplace=True,axis=0)
without_missing = len(data)
without_missing / original

0.926519098209614

#### Dummies for categorical columns

In [129]:
cat_columns = ['workclass','education','marital-status', 'occupation', 'relationship', 'race', 'gender','native-country']
df_dumy = pd.get_dummies(data, columns = cat_columns)

#### Here, data is ready

## Consortia testing

Let's assume a number of players, even split for now, and each comes up with their own benchmark as well

In [130]:
num_players = 4
benchmark_split = 0.1

#### Simulate raw data for each player

In [131]:
def split_even(data, num_splits):
    di_size = round(len(data) / num_splits)
    dis = []
    index = 0
    while index < len(df_dumy):
        di = df_dumy[index:index + di_size]
        index += di_size
        dis.append(di)
    return dis

In [132]:
dis = split_even(df_dumy, num_players)

#### Now we create the benchmark (test data) for each player

In [133]:
from sklearn.model_selection import train_test_split

In [134]:
dis_xy = []
for di in dis:
    X = di.drop("income_>50K",axis=1)
    y = di["income_>50K"]
    dis_xy.append((X, y))

In [135]:
dis_bis = []
for X, y in dis_xy:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=benchmark_split, random_state=101)
    dis_bis.append(((X_train, y_train), (X_test, y_test)))

#### Train process for each player

In [136]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score

In [137]:
def platform_preprocess(X_train, X_test):
    # preprocess data
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    return scaled_X_train, scaled_X_test

def platform_train_process(X_train, y_train):    
    # model selection and training
    parameters_for_testing = {
    "n_estimators"    : [50,100,150,200,250] ,
     "max_features"        : [1,2,3,4,5],
    }
    model = RandomForestClassifier()
    kfold = KFold(n_splits=10, random_state=None)  # None, changed from 42
    grid_cv = GridSearchCV(estimator=model, param_grid=parameters_for_testing, scoring='accuracy', cv=kfold)
    result = grid_cv.fit(X_train, y_train)
    print("Best: {} using {}".format(result.best_score_, result.best_params_))
    
    # model training
    tuned_model = RandomForestClassifier(n_estimators=result.best_params_['n_estimators'],
                                         max_features=result.best_params_['max_features'])
    tuned_model.fit(X_train, y_train)
    
    return tuned_model

def platform_test_model(model, X_test, y_test):
    # prediction on test data (benchmark)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    return accuracy

In [138]:
models = []
for ((X_train, y_train), (X_test, y_test)) in dis_bis:
    # preprocess
    pp_X_train, pp_X_test = platform_preprocess(X_train, X_test)
    # train
    model = platform_train_process(pp_X_train, y_train)
    models.append((model, pp_X_test, y_test))

Best: 0.8386998614239524 using {'max_features': 5, 'n_estimators': 100}
Best: 0.8407742162834 using {'max_features': 5, 'n_estimators': 100}
Best: 0.8498314229521938 using {'max_features': 5, 'n_estimators': 200}
Best: 0.8486136442643326 using {'max_features': 5, 'n_estimators': 100}


In [139]:
# Model quality on individual bi
for model, pp_X_test, y_test in models:
    accuracy = platform_test_model(model, pp_X_test, y_test)
    print("Acc: " + str(accuracy))

Acc: 0.831207065750736
Acc: 0.873405299313052
Acc: 0.844946025515211
Acc: 0.845927379784102


#### Model quality of each player's model on the consortia benchmark

In [140]:
# Merge bi into bI
pp_Xs_test = [pp_X_test for _, pp_X_test, _ in models]
ys_test = [y_test for _, _, y_test in models]
pp_XI_test = np.concatenate(pp_Xs_test, axis=0)
yI_test = np.concatenate(ys_test, axis=0)

In [141]:
# Model quality on bI
for model, _, _ in models:
    accuracy = platform_test_model(model, pp_XI_test, yI_test)
    print("Acc: " + str(accuracy))

Acc: 0.8400392541707556
Acc: 0.8456820412168793
Acc: 0.8461727183513248
Acc: 0.8437193326790972


### Platform combining data and training the big model

In [147]:
##### Note this is using data transformed individually

In [148]:
Xs_train = [X_train for ((X_train, _), (_, _)) in dis_bis]
Xs_test = [X_test for ((_, _), (X_test, _)) in dis_bis]
ys_train = [y_train for ((_, y_train), (_, _)) in dis_bis]
ys_test = [y_test for ((_, _), (_, y_test)) in dis_bis]
XI_train = np.concatenate(Xs_train, axis=0)
XI_test = np.concatenate(Xs_test, axis=0)
yI_train = np.concatenate(ys_train, axis=0)
yI_test = np.concatenate(ys_test, axis=0)

In [153]:
pp_XI_train, pp_XI_test = platform_preprocess(XI_train, XI_test)
big_model = platform_train_process(pp_XI_train, yI_train)

Best: 0.847480427422374 using {'max_features': 5, 'n_estimators': 100}


In [154]:
accuracy = platform_test_model(big_model, pp_XI_test, yI_test)
print("Acc: " + str(accuracy))

Acc: 0.8483807654563298
