In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import joblib

from sklearn.ensemble import GradientBoostingClassifier # gradient boosting
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score

from datetime import datetime

In [38]:
# Read the three pre-split CSV files (train, validation, test) in one pass.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'cs5228_finalproject/df_train_v2.csv',
        'cs5228_finalproject/df_val_v2.csv',
        'cs5228_finalproject/df_test_new.csv',
    )
)

# Train stacked on top of validation (validation restricted to the train
# columns, in train order) with a fresh 0..n-1 index.
df_full_train = pd.concat(
    [df_train, df_val.loc[:, df_train.columns]],
    axis=0,
    ignore_index=True,
)

In [3]:
# Columns treated as categorical by the preprocessing transformers.
categorical_cols = [
    'workclass',
    'education',
    'occupation',
    'native-country',
]

# Columns cast to float64 by the preprocessing transformers.
float_cols = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Module-level state shared between the train-mode and test-mode pipelines:
#   value_dict   : column name -> values recorded by Clean(set_value_dict=True)
#   col_ordering : column order recorded by GetDummies(get_columns=True)
value_dict = dict()
col_ordering = list()

In [4]:
def f1(y_test, y_pred):
    """Return the weighted-average F1 score as a percentage, rounded to 2 decimals."""
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    percentage = weighted_f1 * 100
    return round(percentage, 2)

def acc(y_test, y_pred):
    """Return the accuracy as a percentage, rounded to 2 decimals."""
    accuracy = accuracy_score(y_test, y_pred)
    percentage = accuracy * 100
    return round(percentage, 2)

# Feature Engineering

In [5]:
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    """Integer-encode the given columns with one ``LabelEncoder`` per column.

    Each encoded column is removed from its original position and re-appended
    at the end of the frame, in the order given by ``columns``. The input
    frame itself is not modified.

    Parameters
    ----------
    columns : list of str
        Names of the columns to encode.

    Notes
    -----
    ``fit`` learns one encoder per column and ``transform`` reuses it, so a
    frame transformed after fitting on another frame gets the same integer
    codes (``LabelEncoder.transform`` raises ``ValueError`` on labels it has
    not seen). If ``transform`` is called without a prior ``fit``, the
    encoders are fitted on the frame being transformed. ``fit_transform``
    therefore gives the same codes as fitting on the transformed frame.
    """

    def __init__(self, columns):
        # Stored under the parameter's own name so that BaseEstimator's
        # get_params / set_params / clone can find it.
        self.columns = columns

    @property
    def cols(self):
        """Backward-compatible alias for ``columns``."""
        return self.columns

    @cols.setter
    def cols(self, value):
        self.columns = value

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per configured column of ``X``."""
        self.encoders_ = {col: LabelEncoder().fit(X[col]) for col in self.columns}
        return self

    def transform(self, X, y=None):
        """Return a new frame with the configured columns integer-encoded."""
        fitted = getattr(self, 'encoders_', {})
        result = X
        for col in self.columns:
            encoder = fitted.get(col)
            if encoder is None:
                # No fitted encoder for this column: fit on the data at hand.
                codes = LabelEncoder().fit_transform(result[col])
            else:
                codes = encoder.transform(result[col])
            # Keep the frame's own index: concat aligns on index labels, so a
            # default RangeIndex here would misalign rows (and introduce NaNs)
            # whenever X does not carry a 0..n-1 index.
            encoded = pd.Series(codes, index=result.index, name=col)
            result = pd.concat([result.drop(columns=[col]), encoded], axis=1)
        return result

In [6]:
# Education levels that get merged into a single group each.
# (The raw values carry a leading space.)
before_hs = [
    ' 12th',
    ' 10th',
    ' 9th',
    ' 5th-6th',
    ' 11th',
    ' 7th-8th',
    ' 1st-4th',
    ' Preschool',
]
assoc = [
    ' Assoc-voc',
    ' Assoc-acdm',
]
post_grad = [
    ' Masters',
    ' Doctorate',
    ' Prof-school',
]

# Workclass values that get merged into a single group each.
govt = [
    ' State-gov',
    ' Local-gov',
    ' Federal-gov',
]
others = [
    ' ?',
    ' Never-worked',
    ' Without-pay',
]

class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless feature engineering step.

    ``transform`` returns a new frame (the input is left untouched) in which:

    * every ``' ?'`` cell is replaced by ``'others'``;
    * an ``investor`` column is added: 1 where ``capital-gain`` or
      ``capital-loss`` differs from that column's minimum, else 0;
    * ``education`` values listed in ``before_hs`` / ``assoc`` / ``post_grad``
      are collapsed to ``'before-hs'`` / ``'assoc'`` / ``'post_grad'``;
    * ``marital-status`` is mapped to ``'Single'`` / ``'Married'`` (values
      missing from the mapping become NaN, as with any ``Series.map``);
    * ``workclass`` values listed in ``govt`` / ``others`` are collapsed to
      ``'govt'`` / ``'others'``.
    """

    # Raw marital-status value -> coarse group.
    _MARITAL_STATUS_GROUPS = {
        ' Never-married': 'Single',
        ' Divorced': 'Single',
        ' Separated': 'Single',
        ' Widowed': 'Single',
        ' Married-civ-spouse': 'Married',
        ' Married-spouse-absent': 'Married',
        ' Married-AF-spouse': 'Married',
    }

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X`` (see the class docstring)."""
        # DataFrame.replace returns a new frame; the result has to be kept,
        # otherwise the replacement is silently lost. Working on that new
        # frame also keeps the caller's X unmodified.
        frame = X.replace(' ?', 'others')

        # investor: 1 if capital-gain or capital-loss is above the column
        # minimum (i.e. there is any gain or loss), else 0.
        has_gain = frame['capital-gain'] != frame['capital-gain'].min()
        has_loss = frame['capital-loss'] != frame['capital-loss'].min()
        frame['investor'] = (has_gain | has_loss).astype('int64')

        # Collapse fine-grained education levels.
        for members, label in (
            (before_hs, 'before-hs'),
            (assoc, 'assoc'),
            (post_grad, 'post_grad'),
        ):
            frame.loc[frame['education'].isin(members), 'education'] = label

        # Collapse marital-status into Single / Married.
        frame['marital-status'] = frame['marital-status'].map(self._MARITAL_STATUS_GROUPS)

        # Collapse workclass. ' ?' has already become 'others' above; the
        # remaining members of `others` are folded in here.
        for members, label in (
            (govt, 'govt'),
            (others, 'others'),
        ):
            frame.loc[frame['workclass'].isin(members), 'workclass'] = label

        return frame

In [7]:
class DropFeatures(BaseEstimator, TransformerMixin):
    """Drop a fixed list of columns from the frame.

    Parameters
    ----------
    columns : list of str
        Names of the columns to remove. ``DataFrame.drop`` raises ``KeyError``
        if any of them is missing from the frame.
    """

    def __init__(self, columns):
        # Stored under the parameter's own name so that BaseEstimator's
        # get_params / set_params / clone can find it.
        self.columns = columns

    @property
    def cols(self):
        """Backward-compatible alias for ``columns``."""
        return self.columns

    @cols.setter
    def cols(self, value):
        self.columns = value

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without the configured columns."""
        return X.drop(self.columns, axis=1)

In [8]:
class Clean(BaseEstimator, TransformerMixin):
    """Record, or enforce, the set of known values per column.

    The known values live in the module-level ``value_dict``.

    Parameters
    ----------
    set_value_dict : bool, default False
        * ``True``  - record ``X[col].unique()`` for every column of ``X`` in
          ``value_dict`` and return ``X`` unchanged.
        * ``False`` - return a copy of ``X`` in which every cell of a column
          in ``categorical_cols`` whose value was not recorded (or is
          missing) is replaced by ``'others'``. Raises ``KeyError`` if a
          categorical column has no entry in ``value_dict`` yet.
    """

    def __init__(self, set_value_dict=False):
        self.set_value_dict = set_value_dict

    def fit(self, X, y=None):
        """Nothing to learn here; the recording happens in ``transform``."""
        return self

    def transform(self, X, y=None):
        """Record the values of ``X`` or replace unknown values (see class docstring)."""
        if self.set_value_dict:
            # Item assignment on the module-level dict; no rebinding needed.
            for col in X.columns:
                value_dict[col] = X[col].unique()
            return X

        # Work on a copy so the caller's frame is left untouched.
        cleaned = X.copy()
        for col in categorical_cols:
            column = cleaned[col]
            # Boolean mask instead of a per-row `.loc[i, col]` lookup: this
            # does not depend on the frame having a 0..n-1 index and avoids
            # one Python-level `.loc` call per row. Missing values are
            # treated as unknown, as the element-wise membership test did.
            unknown = ~column.isin(value_dict[col]) | column.isna()
            cleaned.loc[unknown, col] = 'others'
        return cleaned

In [9]:
# Pass in categorical columns
class GetDummies(BaseEstimator, TransformerMixin):
    """Cast every column to a fixed dtype, then one-hot encode the frame.

    Parameters
    ----------
    cat_columns : list of str
        Columns cast to ``category`` (these are the ones ``pd.get_dummies``
        expands).
    float_columns : list of str
        Columns cast to ``float64``.
    get_columns : bool, default False
        If True, the columns of the encoded frame are stored in the
        module-level ``col_ordering``.

    Every column that is in neither list is cast to ``int64``.
    """

    def __init__(self, cat_columns, float_columns, get_columns=False):
        self.cat_columns = cat_columns
        self.float_columns = float_columns
        self.get_columns = get_columns

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def _dtype_for(self, col):
        """Return the dtype name column ``col`` is cast to."""
        if col in self.cat_columns:
            return 'category'
        if col in self.float_columns:
            return 'float64'
        return 'int64'

    def transform(self, X, y=None):
        """Return the one-hot encoded frame; ``X`` itself is not modified."""
        global col_ordering

        # A single astype with a per-column mapping returns a new frame, so
        # the dtype changes do not leak into the caller's frame.
        typed = X.astype({col: self._dtype_for(col) for col in X.columns})
        dummies = pd.get_dummies(typed)
        if self.get_columns:
            col_ordering = dummies.columns
        return dummies

In [10]:
class SetFeatures(BaseEstimator, TransformerMixin):
    """Align a frame with the column layout stored in ``col_ordering``.

    Parameters
    ----------
    set_columns : bool, default False
        * ``False`` - pass ``X`` through unchanged.
        * ``True``  - return a frame with exactly the columns of the
          module-level ``col_ordering``, in that order. Columns missing from
          ``X`` are added and filled with 0; columns of ``X`` that are not in
          ``col_ordering`` are dropped.
    """

    def __init__(self, set_columns=False):
        self.set_columns = set_columns

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` aligned to ``col_ordering`` (or ``X`` itself)."""
        if not self.set_columns:
            return X
        # reindex adds the missing columns (filled with 0), drops the extra
        # ones and orders the rest in one step, without inserting columns
        # into the caller's frame one at a time.
        return X.reindex(columns=col_ordering, fill_value=0)

In [11]:
# Train-mode preprocessing: Clean records the values it sees in `value_dict`,
# GetDummies records the encoded column order in `col_ordering`, and
# SetFeatures (default arguments) passes the frame through unchanged.
pp_train = Pipeline(steps=[
    ('feature_engineering', FeatureEngineering()),
    ('gender_transformer', LabelEncoderTransformer(['sex', 'marital-status'])),
    ('drop_features', DropFeatures(['education-num', 'relationship'])),
    ('clean', Clean(set_value_dict=True)),
    ('get_dummies', GetDummies(categorical_cols, float_cols, get_columns=True)),
    ('set_features', SetFeatures()),
])

# NOTE: `df_train` is rebound to its transformed version here, so this cell
# must not be re-run without reloading the raw data first.
df_train = pp_train.fit_transform(df_train)

# Split the transformed frame into features and target.
df_train_x = df_train.drop(columns='class')
df_train_y = df_train['class']

In [12]:
# Test-mode preprocessing: Clean replaces values missing from `value_dict`
# with 'others', and SetFeatures aligns the encoded frame to `col_ordering`.
pp_test = Pipeline(steps=[
    ('feature_engineering', FeatureEngineering()),
    ('gender_transformer', LabelEncoderTransformer(['sex', 'marital-status'])),
    ('drop_features', DropFeatures(['education-num', 'relationship'])),
    ('clean', Clean()),
    ('get_dummies', GetDummies(categorical_cols, float_cols)),
    ('set_features', SetFeatures(set_columns=True)),
])

# NOTE: `df_val` is rebound to its transformed version here.
df_val = pp_test.fit_transform(df_val)

# Split the transformed frame into features and target.
df_val_x = df_val.drop(columns='class')
df_val_y = df_val['class']

In [13]:
# Run the test-mode pipeline on the test split (rebinds `df_test` to the
# transformed frame), then separate features from the `class` column.
df_test = pp_test.fit_transform(df_test)
df_test_x = df_test.drop(columns='class')
df_test_y = df_test['class']

In [14]:
# Search grid: 2 losses x 4 leaf sizes x 7 depths = 56 candidates.
# NOTE(review): newer scikit-learn releases renamed the 'deviance' loss to
# 'log_loss' - confirm against the installed version before re-running.
parameters = dict(
    loss=['deviance', 'exponential'],
    min_samples_leaf=[5, 6, 7, 8],
    max_depth=list(range(3, 10)),
)

# Base estimator and a seeded, shuffled 10-fold stratified split.
gbm = GradientBoostingClassifier(warm_start=True, random_state=5228)
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5228)

# Exhaustive search scored by weighted F1, run on 4 workers.
clf = GridSearchCV(
    gbm,
    parameters,
    scoring='f1_weighted',
    cv=kf,
    n_jobs=4,
    verbose=1,
)
clf.fit(df_train_x, df_train_y)

Fitting 10 folds for each of 56 candidates, totalling 560 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   39.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 11.9min
[Parallel(n_jobs=4)]: Done 560 out of 560 | elapsed: 16.8min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=5228, shuffle=True),
             error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2...
                                                  n_iter_no_change=None,
                                                  presort='deprecated',
          

In [15]:
# Score the tuned model on the validation split: weighted F1 on the first
# line, accuracy on the second (both in percent).
y_pred_xgbc = clf.predict(df_val_x)
print(f1(df_val_y, y_pred_xgbc), acc(df_val_y, y_pred_xgbc), sep='\n')

86.05
86.6


# Retraining on the full training set (train + validation)

In [40]:
# Start from empty shared state so that the recorded values and the column
# order come from the combined train + validation frame only.
value_dict, col_ordering = {}, []

# Same train-mode pipeline as before, rebuilt for the full training frame.
pp_train = Pipeline(steps=[
    ('feature_engineering', FeatureEngineering()),
    ('gender_transformer', LabelEncoderTransformer(['sex', 'marital-status'])),
    ('drop_features', DropFeatures(['education-num', 'relationship'])),
    ('clean', Clean(set_value_dict=True)),
    ('get_dummies', GetDummies(categorical_cols, float_cols, get_columns=True)),
    ('set_features', SetFeatures()),
])

# NOTE: `df_full_train` is rebound to its transformed version here.
df_full_train = pp_train.fit_transform(df_full_train)

# Split the transformed frame into features and target.
df_full_train_x = df_full_train.drop(columns='class')
df_full_train_y = df_full_train['class']

In [41]:
# Refit the best grid-search configuration on the full training frame.
gbm_new = GradientBoostingClassifier(warm_start=True, random_state=5228)
gbm_new.set_params(**clf.best_params_)
gbm_new.fit(df_full_train_x, df_full_train_y)

# `df_test_x` was encoded with the `value_dict` / `col_ordering` recorded by
# the train-only pipeline. Those were reset and re-recorded from the full
# training frame above, so the test split is re-encoded from the raw CSV to
# guarantee that its columns match what `gbm_new` was fitted on.
df_test_refit = pp_test.fit_transform(pd.read_csv('cs5228_finalproject/df_test_new.csv'))
df_test_refit_x = df_test_refit.drop(columns='class')
assert list(df_test_refit_x.columns) == list(df_full_train_x.columns), \
    'test features do not match the columns the model was fitted on'

gbm_new_pred = gbm_new.predict(df_test_refit_x)

# Submission file: 1-based row id plus the predicted class, stamped with
# today's date (ddmmyy).
output_df = pd.DataFrame({
    'id': range(1, len(gbm_new_pred) + 1),
    'prediction': gbm_new_pred,
})
output_df.to_csv('predictions_gbm_' + datetime.now().strftime('%d%m%y') + '_all.csv', index = False)