# Authors: Jakub Bandurski, Anirban Das

## This notebook performs all necessary computations needed to generate predictions of the final model for the classification task

To execute this file simply install all dependencies and run cell by cell.

Input: train and test sets provided by the lecturer

Output: final prediction, processed train and test sets

## 0. Imports

In [19]:
import pandas as pd
import numpy as np
from time import time
# preprocessing
# explicitly require this experimental feature to run imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder
# model
import xgboost as xgb

## 1. Train data preprocessing

In [29]:
# import data set
client_attrition = pd.read_csv('./../data/client_attrition_train.csv')
# encode dependant varaible as 0 and 1
client_attrition['account_status'] = client_attrition['account_status'].map(dict(closed=1, open=0))
# drop target var
y_train = client_attrition['account_status']
client_attrition.drop('account_status',axis=1,inplace=True)
# drop id
client_attrition.drop(labels='customer_id',axis=1,inplace=True)
# OneHotEncoding of not missing columns
client_attrition = pd.get_dummies(client_attrition, columns=['customer_education', 'customer_civil_status','credit_card_classification'])
# select numeric columns
numeric_columns = list(client_attrition.select_dtypes(exclude = ['object']).columns)
# select categorical columns
categorical_columns = list(client_attrition.select_dtypes(include = ['object']).columns)
# scaling before imputation
client_attrition_scaled = (client_attrition[numeric_columns]-client_attrition[numeric_columns].mean())/client_attrition[numeric_columns].std()
client_attrition_scaled['customer_sex'] = client_attrition['customer_sex']
client_attrition_scaled['customer_salary_range'] = client_attrition['customer_salary_range']
#instantiate both packages to use
encoder = OrdinalEncoder()
imputer = IterativeImputer(skip_complete=True,max_iter=10,verbose=2,random_state=12345)
# create a list of categorical columns to iterate over
cat_cols = categorical_columns

def encode(data):
    '''function to encode non-null data and replace it in the original data'''
    #retains only non-null values
    nonulls = np.array(data.dropna())
    #reshapes the data for encoding
    impute_reshape = nonulls.reshape(-1,1)
    #encode date
    impute_ordinal = encoder.fit_transform(impute_reshape)
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

#create a for loop to iterate through each column in the data
for columns in cat_cols:
    encode(client_attrition_scaled[columns])
# impute data and convert 
client_attrition_imputed = pd.DataFrame(imputer.fit_transform(client_attrition_scaled))
client_attrition_imputed.columns = client_attrition_scaled.columns
client_attrition_imputed[["customer_sex","customer_salary_range"]] = client_attrition_imputed[["customer_sex","customer_salary_range"]].round()
# convert customer_sex and customer_salary_range to factors and then one hot encode
client_attrition_imputed['customer_sex'] = client_attrition_imputed['customer_sex'].astype(int).astype(str)
client_attrition_imputed['customer_sex'] = client_attrition_imputed['customer_sex'].map({'0':"F", '1':"M"})
client_attrition_imputed['customer_salary_range'] = client_attrition_imputed['customer_salary_range'].astype(int).astype(str)
client_attrition_imputed['customer_salary_range'] = client_attrition_imputed['customer_salary_range'].map({'0':"120K and more", '1':"40-60K", '2':'60-80K','3':'80-120K','4':'Uknown','5':'below 40K'})
# One Hot Encode these columns
client_attrition_imputed = pd.get_dummies(client_attrition_imputed, columns=['customer_sex', 'customer_salary_range'])
# Scale again, already scaled columns wont be affected, imputed and new columns will, all columns are now numeric
client_attrition_imputed_scaled = (client_attrition_imputed-client_attrition_imputed.mean())/client_attrition_imputed.std()
# add y_train to train set
client_attrition_imputed_scaled = pd.concat([y_train, client_attrition_imputed_scaled],axis=1)
# save to csv
client_attrition_imputed_scaled.to_csv('client_attrition_train_processed.csv',sep=";", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.notnull()] = np.squeeze(impute_ordinal)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.notnull()] = np.squeeze(impute_ordinal)


[IterativeImputer] Completing matrix with shape (10127, 31)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.09
[IterativeImputer] Change: 3.112848513312019, scaled tolerance: 0.02489482411360622 
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.17
[IterativeImputer] Change: 1.0348486648124977, scaled tolerance: 0.02489482411360622 
[IterativeImputer] Ending imputation round 3/10, elapsed time 0.26
[IterativeImputer] Change: 0.28828216068862855, scaled tolerance: 0.02489482411360622 
[IterativeImputer] Ending imputation round 4/10, elapsed time 0.34
[IterativeImputer] Change: 0.07998240242181298, scaled tolerance: 0.02489482411360622 
[IterativeImputer] Ending imputation round 5/10, elapsed time 0.42
[IterativeImputer] Change: 0.022189352081924962, scaled tolerance: 0.02489482411360622 
[IterativeImputer] Early stopping criterion reached.


## 2. Model training

In [30]:
y_train = client_attrition_imputed_scaled['account_status']
X_train = client_attrition_imputed_scaled.drop('account_status',axis=1)

# set hyperparameters
params = {
    'eta': 1,
    'max_depth': 2,
    'lambda': 1,
    'min_child_weight': 5,
    'gamma': 0.01,
    'colsample_bytree': 1
}
# train model on train set
start = time()
model = xgb.XGBClassifier()
model.fit(X_train,y_train)
print(f"Training time: {time()-start} seconds")

Training time: 0.38705992698669434 seconds


## 3. Test data preprocessing

In [32]:
# import data
client_attrition_test = pd.read_csv('./../data/client_attrition_test.csv')
# drop id
client_attrition_test.drop(labels='customer_id',axis=1,inplace=True)
# OneHotEncoding of not missing columns
client_attrition_test = pd.get_dummies(client_attrition_test, columns=['customer_education', 'customer_civil_status','credit_card_classification'])
# Add column of 0s for Platinium value
client_attrition_test['credit_card_classification_Platinum'] = 0
# select numeric columns
numeric_columns_test = list(client_attrition_test.select_dtypes(exclude = ['object']).columns)
# select categorical columns
categorical_columns = list(client_attrition_test.select_dtypes(include = ['object']).columns)

################################ knn?
# scaling before imputation
client_attrition_test_scaled = (client_attrition_test[numeric_columns_test]-client_attrition_test[numeric_columns_test].mean())/client_attrition_test[numeric_columns_test].std()
client_attrition_test_scaled['credit_card_classification_Platinum'] = 0
client_attrition_test_scaled['customer_sex'] = client_attrition_test['customer_sex']
client_attrition_test_scaled['customer_salary_range'] = client_attrition_test['customer_salary_range']
# Imputation process
# initiate both functions to use
encoder = OrdinalEncoder()
imputer = IterativeImputer(skip_complete=True,max_iter=10,verbose=2,random_state=12345)
# create a list of categorical columns to iterate over
cat_cols = categorical_columns

def encode(data):
    '''function to encode non-null data and replace it in the original data'''
    # retains only non-null values
    nonulls = np.array(data.dropna())
    # reshapes the data for encoding
    impute_reshape = nonulls.reshape(-1,1)
    # encode date
    impute_ordinal = encoder.fit_transform(impute_reshape)
    # Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

# create a for loop to iterate through each column in the data
for columns in cat_cols:
    encode(client_attrition_test_scaled[columns])
# impute data and convert 
client_attrition_test_imputed = pd.DataFrame(imputer.fit_transform(client_attrition_test_scaled))
client_attrition_test_imputed.columns = client_attrition_test_scaled.columns
client_attrition_test_imputed[["customer_sex","customer_salary_range"]] = client_attrition_test_imputed[["customer_sex","customer_salary_range"]].round()
# convert customer_sex and customer_salary_range to factors and then one hot encode
client_attrition_test_imputed['customer_sex'] = client_attrition_test_imputed['customer_sex'].astype(int).astype(str)
client_attrition_test_imputed['customer_sex'] = client_attrition_test_imputed['customer_sex'].map({'0':"F", '1':"M"})
client_attrition_test_imputed['customer_salary_range'] = client_attrition_test_imputed['customer_salary_range'].astype(int).astype(str)
client_attrition_test_imputed['customer_salary_range'] = client_attrition_test_imputed['customer_salary_range'].map({'0':"120K and more", '1':"40-60K", '2':'60-80K','3':'80-120K','4':'Uknown','5':'below 40K'})
client_attrition_test_imputed = pd.get_dummies(client_attrition_test_imputed, columns=['customer_sex', 'customer_salary_range'])
# scale again, already scaled columns wont be affected, imputed and new columns will, all columns are now numeric
client_attrition_test_imputed_scaled = (client_attrition_test_imputed-client_attrition_test_imputed.mean())/client_attrition_test_imputed.std()
client_attrition_test_imputed_scaled['credit_card_classification_Platinum'] = 0
# save processed data set
client_attrition_test_imputed_scaled.to_csv('client_attrition_test_processed.csv',sep=";")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.notnull()] = np.squeeze(impute_ordinal)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.notnull()] = np.squeeze(impute_ordinal)


[IterativeImputer] Completing matrix with shape (5063, 31)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.06
[IterativeImputer] Change: 2.4077439150829094, scaled tolerance: 0.07114070703550365 
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.15
[IterativeImputer] Change: 0.5850500650415553, scaled tolerance: 0.07114070703550365 
[IterativeImputer] Ending imputation round 3/10, elapsed time 0.20
[IterativeImputer] Change: 0.09080384604186786, scaled tolerance: 0.07114070703550365 
[IterativeImputer] Ending imputation round 4/10, elapsed time 0.25
[IterativeImputer] Change: 0.014075500465657309, scaled tolerance: 0.07114070703550365 
[IterativeImputer] Early stopping criterion reached.


## 4. Prediction

In [41]:
# correct arrangemnt of columns
cols_when_model_builds = model.get_booster().feature_names
client_attrition_test_imputed_scaled = client_attrition_test_imputed_scaled[cols_when_model_builds]
X_test = client_attrition_test_imputed_scaled
# prediction
y_pred = model.predict(X_test)
# save prediction
np.savetxt("classification_prediction.csv", y_pred.astype(int), fmt='%i' , delimiter=",")

In [37]:
# fraction of predicted 1s is similar to the train set 16%
np.count_nonzero(y_pred)/y_pred.size

0.13114754098360656