In [1]:
# Get the training data
# Clean/preprocess/transform the data
# Train a machine learning model
# Evaluate and optimise the model
# Clean/preprocess/transform new data
# Fit the model on new data to make predictions.

In [2]:
import pandas as pd
import numpy as np
import load_data
from sklearn.model_selection import train_test_split
import imblearn
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import accuracy_score,classification_report,recall_score,confusion_matrix, roc_auc_score, precision_score, f1_score, roc_curve, auc, plot_confusion_matrix,plot_roc_curve
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor

from catboost import CatBoostClassifier

pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [3]:
#Load Data
mypath = "../data/"
mydata = load_data.get_file_names(mypath)
data_files = load_data.load_copy_data(mydata, mypath)

test = data_files['data_merged']
data = data_files['data_merged']

file name: X_train_des
file name: y_test_des
file name: X_test_des
file name: train_month_1
file name: data_merged
file name: train_month_2
file name: test_month_1
file name: test_month_3
file name: test_month_2
file name: y_train_des
file name: train_month_3_with_target


In [4]:
load_data.calc_missing(data)

Unnamed: 0,Missing_Number,Missing_Percent
customer_education_y,47125,0.739831
customer_education,47125,0.739831
customer_education_x,47125,0.739831
customer_children,23364,0.366799
customer_children_y,23065,0.362105
customer_children_x,23056,0.361964
customer_relationship,14899,0.233904
customer_relationship_y,14476,0.227263
customer_relationship_x,14456,0.226949
customer_occupation_code_y,2002,0.03143


In [5]:
# bal_insurance_21: balance on "tak 21" life insurance
# bal_insurance_23: balance on "tak 23" life insurance
# cap_life_insurance_fixed_cap: capital for life insurance with fixed capital
# cap_life_insurance_decreasing_cap: capital for life insurance with decreasing capital
# prem_fire_car_other_insurance: premiums paid for fire/car/other insurance
# bal_personal_loan: outstanding balance on personal loans
# bal_mortgage_loan: outstanding balance on mortgage loans
# bal_current_account: balance on current (checkings) accounts
# bal_pension_saving: balance on pension (retirement) savings accounts
# bal_savings_account: balance on savings accounts
# bal_current_account_starter: balance on starter current (checkings) accounts
# bal_savings_account_starter: balance on starter savings accounts

def percentage_change(col1,col2):
    return ((col2 - col1) / col1) * 100


list_balances = [['bal_insurance_21_x', 'bal_insurance_21_y', 'bal_insurance_21'],
                 ['bal_insurance_23_x', 'bal_insurance_23_y', 'bal_insurance_23'],
                 ['bal_personal_loan_x', 'bal_personal_loan_y', 'bal_personal_loan'],
                 ['bal_mortgage_loan_x', 'bal_mortgage_loan_y', 'bal_mortgage_loan'],
                 ['bal_current_account_x', 'bal_current_account_y', 'bal_current_account'],
                 ['bal_pension_saving_x', 'bal_pension_saving_y', 'bal_pension_saving'],
                 ['bal_savings_account_x', 'bal_savings_account_y', 'bal_savings_account'],
                ]

for balance_type in list_balances:
    print(balance_type[2])
    data['{}_1'.format(balance_type[2])] = percentage_change(data[balance_type[0]],data[balance_type[1]]) 
    data['{}_2'.format(balance_type[2])] = percentage_change(data[balance_type[1]],data[balance_type[2]]) 
    data['{}_3'.format(balance_type[2])] = percentage_change(data[balance_type[0]],data[balance_type[2]]) 

    data['{}_1'.format(balance_type[2])] = data['{}_1'.format(balance_type[2])].fillna(0)
    data['{}_2'.format(balance_type[2])] = data['{}_2'.format(balance_type[2])].fillna(0)
    data['{}_3'.format(balance_type[2])] = data['{}_3'.format(balance_type[2])].fillna(0)                                                     
                                                             

bal_insurance_21
bal_insurance_23
bal_personal_loan
bal_mortgage_loan
bal_current_account
bal_pension_saving
bal_savings_account


In [6]:
##Explore data for Debug purposes

# for column in data.columns:
    #if data[column].nunique() <= 10:
    #if data[column].dtypes == 'float64':
#     if column == 'target':
#         print(f'col name:{column} and unique values: {data[column].unique()}')

# data.dtypes
# data.isnull().sum()
# data.nunique()
data.corr()['target']

Unnamed: 0                            -0.005110
homebanking_active_x                   0.101653
has_homebanking_x                      0.099194
has_insurance_21_x                     0.012602
has_insurance_23_x                     0.004785
has_life_insurance_fixed_cap_x        -0.002113
has_life_insurance_decreasing_cap_x    0.092860
has_fire_car_other_insurance_x         0.025371
has_personal_loan_x                    0.061419
has_mortgage_loan_x                    0.092005
has_current_account_x                  0.093452
has_pension_saving_x                   0.006329
has_savings_account_x                 -0.048524
has_savings_account_starter_x          0.017424
has_current_account_starter_x          0.023986
bal_insurance_21_x                     0.010463
bal_insurance_23_x                     0.000897
cap_life_insurance_fixed_cap_x         0.001212
cap_life_insurance_decreasing_cap_x    0.086004
prem_fire_car_other_insurance_x        0.027040
bal_personal_loan_x                    0

In [7]:
#Construct test, train set
X = data.drop('target',axis=1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [8]:
 # summarize class distribution
print(Counter(y_train))
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy=0.5)
# define undersample strategy
undersample = RandomUnderSampler(sampling_strategy=0.5)
# fit and apply the transform
X_over, y_over = oversample.fit_resample(X_train, y_train)
X_under, y_under = undersample.fit_resample(X_train, y_train)
# summarize class distribution
print(Counter(y_over))
print(Counter(y_under))

Counter({0: 49408, 1: 1549})
Counter({0: 49408, 1: 24704})
Counter({0: 3098, 1: 1549})


In [9]:
#Classes to process dates to days and Children status to binary

class InbetweenDays(BaseEstimator):
    
    def __init__(self):
        pass
    
    def fit(self, documents, y=None):
        return self
    
    def transform(self, x_dataset):
        list_dates = ['customer_since_all_x', 'customer_since_bank_x', 'customer_birth_date_x']

        #Convert date columns into datetime format
        x_dataset['base_dt'] = pd.to_datetime('2018-01-01')
        x_dataset[list_dates] = x_dataset[list_dates].apply(pd.to_datetime)

        for col in list_dates:
            x_dataset[col] = abs(x_dataset['base_dt'].dt.year - x_dataset[col].dt.year)

        #Drop columns (base_dt)
        x_dataset = x_dataset.drop('base_dt', axis=1)
    
        return x_dataset
    

class ChildrenStatus(BaseEstimator):
    
    def __init__(self):
        pass
    
    def fit(self, documents, y=None):
        return self
    
    def transform(self, x_dataset):
        for column in ['customer_children', 'customer_children_y', 'customer_children_x']:
            x_dataset[column].replace(['mature', 'young', 'onebaby', 'adolescent', 'preschool', 'grownup'], 'yes', inplace=True)
            x_dataset[column].fillna('no', inplace=True)
    
        return x_dataset

In [10]:
# Features to preprocess

# Features to drop
drop_features = [
    'Unnamed: 0',
    'client_id', #ID not needed in the training data
    'customer_education_x', #Remove education b/c missing > 70%
    'customer_education_y', #
    'customer_education',#
]

# Duplicated Columns to drop
dup_cols = test.T.duplicated().reset_index()
dup_cols_list = dup_cols.loc[dup_cols[0], 'index'].tolist()
drop_features.extend(dup_cols_list)


# Categorical Features to Preprocess
categorical_features = [
    'customer_relationship', 'customer_relationship_y', 'customer_relationship_x', 
    'customer_occupation_code_x',
    'customer_children', 'customer_children_y', 'customer_children_x',
    'customer_gender_x',
#     'customer_postal_code', 'customer_postal_code_y', 'customer_postal_code_x', #error!!! ValueError: Found unknown categories [3471] in column 12 during transform (during testing)
    'customer_self_employed', 'customer_self_employed_y', 'customer_self_employed_x',
]

# Numerical Features to Preprocess
numeric_features = [
    'bal_insurance_21', 'bal_insurance_21_y', 'bal_insurance_21_x', 
    'bal_insurance_23', 'bal_insurance_23_y', 'bal_insurance_23_x',
    'cap_life_insurance_fixed_cap', 'cap_life_insurance_fixed_cap_y', 'cap_life_insurance_fixed_cap_x',
    'cap_life_insurance_decreasing_cap', 'cap_life_insurance_decreasing_cap_y', 'cap_life_insurance_decreasing_cap_x',
    'prem_fire_car_other_insurance', 'prem_fire_car_other_insurance_y', 'prem_fire_car_other_insurance_x',
    'bal_personal_loan', 'bal_personal_loan_y', 'bal_personal_loan_x',
    'bal_mortgage_loan', 'bal_mortgage_loan_y', 'bal_mortgage_loan_x',
    'bal_current_account', 'bal_current_account_y', 'bal_current_account_x',
    'bal_pension_saving', 'bal_pension_saving_y', 'bal_pension_saving_x', 
    'bal_savings_account', 'bal_savings_account_y', 'bal_savings_account_x',
    'bal_current_account_starter', 'bal_current_account_starter_y', 'bal_current_account_starter_x',
    'bal_savings_account_starter', 'bal_savings_account_starter_y', 'bal_savings_account_starter_x',
    'visits_distinct_so', 'visits_distinct_so_y', 'visits_distinct_so_x',
    'visits_distinct_so_areas', 'visits_distinct_so_areas_y', 'visits_distinct_so_areas_x',
    'customer_since_all_x', 'customer_since_bank_x', 'customer_birth_date_x',
]

In [11]:
# List of Duplicated Columns
dup_cols_list

['has_savings_account_starter_y',
 'customer_since_all_y',
 'customer_since_bank_y',
 'customer_gender_y',
 'customer_birth_date_y',
 'customer_postal_code_y',
 'customer_occupation_code_y',
 'customer_education_y',
 'customer_since_all',
 'customer_since_bank',
 'customer_gender',
 'customer_birth_date',
 'customer_postal_code',
 'customer_occupation_code',
 'customer_education']

In [12]:
# Replace NA with column mean, normalize numerical values
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    #('normalize', Normalizer()),
])

# Replace NA with column mode, encode categorical value to 0/1
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

In [13]:
# Pre-process pipeline that drops unnecessary features, transforms numerical and categorical values
preprocessor = ColumnTransformer(remainder='passthrough',
                                 transformers=[
                                     ('drop_columns', 'drop', drop_features),
                                     ('numeric', numeric_transformer, numeric_features),
                                     ('categorical', categorical_transformer, categorical_features)
                                ])

In [14]:
# Pipeline from preprocessing to fitting a model
pipeline = Pipeline(steps=[
    ('inbetween_days', InbetweenDays()),
    ('binarize_children', ChildrenStatus()),
    ('preprocessor', preprocessor),
    ('catboost_weighted', CatBoostClassifier(verbose=False,random_state=0)),#,scale_pos_weight=2)),
])

In [15]:
# pipeline.fit(X_train, y_train)
pipeline.fit(X_over, y_over)
#pipeline.fit(X_under, y_under)

  x_dataset['base_dt'] = pd.to_datetime('2018-01-01')


Pipeline(steps=[('inbetween_days', InbetweenDays()),
                ('binarize_children', ChildrenStatus()),
                ('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('drop_columns', 'drop',
                                                  ['Unnamed: 0', 'client_id',
                                                   'customer_education_x',
                                                   'customer_education_y',
                                                   'customer_education',
                                                   'has_savings_account_starter_y',
                                                   'customer_since_all_y',
                                                   'customer_since_ba...
                                                  ['customer_relationship',
                                                   'customer_relationship_y',
                                    

In [16]:
# With scale_pos_weight=5, minority class gets 5 times more impact and 5 times more correction than errors made on the majority class.
#catboost_5 = CatBoostClassifier(verbose=False,random_state=0,scale_pos_weight=5)

# Evaluate Model
y_pred = pipeline.predict(X_test)

print(f'Accuracy: {round(accuracy_score(y_test, y_pred),4)}')
print(f'Recall: {round(recall_score(y_test, y_pred),4)}')
print(f'ROC_AUC: {round(roc_auc_score(y_test, y_pred),4)}')
print(f'Precision: {round(precision_score(y_test, y_pred),4)}')

Accuracy: 0.946
Recall: 0.1071
ROC_AUC: 0.5389
Precision: 0.097
