In [1]:
# Get the training data
# Clean/preprocess/transform the data
# Train a machine learning model
# Evaluate and optimise the model
# Clean/preprocess/transform new data
# Fit the model on new data to make predictions.

In [2]:
import pandas as pd
import numpy as np
import load_data

from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report,recall_score,confusion_matrix, roc_auc_score, precision_score, f1_score, roc_curve, auc, plot_confusion_matrix,plot_roc_curve
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Import Libraries for Modelling
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# from imblearn.under_sampling import RandomUnderSampler
# from sklearn.pipeline import Pipeline
# from sklearn.base import BaseEstimator
# from sklearn.model_selection import train_test_split
# import imblearn
# from collections import Counter

pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [3]:
#Load Data
mypath = "../data/"
mydata = load_data.get_file_names(mypath)
data_files = load_data.load_copy_data(mydata, mypath)

data = data_files['data_merged']
X_train = data_files['X_train'].copy()
X_val = data_files['X_val'].copy()
y_train = data_files['y_train']['target'].copy()
y_val = data_files['y_val']['target'].copy()

file name: X_train_des
file name: y_test_des
file name: X_test_des
file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: y_train_des
file name: train_month_3_with_target


In [4]:
# Features to preprocess

# Features to drop
drop_features = [
    'Unnamed: 0',
    'client_id', #ID not needed in the training data
    'customer_postal_code_x',
]

# Duplicated Columns to drop
dup_cols = data.T.duplicated().reset_index()
dup_cols_list = dup_cols.loc[dup_cols[0], 'index'].tolist()
drop_features.extend(dup_cols_list)
# dup_cols_list

# Categorical Features to Preprocess
categorical_features = [
    'customer_relationship', 'customer_relationship_y', 'customer_relationship_x', #unknown
    'customer_occupation_code_x',
    'customer_children', 'customer_children_y', 'customer_children_x',
    'customer_gender_x',
    'area_cat',
    'customer_self_employed', 'customer_self_employed_y', 'customer_self_employed_x',
    'visits_distinct_so', 'visits_distinct_so_y', 'visits_distinct_so_x',
    'visits_distinct_so_areas', 'visits_distinct_so_areas_y', 'visits_distinct_so_areas_x',
    'customer_education_x', #
    'customer_relationship_x', 'customer_relationship_y', 'customer_relationship',#
    'customer_children_x', 'customer_children_y', 'customer_children',
    'homebanking_active', 'homebanking_active_y', 'homebanking_active_x',
    'has_homebanking', 'has_homebanking_y', 'has_homebanking_x',
    'has_insurance_21', 'has_insurance_21_y', 'has_insurance_21_x',
    'has_insurance_23', 'has_insurance_23_y', 'has_insurance_23_x',
    'has_life_insurance_fixed_cap', 'has_life_insurance_fixed_cap_y', 'has_life_insurance_fixed_cap_x',
    'has_life_insurance_decreasing_cap', 'has_life_insurance_decreasing_cap_y', 'has_life_insurance_decreasing_cap_x',
    'has_fire_car_other_insurance', 'has_fire_car_other_insurance_y', 'has_fire_car_other_insurance_x',
    'has_personal_loan', 'has_personal_loan_y', 'has_personal_loan_x',
    'has_mortgage_loan', 'has_mortgage_loan_y', 'has_mortgage_loan_x',
    'has_current_account', 'has_current_account_y', 'has_current_account_x',
    'has_pension_saving', 'has_pension_saving_y', 'has_pension_saving_x',
    'has_savings_account', 'has_savings_account_y', 'has_savings_account_x',
    'has_current_account_starter', 'has_current_account_starter_y', 'has_current_account_starter_x',
    'has_savings_account_starter', 'has_savings_account_starter_y', 'has_savings_account_starter_x',
]

# Numerical Features to Preprocess
numeric_features = [
    'bal_insurance_21', 'bal_insurance_21_y', 'bal_insurance_21_x', 
    'bal_insurance_23', 'bal_insurance_23_y', 'bal_insurance_23_x',
    'cap_life_insurance_fixed_cap', 'cap_life_insurance_fixed_cap_y', 'cap_life_insurance_fixed_cap_x',
    'cap_life_insurance_decreasing_cap', 'cap_life_insurance_decreasing_cap_y', 'cap_life_insurance_decreasing_cap_x',
    'prem_fire_car_other_insurance', 'prem_fire_car_other_insurance_y', 'prem_fire_car_other_insurance_x',
    'bal_personal_loan', 'bal_personal_loan_y', 'bal_personal_loan_x',
    'bal_mortgage_loan', 'bal_mortgage_loan_y', 'bal_mortgage_loan_x',
    'bal_current_account', 'bal_current_account_y', 'bal_current_account_x',
    'bal_pension_saving', 'bal_pension_saving_y', 'bal_pension_saving_x', 
    'bal_savings_account', 'bal_savings_account_y', 'bal_savings_account_x',
    'bal_current_account_starter', 'bal_current_account_starter_y', 'bal_current_account_starter_x',
    'bal_savings_account_starter', 'bal_savings_account_starter_y', 'bal_savings_account_starter_x',
    'customer_since_all_x', 'customer_since_bank_x', 'customer_birth_date_x',
]

In [5]:
# Replace NA with column mean, normalize numerical values
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('normalize', Normalizer()),
#     ('scaler', StandardScaler())
])

# Replace NA with column mode, encode categorical value to 0/1
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

# Pre-process pipeline that drops unnecessary features, transforms numerical and categorical values
preprocessor = ColumnTransformer(#remainder='passthrough',
                                 transformers=[
                                     ('drop_columns', 'drop', drop_features),
                                     ('numeric', numeric_transformer, numeric_features),
                                     ('categorical', categorical_transformer, categorical_features)
                                ])

# Setting remainder=’passthrough’ will mean that all columns not specified in the list of “transformers” 
# will be passed through without transformation, instead of being dropped.

In [6]:
# CatBoost 
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
#     ('oversample', SMOTE(sampling_strategy=0.4)),
#     ('undersample', RandomUnderSampler(sampling_strategy=0.8)),
    ('oversample', RandomOverSampler(sampling_strategy=0.5)),
    ('catboost', CatBoostClassifier(verbose=False,random_state=0)),
])

pipeline.fit(X_train, y_train)

# Evaluate Model
y_pred = pipeline.predict(X_val)

# Top 250 Evaluation
print(f'Accuracy: {round(accuracy_score(y_val[:250], y_pred[:250]),4)}')
print(f'Recall: {round(recall_score(y_val[:250], y_pred[:250]),4)}')
print(f'ROC_AUC: {round(roc_auc_score(y_val[:250], y_pred[:250]),4)}')
print(f'Precision: {round(precision_score(y_val[:250], y_pred[:250]),4)}')

Accuracy: 0.968
Recall: 0.2
ROC_AUC: 0.5918
Precision: 0.2


In [7]:
#DecisionTree

pipeline_dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('oversample', SMOTE(sampling_strategy=0.1)),
    ('undersample', RandomUnderSampler(sampling_strategy=0.5)),
    ('DecisionTree', DecisionTreeClassifier()),
])

pipeline_dt.fit(X_train, y_train)

# Evaluate Model
y_pred_dt = pipeline_dt.predict(X_val)

# Top 250 Evaluation
print(f'Accuracy: {round(accuracy_score(y_val[:250], y_pred_dt[:250]),4)}')
print(f'Recall: {round(recall_score(y_val[:250], y_pred_dt[:250]),4)}')
print(f'ROC_AUC: {round(roc_auc_score(y_val[:250], y_pred_dt[:250]),4)}')
print(f'Precision: {round(precision_score(y_val[:250], y_pred_dt[:250]),4)}')

Accuracy: 0.852
Recall: 0.0
ROC_AUC: 0.4347
Precision: 0.0


In [8]:
#Logistic

pipeline_log = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('oversample', SMOTE(sampling_strategy=0.1)),
    ('undersample', RandomUnderSampler(sampling_strategy=0.5)),
    ('Logistic', LogisticRegression(random_state=0)),
])

pipeline_log.fit(X_train, y_train)

# Evaluate Model
y_pred_log = pipeline_log.predict(X_val)

# Top 250 Evaluation
print(f'Accuracy: {round(accuracy_score(y_val[:250], y_pred_log[:250]),4)}')
print(f'Recall: {round(recall_score(y_val[:250], y_pred_log[:250]),4)}')
print(f'ROC_AUC: {round(roc_auc_score(y_val[:250], y_pred_log[:250]),4)}')
print(f'Precision: {round(precision_score(y_val[:250], y_pred_log[:250]),4)}')

Accuracy: 0.904
Recall: 0.2
ROC_AUC: 0.5592
Precision: 0.0476


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
