In [8]:
import pandas as pd
import numpy as np
import load_data

from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report,recall_score,confusion_matrix, roc_auc_score, precision_score, f1_score, roc_curve, auc, plot_confusion_matrix,plot_roc_curve
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Import Libraries for Modelling
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Raise pandas' row and column display limits to 999 so wide/long frames
# are rendered without truncation in cell output.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 999)

In [10]:
# Predict using test data
# Load everything the project loader finds under the data directory, then pull
# out the three monthly test extracts and the training features/labels.
mypath = "../data/"
mydata = load_data.get_file_names(mypath)
data_files = load_data.load_copy_data(mydata, mypath)

test_1, test_2, test_3 = (
    data_files[key] for key in ('test_month_1', 'test_month_2', 'test_month_3')
)
X_train, y_train = data_files['X_train'], data_files['y_train']

file name: train_month_1
file name: data_merged
file name: test_merged
file name: train_month_2
file name: X_test_preprocessed
file name: client_ids
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


In [4]:
# 1. Merge the three monthly test frames into one wide frame keyed on client_id.
#    Outer joins keep clients that are absent from any single month.
#    NOTE(review): the recorded run printed (27300, 115); an earlier expectation
#    of 118 columns / 63697 rows does not match that output — confirm the
#    intended shape.
df_merge = pd.merge(test_1, test_2, how='outer', on=['client_id'])
df_merge_2 = pd.merge(df_merge, test_3, how='outer', on=['client_id'])

print(f"Shape of dataframe: {df_merge_2.shape}")

# Export merged data
df_merge_2.to_csv(mypath + 'test_merged.csv', encoding='utf-8', index=False)

Shape of dataframe: (27300, 115)


In [5]:
# 3. load preprocessed test data
# Re-scan the data directory, reload all files, and take an independent copy
# of the 'X_test_preprocessed' frame for scoring.
# NOTE(review): no cell in this notebook creates X_test_preprocessed (there is
# no step 2 here) — presumably it is produced elsewhere from test_merged.csv;
# confirm before running from a fresh kernel.
mydata = load_data.get_file_names(mypath)
data_files = load_data.load_copy_data(mydata, mypath)
test_data = data_files['X_test_preprocessed'].copy()

file name: train_month_1
file name: data_merged
file name: test_merged
file name: train_month_2
file name: X_test_preprocessed
file name: client_ids
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


In [34]:
# 4. Predict

# Categorical Features to Preprocess
# Columns routed to the categorical branch (most-frequent imputation followed
# by ordinal encoding). Most names appear in three variants — '_x', '_y' and
# unsuffixed — while some customer_* columns are listed in '_x' form only.
categorical_features = [
    'homebanking_active_x', 'homebanking_active_y', 'homebanking_active',
    'has_homebanking_x', 'has_homebanking_y', 'has_homebanking',
    'has_insurance_21_x', 'has_insurance_21_y', 'has_insurance_21',
    'has_insurance_23_x', 'has_insurance_23_y', 'has_insurance_23',
    'has_life_insurance_fixed_cap_x', 'has_life_insurance_fixed_cap_y', 'has_life_insurance_fixed_cap',
    'has_life_insurance_decreasing_cap_x', 'has_life_insurance_decreasing_cap_y', 'has_life_insurance_decreasing_cap',
    'has_fire_car_other_insurance_x', 'has_fire_car_other_insurance_y', 'has_fire_car_other_insurance',
    'has_personal_loan_x', 'has_personal_loan_y', 'has_personal_loan',
    'has_mortgage_loan_x', 'has_mortgage_loan_y', 'has_mortgage_loan',
    'has_current_account_x', 'has_current_account_y', 'has_current_account',
    'has_pension_saving_x', 'has_pension_saving_y', 'has_pension_saving',
    'has_savings_account_x', 'has_savings_account_y', 'has_savings_account',
    # NOTE(review): 'has_savings_account_starter_y' is not listed, unlike every
    # other has_* flag here. With remainder='passthrough' that column would
    # bypass imputation/encoding — confirm whether the omission is intentional.
    'has_savings_account_starter_x', 'has_savings_account_starter',
    'has_current_account_starter_x', 'has_current_account_starter_y', 'has_current_account_starter',
    'visits_distinct_so_x', 'visits_distinct_so_y', 'visits_distinct_so',
    'visits_distinct_so_areas_x', 'visits_distinct_so_areas_y', 'visits_distinct_so_areas',
    'customer_gender_x',
    'customer_occupation_code_x',
    'customer_self_employed_x', 'customer_self_employed_y', 'customer_self_employed',
    'customer_education_x',
    'customer_children_x', 'customer_children_y', 'customer_children',
    'customer_relationship_x', 'customer_relationship_y', 'customer_relationship',
    'area_cat',
]

# Numerical Features to Preprocess
# Each base name below is listed in three variants, in this order:
# '<base>_x', '<base>_y', '<base>'.
_monthly_numeric_bases = (
    'bal_insurance_21',
    'bal_insurance_23',
    'cap_life_insurance_fixed_cap',
    'cap_life_insurance_decreasing_cap',
    'prem_fire_car_other_insurance',
    'bal_personal_loan',
    'bal_mortgage_loan',
    'bal_current_account',
    'bal_pension_saving',
    'bal_savings_account',
    'bal_savings_account_starter',
    'bal_current_account_starter',
)
numeric_features = [
    base + suffix
    for base in _monthly_numeric_bases
    for suffix in ('_x', '_y', '')
]
# These three customer_* columns are listed in their '_x' form only.
numeric_features += [
    'customer_since_all_x',
    'customer_since_bank_x',
    'customer_birth_date_x',
]

# Numeric branch: fill missing values with the column mean, then apply Normalizer.
# NOTE(review): scikit-learn's Normalizer rescales each row (sample) to unit
# norm rather than scaling each column — confirm this is intended.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('normalize', Normalizer()),
])

# Categorical branch: fill missing values with the most frequent value, then
# map categories to integer codes.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
])

# Send each feature list through its branch; columns in neither list are
# passed through unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    [
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

# CatBoost
# End-to-end model: preprocess the columns, randomly oversample the training
# data (sampling_strategy=0.8), then fit a CatBoost classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # random_state pins the random oversampling so that repeated runs train on
    # the same resampled data; the classifier below was already seeded, but the
    # sampler was not, which made the exported probabilities vary between runs.
    ('oversample', RandomOverSampler(sampling_strategy=0.8, random_state=0)),
    ('catboost', CatBoostClassifier(verbose=False, random_state=0)),
])

# Fit on the training split loaded earlier.
pipeline.fit(X_train, y_train)

# Class probabilities for the preprocessed test frame; later cells read
# column 1 of this array as the submitted probability.
y_pred = pipeline.predict_proba(test_data)

In [36]:
# Preview the first rows of the preprocessed test frame.
test_data.head()

Unnamed: 0,client_id,homebanking_active_x,has_homebanking_x,has_insurance_21_x,has_insurance_23_x,has_life_insurance_fixed_cap_x,has_life_insurance_decreasing_cap_x,has_fire_car_other_insurance_x,has_personal_loan_x,has_mortgage_loan_x,has_current_account_x,has_pension_saving_x,has_savings_account_x,has_savings_account_starter_x,has_current_account_starter_x,bal_insurance_21_x,bal_insurance_23_x,cap_life_insurance_fixed_cap_x,cap_life_insurance_decreasing_cap_x,prem_fire_car_other_insurance_x,bal_personal_loan_x,bal_mortgage_loan_x,bal_current_account_x,bal_pension_saving_x,bal_savings_account_x,bal_savings_account_starter_x,bal_current_account_starter_x,visits_distinct_so_x,visits_distinct_so_areas_x,customer_since_all_x,customer_since_bank_x,customer_gender_x,customer_birth_date_x,customer_postal_code_x,customer_occupation_code_x,customer_self_employed_x,customer_education_x,customer_children_x,customer_relationship_x,homebanking_active_y,has_homebanking_y,has_insurance_21_y,has_insurance_23_y,has_life_insurance_fixed_cap_y,has_life_insurance_decreasing_cap_y,has_fire_car_other_insurance_y,has_personal_loan_y,has_mortgage_loan_y,has_current_account_y,has_pension_saving_y,has_savings_account_y,has_savings_account_starter_y,has_current_account_starter_y,bal_insurance_21_y,bal_insurance_23_y,cap_life_insurance_fixed_cap_y,cap_life_insurance_decreasing_cap_y,prem_fire_car_other_insurance_y,bal_personal_loan_y,bal_mortgage_loan_y,bal_current_account_y,bal_pension_saving_y,bal_savings_account_y,bal_savings_account_starter_y,bal_current_account_starter_y,visits_distinct_so_y,visits_distinct_so_areas_y,customer_since_all_y,customer_since_bank_y,customer_gender_y,customer_birth_date_y,customer_postal_code_y,customer_occupation_code_y,customer_self_employed_y,customer_education_y,customer_children_y,customer_relationship_y,homebanking_active,has_homebanking,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_in
surance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_since_all,customer_since_bank,customer_gender,customer_birth_date,customer_postal_code,customer_occupation_code,customer_self_employed,customer_education,customer_children,customer_relationship,area_cat,bal_insurance_21_1,bal_insurance_21_2,bal_insurance_21_3,bal_insurance_23_1,bal_insurance_23_2,bal_insurance_23_3,bal_personal_loan_1,bal_personal_loan_2,bal_personal_loan_3,bal_mortgage_loan_1,bal_mortgage_loan_2,bal_mortgage_loan_3,bal_current_account_1,bal_current_account_2,bal_current_account_3,bal_pension_saving_1,bal_pension_saving_2,bal_pension_saving_3,bal_savings_account_1,bal_savings_account_2,bal_savings_account_3
0,ccf4cd93d5c32cd8a59809d54b4d53ac,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,800,0,6450,0,0,1.0,1.0,37.0,37.0,2,81,9860,9.0,0,5.0,Unknown,single,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1260,0,5190,0,0,1.0,1.0,1981-01,1981-01,2,1937-01,9860,9.0,0,5.0,Unknown,single,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1590,0,5190,0,0,1.0,1.0,1981-01,1981-01,2,1937-01,9860,9.0,0,5.0,Unknown,single,9000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,460,330,790,0,0,0,-1260,0,-1260
1,56605a660d18549592653ff6941186f1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,310,0,0,30,0,15010,0,0,1.0,1.0,25.0,25.0,2,77,2491,9.0,0,Unknown,Unknown,single,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,310,0,0,30,0,15010,0,0,1.0,1.0,1993-03,1993-03,2,1941-04,2491,9.0,0,,Unknown,single,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,310,0,0,0,0,15010,0,0,1.0,1.0,1993-03,1993-03,2,1941-04,2491,9.0,0,,Unknown,single,2000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,-30,-30,0,0,0,0,0,0
2,bda5f84c05e5695a7ec10550b457890f,0,0,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,87500,1240,0,154570,3300,0,3700,0,0,1.0,1.0,25.0,13.0,1,49,1770,9.0,0,3.0,mature,couple,0,0,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,87500,1250,0,153920,3290,0,3700,0,0,1.0,1.0,1993-08,2005-06,1,1969-04,1770,9.0,0,3.0,mature,couple,0,0,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,87500,1250,0,153260,4200,0,3700,0,0,1.0,1.0,1993-08,2005-06,1,1969-04,1770,9.0,0,3.0,mature,couple,1000_area_code,0,0,0,0,0,0,0,0,0,-650,-660,-1310,-10,910,900,0,0,0,0,0,0
3,a2f1c04bc3acf2222e658a897400798f,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,110,0,0,3040,0,27500,0,0,1.0,1.0,40.0,9.0,1,66,7750,9.0,0,Unknown,no,couple,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,110,0,0,3520,0,27500,0,0,1.0,1.0,1978-08,2009-01,1,1952-09,7750,9.0,0,,no,couple,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,110,0,0,3960,0,27500,0,0,1.0,1.0,1978-08,2009-01,1,1952-09,7750,9.0,0,,no,couple,7000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,480,440,920,0,0,0,0,0,0
4,e83aadc3b0d25dbc12a35551afa25807,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,10190,0,0,1.0,1.0,4.0,4.0,2,56,2270,9.0,1,Unknown,grownup,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,10190,0,0,1.0,1.0,2014-04,2014-04,2,1962-08,2270,9.0,1,,grownup,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,10190,0,0,1.0,1.0,2014-04,2014-04,2,1962-08,2270,9.0,1,,grownup,couple,2000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [35]:
# Display the array returned by predict_proba (two probability columns per row
# in the recorded output).
y_pred

array([[0.92108669, 0.07891331],
       [0.86309771, 0.13690229],
       [0.36851214, 0.63148786],
       ...,
       [0.968255  , 0.031745  ],
       [0.91534558, 0.08465442],
       [0.94891368, 0.05108632]])

In [37]:
# Assemble the submission frame: one row per test client, holding the client id
# ('ID') and the value from column 1 of the predict_proba output ('PROB').
ids = test_data['client_id'].to_list()
prob = list(y_pred[:, 1])

test_prediction = pd.DataFrame({'ID': ids, 'PROB': prob})
test_prediction.head()

Unnamed: 0,ID,PROB
0,ccf4cd93d5c32cd8a59809d54b4d53ac,0.078913
1,56605a660d18549592653ff6941186f1,0.136902
2,bda5f84c05e5695a7ec10550b457890f,0.631488
3,a2f1c04bc3acf2222e658a897400798f,0.118954
4,e83aadc3b0d25dbc12a35551afa25807,0.176451


In [38]:
# Export prediction data
# Writes the ID/PROB frame as a UTF-8 CSV, without the index column, to
# 'group_22.csv' under the data directory held in mypath.

test_prediction.to_csv(mypath + 'group_22.csv', encoding='utf-8', index=False)