In [1]:
import pandas as pd
import numpy as np
import load_data

from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report,recall_score,confusion_matrix, roc_auc_score, precision_score, f1_score, roc_curve, auc, plot_confusion_matrix,plot_roc_curve
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Import Libraries for Modelling
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# from file import function
from ipynb.fs.full.Get_Base_Data_00 import Time, merge_df
from ipynb.fs.full.Manual_Preprocess_02 import *

# Raise both pandas display limits to 999 so wide/long frames are shown in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 999)

In [2]:
# Build the merged test set from the three monthly test extracts

# Discover the files under the data directory and load them into a dict
mypath = "../data/"
mydata = load_data.get_file_names(mypath)
data_files = load_data.load_copy_data(mydata, mypath)

# Training split, used further down to fit the model
X_train = data_files['X_train']
y_train = data_files['y_train']

# Outer-merge on client_id: months 1 and 2 first (passing the '_m1'/'_m2'
# suffix pair), then add month 3 to that result.
months_1_2 = merge_df(
    data_files['test_month_1'],
    data_files['test_month_2'],
    'client_id',
    'outer',
    ('_m1', '_m2'),
)
df_merged = merge_df(months_1_2, data_files['test_month_3'], 'client_id', 'outer')

# Write the merged frame next to the other data files
merged_csv = mypath + 'test_merged.csv'
print(f'Export data to {mypath} as test_merged.csv')
df_merged.to_csv(merged_csv, encoding='utf-8', index=False)

file name: train_month_1
file name: data_merged
file name: test_merged
file name: train_month_2
file name: X_test_preprocessed
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target
Shape of dataframe: (27300, 77)
Shape of dataframe: (27300, 115)
Export data to ../data/ as test_merged.csv


In [3]:
# Preprocess the merged test data and export it as X_test_preprocessed.csv.

# Re-scan the data directory AND reload the files. The `data_files` dict built
# in the load/merge cell was read before test_merged.csv was exported, so
# without this reload `data_files['test_merged']` would be a stale copy from a
# previous run (or raise KeyError on a first run).
mydata = load_data.get_file_names(mypath)
data_files = load_data.load_copy_data(mydata, mypath)
X_test = data_files['test_merged'].copy()

# 1. Convert the month-1 date columns to elapsed time relative to base_date.
#    NOTE(review): the displayed values (e.g. 1981-01 -> 37.0) look like years
#    rather than days, despite the helper's name -- confirm in Manual_Preprocess_02.
list_dates = ['customer_since_all_m1', 'customer_since_bank_m1', 'customer_birth_date_m1']
X_test = dates_to_days(X_test, list_dates, base_date='2018-01-01')

# 2. Bin the postal/area code into 1000-wide categories (adds `area_cat`)
X_test = bin_area_code(X_test)

# 3. Add balance deltas between time points 1->2, 2->3 and 1->3.
#    Each inner list is [month 1, month 2, month 3] for one balance; the
#    resulting `<balance>_1/_2/_3` columns hold absolute differences
#    (e.g. 800 -> 1260 -> 1590 gives 460, 330, 790), not percentage changes.
list_balances = [
    ['bal_insurance_21_m1', 'bal_insurance_21_m2', 'bal_insurance_21'],
    ['bal_insurance_23_m1', 'bal_insurance_23_m2', 'bal_insurance_23'],
    ['bal_personal_loan_m1', 'bal_personal_loan_m2', 'bal_personal_loan'],
    ['bal_mortgage_loan_m1', 'bal_mortgage_loan_m2', 'bal_mortgage_loan'],
    ['bal_current_account_m1', 'bal_current_account_m2', 'bal_current_account'],
    ['bal_pension_saving_m1', 'bal_pension_saving_m2', 'bal_pension_saving'],
    ['bal_savings_account_m1', 'bal_savings_account_m2', 'bal_savings_account'],
]
X_test = get_differences(X_test, list_balances)

# 4. Turn missing values in these categorical columns into their own category
col_list = [
    'customer_education_m1',
    'customer_children_m1', 'customer_children_m2', 'customer_children',
    'customer_relationship_m1', 'customer_relationship_m2', 'customer_relationship',
]
X_test = categorize_na(X_test, col_list)

# 5. Drop the month-1 postal code column (it is not used as a model feature)
X_test = X_test.drop(columns=['customer_postal_code_m1'])

# Export the preprocessed test data
X_test.to_csv(mypath + 'X_test_preprocessed.csv', encoding='utf-8', index=False)

file name: train_month_1
file name: data_merged
file name: test_merged
file name: train_month_2
file name: X_test_preprocessed
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


In [4]:
# 3. Reload the data directory so the freshly exported preprocessed test set
#    is available, then work on a copy of it.
mydata = load_data.get_file_names(mypath)
data_files = load_data.load_copy_data(mydata, mypath)
preprocessed_test = data_files['X_test_preprocessed']
test_data = preprocessed_test.copy()

file name: train_month_1
file name: data_merged
file name: test_merged
file name: train_month_2
file name: X_test_preprocessed
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


In [5]:
# Preview the first five rows of the preprocessed test data
test_data.head(n=5)

Unnamed: 0,client_id,homebanking_active_m1,has_homebanking_m1,has_insurance_21_m1,has_insurance_23_m1,has_life_insurance_fixed_cap_m1,has_life_insurance_decreasing_cap_m1,has_fire_car_other_insurance_m1,has_personal_loan_m1,has_mortgage_loan_m1,has_current_account_m1,has_pension_saving_m1,has_savings_account_m1,has_savings_account_starter_m1,has_current_account_starter_m1,bal_insurance_21_m1,bal_insurance_23_m1,cap_life_insurance_fixed_cap_m1,cap_life_insurance_decreasing_cap_m1,prem_fire_car_other_insurance_m1,bal_personal_loan_m1,bal_mortgage_loan_m1,bal_current_account_m1,bal_pension_saving_m1,bal_savings_account_m1,bal_savings_account_starter_m1,bal_current_account_starter_m1,visits_distinct_so_m1,visits_distinct_so_areas_m1,customer_since_all_m1,customer_since_bank_m1,customer_gender_m1,customer_birth_date_m1,customer_occupation_code_m1,customer_self_employed_m1,customer_education_m1,customer_children_m1,customer_relationship_m1,homebanking_active_m2,has_homebanking_m2,has_insurance_21_m2,has_insurance_23_m2,has_life_insurance_fixed_cap_m2,has_life_insurance_decreasing_cap_m2,has_fire_car_other_insurance_m2,has_personal_loan_m2,has_mortgage_loan_m2,has_current_account_m2,has_pension_saving_m2,has_savings_account_m2,has_savings_account_starter_m2,has_current_account_starter_m2,bal_insurance_21_m2,bal_insurance_23_m2,cap_life_insurance_fixed_cap_m2,cap_life_insurance_decreasing_cap_m2,prem_fire_car_other_insurance_m2,bal_personal_loan_m2,bal_mortgage_loan_m2,bal_current_account_m2,bal_pension_saving_m2,bal_savings_account_m2,bal_savings_account_starter_m2,bal_current_account_starter_m2,visits_distinct_so_m2,visits_distinct_so_areas_m2,customer_since_all_m2,customer_since_bank_m2,customer_gender_m2,customer_birth_date_m2,customer_postal_code_m2,customer_occupation_code_m2,customer_self_employed_m2,customer_education_m2,customer_children_m2,customer_relationship_m2,homebanking_active,has_homebanking,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has
_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_since_all,customer_since_bank,customer_gender,customer_birth_date,customer_postal_code,customer_occupation_code,customer_self_employed,customer_education,customer_children,customer_relationship,area_cat,bal_insurance_21_1,bal_insurance_21_2,bal_insurance_21_3,bal_insurance_23_1,bal_insurance_23_2,bal_insurance_23_3,bal_personal_loan_1,bal_personal_loan_2,bal_personal_loan_3,bal_mortgage_loan_1,bal_mortgage_loan_2,bal_mortgage_loan_3,bal_current_account_1,bal_current_account_2,bal_current_account_3,bal_pension_saving_1,bal_pension_saving_2,bal_pension_saving_3,bal_savings_account_1,bal_savings_account_2,bal_savings_account_3
0,ccf4cd93d5c32cd8a59809d54b4d53ac,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,800,0,6450,0,0,1.0,1.0,37.0,37.0,2,81,9.0,0,5.0,Unknown,single,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1260,0,5190,0,0,1.0,1.0,1981-01,1981-01,2,1937-01,9860,9.0,0,5.0,Unknown,single,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1590,0,5190,0,0,1.0,1.0,1981-01,1981-01,2,1937-01,9860,9.0,0,5.0,Unknown,single,9000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,460,330,790,0,0,0,-1260,0,-1260
1,56605a660d18549592653ff6941186f1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,310,0,0,30,0,15010,0,0,1.0,1.0,25.0,25.0,2,77,9.0,0,Unknown,Unknown,single,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,310,0,0,30,0,15010,0,0,1.0,1.0,1993-03,1993-03,2,1941-04,2491,9.0,0,,Unknown,single,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,310,0,0,0,0,15010,0,0,1.0,1.0,1993-03,1993-03,2,1941-04,2491,9.0,0,,Unknown,single,2000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,-30,-30,0,0,0,0,0,0
2,bda5f84c05e5695a7ec10550b457890f,0,0,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,87500,1240,0,154570,3300,0,3700,0,0,1.0,1.0,25.0,13.0,1,49,9.0,0,3.0,mature,couple,0,0,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,87500,1250,0,153920,3290,0,3700,0,0,1.0,1.0,1993-08,2005-06,1,1969-04,1770,9.0,0,3.0,mature,couple,0,0,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,87500,1250,0,153260,4200,0,3700,0,0,1.0,1.0,1993-08,2005-06,1,1969-04,1770,9.0,0,3.0,mature,couple,1000_area_code,0,0,0,0,0,0,0,0,0,-650,-660,-1310,-10,910,900,0,0,0,0,0,0
3,a2f1c04bc3acf2222e658a897400798f,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,110,0,0,3040,0,27500,0,0,1.0,1.0,40.0,9.0,1,66,9.0,0,Unknown,no,couple,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,110,0,0,3520,0,27500,0,0,1.0,1.0,1978-08,2009-01,1,1952-09,7750,9.0,0,,no,couple,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,110,0,0,3960,0,27500,0,0,1.0,1.0,1978-08,2009-01,1,1952-09,7750,9.0,0,,no,couple,7000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,480,440,920,0,0,0,0,0,0
4,e83aadc3b0d25dbc12a35551afa25807,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,10190,0,0,1.0,1.0,4.0,4.0,2,56,9.0,1,Unknown,grownup,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,10190,0,0,1.0,1.0,2014-04,2014-04,2,1962-08,2270,9.0,1,,grownup,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,10190,0,0,1.0,1.0,2014-04,2014-04,2,1962-08,2270,9.0,1,,grownup,couple,2000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# 4. Fit the model on the training split and predict on the test data

# Single seed shared by every stochastic step so that re-running the notebook
# reproduces the same predictions.
RANDOM_STATE = 0

# Categorical features: imputed with the most frequent value, then ordinal-encoded.
# Columns not listed in either feature list are dropped by the ColumnTransformer
# (its default `remainder` behaviour).
# NOTE(review): 'has_savings_account_starter_m2' is absent although every other
# has_* flag lists all three months -- confirm whether this is intentional.
# The customer_postal_code_* columns are left out on purpose (reported to cause
# an error).
categorical_features = [
    'homebanking_active_m1', 'homebanking_active_m2', 'homebanking_active',
    'has_homebanking_m1', 'has_homebanking_m2', 'has_homebanking',
    'has_insurance_21_m1', 'has_insurance_21_m2', 'has_insurance_21',
    'has_insurance_23_m1', 'has_insurance_23_m2', 'has_insurance_23',
    'has_life_insurance_fixed_cap_m1', 'has_life_insurance_fixed_cap_m2', 'has_life_insurance_fixed_cap',
    'has_life_insurance_decreasing_cap_m1', 'has_life_insurance_decreasing_cap_m2', 'has_life_insurance_decreasing_cap',
    'has_fire_car_other_insurance_m1', 'has_fire_car_other_insurance_m2', 'has_fire_car_other_insurance',
    'has_personal_loan_m1', 'has_personal_loan_m2', 'has_personal_loan',
    'has_mortgage_loan_m1', 'has_mortgage_loan_m2', 'has_mortgage_loan',
    'has_current_account_m1', 'has_current_account_m2', 'has_current_account',
    'has_pension_saving_m1', 'has_pension_saving_m2', 'has_pension_saving',
    'has_savings_account_m1', 'has_savings_account_m2', 'has_savings_account',
    'has_savings_account_starter_m1', 'has_savings_account_starter',
    'has_current_account_starter_m1', 'has_current_account_starter_m2', 'has_current_account_starter',
    'visits_distinct_so_m1', 'visits_distinct_so_m2', 'visits_distinct_so',
    'visits_distinct_so_areas_m1', 'visits_distinct_so_areas_m2', 'visits_distinct_so_areas',
    'customer_gender_m1',
    'customer_occupation_code_m1',
    'customer_self_employed_m1', 'customer_self_employed_m2', 'customer_self_employed',
    'customer_education_m1',
    'customer_children_m1', 'customer_children_m2', 'customer_children',
    'customer_relationship_m1', 'customer_relationship_m2', 'customer_relationship',
    'area_cat',
]

# Numerical features: mean-imputed, then passed through Normalizer.
numeric_features = [
    'bal_insurance_21_m1', 'bal_insurance_21_m2', 'bal_insurance_21',
    'bal_insurance_23_m1', 'bal_insurance_23_m2', 'bal_insurance_23',
    'cap_life_insurance_fixed_cap_m1', 'cap_life_insurance_fixed_cap_m2', 'cap_life_insurance_fixed_cap',
    'cap_life_insurance_decreasing_cap_m1', 'cap_life_insurance_decreasing_cap_m2', 'cap_life_insurance_decreasing_cap',
    'prem_fire_car_other_insurance_m1', 'prem_fire_car_other_insurance_m2', 'prem_fire_car_other_insurance',
    'bal_personal_loan_m1', 'bal_personal_loan_m2', 'bal_personal_loan',
    'bal_mortgage_loan_m1', 'bal_mortgage_loan_m2', 'bal_mortgage_loan',
    'bal_current_account_m1', 'bal_current_account_m2', 'bal_current_account',
    'bal_pension_saving_m1', 'bal_pension_saving_m2', 'bal_pension_saving',
    'bal_savings_account_m1', 'bal_savings_account_m2', 'bal_savings_account',
    'bal_savings_account_starter_m1', 'bal_savings_account_starter_m2', 'bal_savings_account_starter',
    'bal_current_account_starter_m1', 'bal_current_account_starter_m2', 'bal_current_account_starter',
    'customer_since_all_m1',
    'customer_since_bank_m1',
    'customer_birth_date_m1',
]

# NOTE(review): sklearn's Normalizer rescales each *row* to unit norm; it does
# not standardise each feature. Kept as-is to leave the model unchanged, but
# confirm this is what was intended (StandardScaler is the per-feature scaler).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('normalize', Normalizer()),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
])

# CatBoost on a randomly over-sampled training set.
# The over-sampler is seeded as well: previously only CatBoost had a fixed
# random_state, so the resampled rows (and therefore the predictions) changed
# from run to run.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('oversample', RandomOverSampler(sampling_strategy=0.8, random_state=RANDOM_STATE)),
    ('catboost', CatBoostClassifier(verbose=False, random_state=RANDOM_STATE)),
])

pipeline.fit(X_train, y_train)

# Per-class probabilities for every test row
y_pred = pipeline.predict_proba(test_data)

In [7]:
# Assemble the ID / PROB table: one row per client, with the probability taken
# from the second column of the predict_proba output.
ids = test_data['client_id'].to_list()
prob = [class_probs[1] for class_probs in y_pred]
test_prediction = pd.DataFrame({'ID': ids, 'PROB': prob})

test_prediction.head()

Unnamed: 0,ID,PROB
0,ccf4cd93d5c32cd8a59809d54b4d53ac,0.093436
1,56605a660d18549592653ff6941186f1,0.139643
2,bda5f84c05e5695a7ec10550b457890f,0.740022
3,a2f1c04bc3acf2222e658a897400798f,0.142468
4,e83aadc3b0d25dbc12a35551afa25807,0.138434


In [8]:
# Save the prediction table as a CSV file in the data directory
prediction_csv = mypath + 'group_22_v01.csv'
test_prediction.to_csv(prediction_csv, index=False, encoding='utf-8')