In [1]:
# Get the training data
# Clean/preprocess/transform the data
# Train a machine learning model
# Evaluate and optimise the model
# Clean/preprocess/transform new data
# Fit the model on new data to make predictions.

In [2]:
import pandas as pd
import numpy as np
import load_data
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from catboost import CatBoostClassifier

pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [3]:
#Load Data
mypath = "../data/"
mydata = load_data.get_file_names(mypath)
data = load_data.load_copy_data(mydata, mypath)

data = data['data_merged']

file name: train_month_1
file name: data_merged
file name: train_month_2
file name: test_month_1
file name: test_month_3
file name: test_month_2
file name: train_month_3_with_target


In [4]:
#data.dtypes
# data.isnull().sum()
load_data.calc_missing(data)

Unnamed: 0,Missing_Number,Missing_Percent
customer_education_y,47125,0.739831
customer_education,47125,0.739831
customer_education_x,47125,0.739831
customer_children,23364,0.366799
customer_children_y,23065,0.362105
customer_children_x,23056,0.361964
customer_relationship,14899,0.233904
customer_relationship_y,14476,0.227263
customer_relationship_x,14456,0.226949
customer_occupation_code_y,2002,0.03143


In [5]:
#Data preprocess that can be done on the whole dataset

#Drop customer_education, customer_education_x, customer_education_y
df = data.drop(['customer_education_y', 'customer_education', 'customer_education_x'], axis=1)

#Manipulate dates to represent number of years
list_dates = ['customer_since_all', 'customer_since_all_x', 'customer_since_all_y', 
            'customer_since_bank', 'customer_since_bank_x', 'customer_since_bank_y',
            'customer_birth_date', 'customer_birth_date_x', 'customer_birth_date_y']

#Convert date columns into datetime format
df['base_dt'] = pd.to_datetime('2018-01-01')
df[list_dates] = df[list_dates].apply(pd.to_datetime)

for col in list_dates:
    df[col] = (df['base_dt'] - df[col]).dt.days

#Drop columns (base_dt)
df = df.drop('base_dt', axis=1)

#Manipulate customer children to have category yes or no [nan, 'yes', 'no', 'mature', 'young', 'onebaby', 'adolescent', 'preschool', 'grownup']
#train.customer_children_x.unique()
for column in ['customer_children', 'customer_children_y', 'customer_children_x']:
   df[column].replace(['mature', 'young', 'onebaby', 'adolescent', 'preschool', 'grownup'], 'yes', inplace=True)
   df[column].fillna('no', inplace=True)
    
#Customer Children should only have categories "yes" or "no"
# df.customer_children_x.unique()

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,client_id,homebanking_active_x,has_homebanking_x,has_insurance_21_x,has_insurance_23_x,has_life_insurance_fixed_cap_x,has_life_insurance_decreasing_cap_x,has_fire_car_other_insurance_x,has_personal_loan_x,has_mortgage_loan_x,has_current_account_x,has_pension_saving_x,has_savings_account_x,has_savings_account_starter_x,has_current_account_starter_x,bal_insurance_21_x,bal_insurance_23_x,cap_life_insurance_fixed_cap_x,cap_life_insurance_decreasing_cap_x,prem_fire_car_other_insurance_x,bal_personal_loan_x,bal_mortgage_loan_x,bal_current_account_x,bal_pension_saving_x,bal_savings_account_x,bal_savings_account_starter_x,bal_current_account_starter_x,visits_distinct_so_x,visits_distinct_so_areas_x,customer_since_all_x,customer_since_bank_x,customer_gender_x,customer_birth_date_x,customer_postal_code_x,customer_occupation_code_x,customer_self_employed_x,customer_children_x,customer_relationship_x,homebanking_active_y,has_homebanking_y,has_insurance_21_y,has_insurance_23_y,has_life_insurance_fixed_cap_y,has_life_insurance_decreasing_cap_y,has_fire_car_other_insurance_y,has_personal_loan_y,has_mortgage_loan_y,has_current_account_y,has_pension_saving_y,has_savings_account_y,has_savings_account_starter_y,has_current_account_starter_y,bal_insurance_21_y,bal_insurance_23_y,cap_life_insurance_fixed_cap_y,cap_life_insurance_decreasing_cap_y,prem_fire_car_other_insurance_y,bal_personal_loan_y,bal_mortgage_loan_y,bal_current_account_y,bal_pension_saving_y,bal_savings_account_y,bal_savings_account_starter_y,bal_current_account_starter_y,visits_distinct_so_y,visits_distinct_so_areas_y,customer_since_all_y,customer_since_bank_y,customer_gender_y,customer_birth_date_y,customer_postal_code_y,customer_occupation_code_y,customer_self_employed_y,customer_children_y,customer_relationship_y,homebanking_active,has_homebanking,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_since_all,customer_since_bank,customer_gender,customer_birth_date,customer_postal_code,customer_occupation_code,customer_self_employed,customer_children,customer_relationship,target
0,0,910df42ad36243aa4ce16324cd7b15b0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,20,0,0,960,0,20000,0,0,1.0,1.0,12725.0,8554.0,1,27151,3630,9.0,0,no,,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,20,0,0,2110,0,20000,0,0,1.0,1.0,12725.0,8554.0,1,27151,3630,9.0,0,no,,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,20,0,0,590,0,22000,0,0,1.0,1.0,12725.0,8554.0,1,27151,3630,9.0,0,no,,0
1,1,4e19dc3a54323c5bbfc374664b950cd1,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,380,0,10290,0,0,1.0,1.0,365.0,365.0,1,8735,2460,9.0,0,yes,couple,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1060,0,10290,0,0,1.0,1.0,365.0,365.0,1,8735,2460,9.0,0,yes,couple,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,940,0,10570,0,0,1.0,1.0,365.0,365.0,1,8735,2460,9.0,0,yes,couple,0
2,2,f5d08db1b86c0cb0f566bf446cff1fb4,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,320,0,0,790,0,16640,0,0,1.0,1.0,13545.0,13545.0,2,29677,2660,9.0,0,no,single,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,320,0,0,1220,0,16000,0,0,1.0,1.0,13545.0,13545.0,2,29677,2660,9.0,0,no,single,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,320,0,0,1210,0,15200,0,0,1.0,1.0,13545.0,13545.0,2,29677,2660,9.0,0,no,single,0
3,3,26170ecf63653e215c52f4262c1c4859,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,160,0,0,0,0,29020,0,0,1.0,1.0,7093.0,1553.0,1,26055,6600,9.0,0,no,,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,160,0,0,0,0,29020,0,0,1.0,1.0,7093.0,1553.0,1,26055,6600,9.0,0,no,,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,160,0,0,0,0,29020,0,0,1.0,1.0,7093.0,1553.0,1,26055,6600,9.0,0,no,,0
4,4,c078009957dffb64f20e61b41220a976,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,13450,0,0,1.0,1.0,1887.0,1887.0,2,7945,8550,9.0,0,yes,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,13550,0,0,1.0,1.0,1887.0,1887.0,2,7945,8550,9.0,0,yes,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,13650,0,0,1.0,1.0,1887.0,1887.0,2,7945,8550,9.0,0,yes,couple,1


In [6]:
#Construct test, train set
X = df.drop('target',axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [7]:
# numeric_transformer = Pipeline(steps=[
#        ('imputer', SimpleImputer(strategy='mean'))
#       ,('scaler', StandardScaler())
# ])

categorical_transformer = Pipeline(steps=[
    ('imputer_mode', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

In [13]:
from sklearn.preprocessing import Normalizer

In [16]:
categorical_features = ['customer_relationship', 'customer_relationship_y', 'customer_relationship_x',
                        'customer_occupation_code', 'customer_occupation_code_y', 'customer_occupation_code_x',
                        'customer_since_bank', 'customer_since_bank_y', 'customer_since_bank_x',
                        'customer_since_all', 'customer_since_all_y', 'customer_since_all_x']

preprocessor = ColumnTransformer(
   transformers=[
#     ('numeric', numeric_transformer, numeric_features),
       ('categorical', categorical_transformer, categorical_features)
]) 

catboostclassifier = CatBoostClassifier(verbose=False,random_state=0,scale_pos_weight=5)

# from sklearn.ensemble import RandomForestRegressor
pipe = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('normalizer', Normalizer()),
    ('catboostclassifier', catboostclassifier)
    ])

In [None]:
# pipeline.fit(X_train)
# pipeline.transform(X_train)

# np.unique(test)



In [18]:
X_train

Unnamed: 0.1,Unnamed: 0,client_id,homebanking_active_x,has_homebanking_x,has_insurance_21_x,has_insurance_23_x,has_life_insurance_fixed_cap_x,has_life_insurance_decreasing_cap_x,has_fire_car_other_insurance_x,has_personal_loan_x,has_mortgage_loan_x,has_current_account_x,has_pension_saving_x,has_savings_account_x,has_savings_account_starter_x,has_current_account_starter_x,bal_insurance_21_x,bal_insurance_23_x,cap_life_insurance_fixed_cap_x,cap_life_insurance_decreasing_cap_x,prem_fire_car_other_insurance_x,bal_personal_loan_x,bal_mortgage_loan_x,bal_current_account_x,bal_pension_saving_x,bal_savings_account_x,bal_savings_account_starter_x,bal_current_account_starter_x,visits_distinct_so_x,visits_distinct_so_areas_x,customer_since_all_x,customer_since_bank_x,customer_gender_x,customer_birth_date_x,customer_postal_code_x,customer_occupation_code_x,customer_self_employed_x,customer_children_x,customer_relationship_x,homebanking_active_y,has_homebanking_y,has_insurance_21_y,has_insurance_23_y,has_life_insurance_fixed_cap_y,has_life_insurance_decreasing_cap_y,has_fire_car_other_insurance_y,has_personal_loan_y,has_mortgage_loan_y,has_current_account_y,has_pension_saving_y,has_savings_account_y,has_savings_account_starter_y,has_current_account_starter_y,bal_insurance_21_y,bal_insurance_23_y,cap_life_insurance_fixed_cap_y,cap_life_insurance_decreasing_cap_y,prem_fire_car_other_insurance_y,bal_personal_loan_y,bal_mortgage_loan_y,bal_current_account_y,bal_pension_saving_y,bal_savings_account_y,bal_savings_account_starter_y,bal_current_account_starter_y,visits_distinct_so_y,visits_distinct_so_areas_y,customer_since_all_y,customer_since_bank_y,customer_gender_y,customer_birth_date_y,customer_postal_code_y,customer_occupation_code_y,customer_self_employed_y,customer_children_y,customer_relationship_y,homebanking_active,has_homebanking,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_since_all,customer_since_bank,customer_gender,customer_birth_date,customer_postal_code,customer_occupation_code,customer_self_employed,customer_children,customer_relationship
54252,54252,3d2f122a6116abf7b6bebaa7267b411a,0,0,1,0,0,0,1,0,0,0,0,1,0,0,7040,0,0,0,470,0,0,0,0,9560,0,0,1.0,1.0,8585.0,8585.0,1,21033,8900,9.0,0,no,couple,0,0,1,0,0,0,1,0,0,0,0,1,0,0,7060,0,0,0,470,0,0,0,0,9560,0,0,1.0,1.0,8585.0,8585.0,1,21033,8900,9.0,0,no,couple,0,0,1,0,0,0,1,0,0,0,0,1,0,0,7070,0,0,0,470,0,0,0,0,9560,0,0,1.0,1.0,8585.0,8585.0,1,21033,8900,9.0,0,no,couple
25386,25386,746d6debfe5078479b535f04ffa7633e,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,7390,0,0,1.0,1.0,3379.0,3379.0,2,13698,8750,9.0,0,yes,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,7390,0,0,1.0,1.0,3379.0,3379.0,2,13698,8750,9.0,0,yes,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,7390,0,0,1.0,1.0,3379.0,3379.0,2,13698,8750,9.0,0,no,
63582,63582,dddf3a22010bda2235cc67edaa97a0b0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,2730,0,0,0,0,0,0,0,0,11880,0,0,2.0,1.0,8311.0,2557.0,2,21033,3120,4.0,1,yes,couple,0,0,1,0,0,0,0,0,0,0,0,1,0,0,2730,0,0,0,0,0,0,0,0,11880,0,0,2.0,1.0,8311.0,2557.0,2,21033,3120,4.0,1,yes,couple,0,0,1,0,0,0,0,0,0,0,0,1,0,0,2730,0,0,0,0,0,0,0,0,11880,0,0,2.0,1.0,8311.0,2557.0,2,21033,3120,4.0,1,yes,couple
29683,29683,d145c43f9157e844c1e401347bddc0d9,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,9000,0,0,0,9000,1.0,1.0,365.0,365.0,1,15981,6780,9.0,0,no,single,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,9000,0,0,0,9000,1.0,1.0,365.0,365.0,1,15981,6780,9.0,0,no,single,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,9000,0,0,0,9000,1.0,1.0,365.0,365.0,1,15981,6780,9.0,0,no,single
34535,34535,523bc609bb94be51126f84f8d0ed8112,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,2490,0,5320,0,0,1.0,1.0,10137.0,10137.0,2,30285,9100,9.0,0,no,,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,2060,0,5320,0,0,1.0,1.0,10137.0,10137.0,2,30285,9100,9.0,0,no,,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,2170,0,5320,0,0,1.0,1.0,10137.0,10137.0,2,30285,9100,9.0,0,no,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61404,61404,b733ba51aa0aab4df704acfdc824de27,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,25280,0,0,1.0,1.0,13545.0,13545.0,2,29342,9031,9.0,0,no,single,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,25280,0,0,1.0,1.0,13545.0,13545.0,2,29342,9031,9.0,0,no,single,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,25280,0,0,1.0,1.0,13545.0,13545.0,2,29342,9031,9.0,0,no,single
17730,17730,4db70268fd7e3f7b3ce4254e56d7d349,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,25790,0,0,1.0,1.0,3806.0,3806.0,2,16346,5081,9.0,0,yes,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,25790,0,0,1.0,1.0,3806.0,3806.0,2,16346,5081,9.0,0,yes,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,25790,0,0,1.0,1.0,3806.0,3806.0,2,16346,5081,9.0,0,yes,couple
28030,28030,412c281f70885eed8030c85eb3b42604,1,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,1200,0,0,980,0,5000,0,980,1.0,1.0,7154.0,3714.0,2,23498,1450,9.0,0,no,,1,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,1200,0,0,510,0,5850,0,510,1.0,1.0,7154.0,3714.0,2,23498,1450,9.0,0,no,,1,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,1230,0,0,1010,0,4400,0,1010,1.0,1.0,7154.0,3714.0,2,23498,1450,9.0,0,no,
15725,15725,e9931a3bd1d2f8f75eff51a23191a581,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1630,0,4670,0,0,1.0,1.0,9376.0,9376.0,2,27700,1083,9.0,0,no,,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1210,0,4670,0,0,1.0,1.0,9376.0,9376.0,2,27700,1083,9.0,0,no,,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1650,0,4670,0,0,1.0,1.0,9376.0,9376.0,2,27700,1083,9.0,0,no,


In [17]:
pipe = pipe.fit(X_train, y_train)

# pipe.steps[0][1].mean_

print('Testing score: ', pipe.score(X_test, y_test))

ValueError: Found unknown categories [17320.0] in column 6 during transform

In [None]:
n_features_to_test = np.arange(1, 11)

alpha_to_test = 2.0**np.arange(-6, +6)

params = {'reduce_dim__n_components': n_features_to_test,
          'regressor__alpha': alpha_to_test,
         'scaler' : [StandardScaler(), RobustScaler()]}

from sklearn.model_selection import GridSearchCV
gridsearch = GridSearchCV(pipe, params, verbose=1).fit(X_train, y_train)
print('Final score is: ', gridsearch.score(X_test, y_test))

gridsearch.best_params_