In [1]:
import pandas as pd
import numpy as np
import load_data
from ipynb.fs.full.Get_Base_Data_00 import Time

from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report,recall_score,confusion_matrix, roc_auc_score, precision_score, f1_score, roc_curve, auc, plot_confusion_matrix,plot_roc_curve
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Import Libraries for Modelling
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Raise the display limits so wide/long DataFrames are shown instead of being
# truncated: up to 999 rows and 999 columns.
pd.set_option(
    'display.max_rows', 999,
    'display.max_columns', 999,
)

In [2]:
def evaluate_250(y_val, y_pred, top_n=250):
    """Print accuracy, recall, ROC AUC and precision for the first ``top_n`` rows.

    Parameters
    ----------
    y_val : sliceable sequence
        Ground-truth labels.
    y_pred : sliceable sequence
        Predicted labels, aligned position-by-position with ``y_val``.
    top_n : int, default 250
        Number of leading rows to score. The default reproduces the original
        fixed "top 250" evaluation.

    Returns
    -------
    None
        The four metrics are printed, each rounded to 4 decimals.

    Notes
    -----
    NOTE(review): the rows are taken in the order they are passed in; nothing
    in this function ranks them (e.g. by predicted probability). If a ranked
    "top N" is intended, the caller has to sort before calling. ROC AUC is
    also computed from whatever ``y_pred`` holds, so with hard class labels it
    is a single-threshold AUC rather than a score-based one.
    """
    # Slice once so every metric is guaranteed to see the same rows.
    y_true_head = y_val[:top_n]
    y_pred_head = y_pred[:top_n]

    print(f'Accuracy: {round(accuracy_score(y_true_head, y_pred_head),4)}')
    print(f'Recall: {round(recall_score(y_true_head, y_pred_head),4)}')

    # roc_auc_score is undefined when the slice contains a single class and
    # raises ValueError in that case; report NaN instead of aborting the whole
    # report. Shape/length problems have already surfaced in accuracy_score
    # above, so they are not hidden by this handler.
    try:
        roc_auc = round(roc_auc_score(y_true_head, y_pred_head), 4)
    except ValueError:
        roc_auc = float('nan')
    print(f'ROC_AUC: {roc_auc}')

    print(f'Precision: {round(precision_score(y_true_head, y_pred_head),4)}')

In [9]:
def _report_missing_columns(frame, columns):
    """Print a ``missing: <name>`` line for each requested column not in ``frame``."""
    for col in columns:
        if col not in frame.columns:
            print(f'missing: {col}')


def _build_preprocessor(numeric_features, categorical_features):
    """Build the ColumnTransformer that imputes and encodes the selected columns."""
    # Replace NA with the column mean, then apply Normalizer.
    # NOTE(review): Normalizer rescales each *row* to unit norm, it does not
    # scale each feature column. StandardScaler (left commented out below) is
    # the per-column alternative - confirm which one is intended.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('normalize', Normalizer()),
    #     ('scaler', StandardScaler())
    ])

    # Replace NA with the column mode, then map each category to an integer
    # code (one code per distinct category, not just 0/1).
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder())
    ])

    # Columns not listed in either feature list are dropped (the default
    # remainder behaviour). Setting remainder='passthrough' would instead pass
    # them through without transformation.
    return ColumnTransformer(#remainder='passthrough',
                             transformers=[
                                 ('numeric', numeric_transformer, numeric_features),
                                 ('categorical', categorical_transformer, categorical_features)
                             ])


def _build_catboost_pipeline(preprocessor, random_state=0):
    """Chain preprocessing, random oversampling and a CatBoost classifier."""
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
    #     ('oversample', SMOTE(sampling_strategy=0.4)),
    #     ('undersample', RandomUnderSampler(sampling_strategy=0.8)),
        # Seed the sampler as well as the model: resampling is random, so
        # without a seed two runs on the same data give different metrics.
        ('oversample', RandomOverSampler(sampling_strategy=0.8, random_state=random_state)),
        ('catboost', CatBoostClassifier(verbose=False, random_state=random_state)),
    ])


def main(data_dir="../data/"):
    """Fit the CatBoost pipeline on the training split and print validation metrics.

    Parameters
    ----------
    data_dir : str, default "../data/"
        Directory handed to ``load_data.get_file_names`` and
        ``load_data.load_copy_data``. The default keeps the original
        hard-coded location.

    Returns
    -------
    None
        Metrics are printed by ``evaluate_250``.
    """
    # Load Data
    # The loader result is indexed by name; only the four pre-made split
    # tables are used here.
    mydata = load_data.get_file_names(data_dir)
    data_files = load_data.load_copy_data(mydata, data_dir)

    X_train = data_files['X_train'].copy()
    X_val = data_files['X_val'].copy()
    y_train = data_files['y_train']['target'].copy()
    y_val = data_files['y_val']['target'].copy()

    # Categorical Features to Preprocess
    # NOTE(review): 'has_savings_account_starter_m2' is not listed although the
    # _m1 and unsuffixed variants are - confirm whether that is deliberate.
    categorical_features = [
        'homebanking_active_m1', 'homebanking_active_m2', 'homebanking_active',
        'has_homebanking_m1', 'has_homebanking_m2', 'has_homebanking',
        'has_insurance_21_m1', 'has_insurance_21_m2', 'has_insurance_21',
        'has_insurance_23_m1', 'has_insurance_23_m2', 'has_insurance_23',
        'has_life_insurance_fixed_cap_m1', 'has_life_insurance_fixed_cap_m2', 'has_life_insurance_fixed_cap',
        'has_life_insurance_decreasing_cap_m1', 'has_life_insurance_decreasing_cap_m2', 'has_life_insurance_decreasing_cap',
        'has_fire_car_other_insurance_m1', 'has_fire_car_other_insurance_m2', 'has_fire_car_other_insurance',
        'has_personal_loan_m1', 'has_personal_loan_m2', 'has_personal_loan',
        'has_mortgage_loan_m1', 'has_mortgage_loan_m2', 'has_mortgage_loan',
        'has_current_account_m1', 'has_current_account_m2', 'has_current_account',
        'has_pension_saving_m1', 'has_pension_saving_m2', 'has_pension_saving',
        'has_savings_account_m1', 'has_savings_account_m2', 'has_savings_account',
        'has_savings_account_starter_m1', 'has_savings_account_starter',
        'has_current_account_starter_m1', 'has_current_account_starter_m2', 'has_current_account_starter',
        'visits_distinct_so_m1', 'visits_distinct_so_m2', 'visits_distinct_so',
        'visits_distinct_so_areas_m1', 'visits_distinct_so_areas_m2', 'visits_distinct_so_areas',
        'customer_gender_m1',
    #     'customer_postal_code_x', drop this, causes error...
        'customer_occupation_code_m1',
        'customer_self_employed_m1', 'customer_self_employed_m2', 'customer_self_employed',
        'customer_education_m1',
        'customer_children_m1', 'customer_children_m2', 'customer_children',
        'customer_relationship_m1', 'customer_relationship_m2', 'customer_relationship',
        'area_cat',
    ]

    # Numerical Features to Preprocess
    numeric_features = [
        'bal_insurance_21_m1', 'bal_insurance_21_m2', 'bal_insurance_21',
        'bal_insurance_23_m1', 'bal_insurance_23_m2', 'bal_insurance_23',
        'cap_life_insurance_fixed_cap_m1', 'cap_life_insurance_fixed_cap_m2', 'cap_life_insurance_fixed_cap',
        'cap_life_insurance_decreasing_cap_m1', 'cap_life_insurance_decreasing_cap_m2', 'cap_life_insurance_decreasing_cap',
        'prem_fire_car_other_insurance_m1', 'prem_fire_car_other_insurance_m2', 'prem_fire_car_other_insurance',
        'bal_personal_loan_m1', 'bal_personal_loan_m2', 'bal_personal_loan',
        'bal_mortgage_loan_m1', 'bal_mortgage_loan_m2', 'bal_mortgage_loan',
        'bal_current_account_m1', 'bal_current_account_m2', 'bal_current_account',
        'bal_pension_saving_m1', 'bal_pension_saving_m2', 'bal_pension_saving',
        'bal_savings_account_m1', 'bal_savings_account_m2', 'bal_savings_account',
        'bal_savings_account_starter_m1', 'bal_savings_account_starter_m2', 'bal_savings_account_starter',
        'bal_current_account_starter_m1', 'bal_current_account_starter_m2', 'bal_current_account_starter',
        'customer_since_all_m1',
        'customer_since_bank_m1',
        'customer_birth_date_m1',
    ]

    # Report any listed column that X_train does not have. Both lists are
    # checked (categorical first) because a missing name in either one makes
    # the preprocessing step fail at fit time.
    _report_missing_columns(X_train, categorical_features + numeric_features)

    # CatBoost
    preprocessor = _build_preprocessor(numeric_features, categorical_features)
    pipeline = _build_catboost_pipeline(preprocessor, random_state=0)
    pipeline.fit(X_train, y_train)

    # Evaluate Model
    y_pred = pipeline.predict(X_val)
    evaluate_250(y_val, y_pred)

In [10]:
# Entry point: call main() only when __name__ is '__main__', so importing
# this code from elsewhere does not trigger the training run.
if __name__ == '__main__':
    main()

file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target
Accuracy: 0.956
Recall: 0.2
ROC_AUC: 0.5857
Precision: 0.125


In [None]:
# #DecisionTree

# pipeline_dt = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('oversample', SMOTE(sampling_strategy=0.1)),
#     ('undersample', RandomUnderSampler(sampling_strategy=0.5)),
#     ('DecisionTree', DecisionTreeClassifier()),
# ])

# pipeline_dt.fit(X_train, y_train)

# # Evaluate Model
# y_pred_dt = pipeline_dt.predict(X_val)

# # Top 250 Evaluation
# print(f'Accuracy: {round(accuracy_score(y_val[:250], y_pred_dt[:250]),4)}')
# print(f'Recall: {round(recall_score(y_val[:250], y_pred_dt[:250]),4)}')
# print(f'ROC_AUC: {round(roc_auc_score(y_val[:250], y_pred_dt[:250]),4)}')
# print(f'Precision: {round(precision_score(y_val[:250], y_pred_dt[:250]),4)}')

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier(n_estimators=100, max_depth=40,
#                                min_samples_leaf=50,
#                                n_jobs=-1, class_weight='balanced',
#                                random_state=50)

# pipeline_rf = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('oversample', RandomOverSampler(sampling_strategy=0.5)),
#     ('RandomForest', model),
# ])

# pipeline_rf.fit(X_train, y_train)

# # Evaluate Model
# y_pred_rf = pipeline_rf.predict(X_val)

# # Top 250 Evaluation
# print(f'Accuracy: {round(accuracy_score(y_val[:250], y_pred_rf[:250]),4)}')
# print(f'Recall: {round(recall_score(y_val[:250], y_pred_rf[:250]),4)}')
# print(f'ROC_AUC: {round(roc_auc_score(y_val[:250], y_pred_rf[:250]),4)}')
# print(f'Precision: {round(precision_score(y_val[:250], y_pred_rf[:250]),4)}')

In [None]:
# #Logistic

# pipeline_log = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('oversample', SMOTE(sampling_strategy=0.1)),
#     ('undersample', RandomUnderSampler(sampling_strategy=0.5)),
#     ('Logistic', LogisticRegression(random_state=0)),
# ])

# pipeline_log.fit(X_train, y_train)

# # Evaluate Model
# y_pred_log = pipeline_log.predict(X_val)

# # Top 250 Evaluation
# print(f'Accuracy: {round(accuracy_score(y_val[:250], y_pred_log[:250]),4)}')
# print(f'Recall: {round(recall_score(y_val[:250], y_pred_log[:250]),4)}')
# print(f'ROC_AUC: {round(roc_auc_score(y_val[:250], y_pred_log[:250]),4)}')
# print(f'Precision: {round(precision_score(y_val[:250], y_pred_log[:250]),4)}')

In [None]:
# # https://www.kaggle.com/code/ritesh2000/hyperparameter-tuning-using-pipelines-optuna/notebook

# pipelines = [pipeline, pipeline_dt, pipeline_log, pipeline_rf]

# best_accuracy=0.0
# best_classifier=0
# best_pipeline=""

# pipe_dict={0:'CatBoost',1:'Decision Tree',2:'Logistic Regression',3:'Random Forest'}

# # Fit the pipelines
# for pipe in pipelines: 
#     pipe.fit(X_train, y_train)


# for i,model in enumerate(pipelines):
#     if model.score(X_val,y_val)>best_accuracy:
#         best_accuracy=model.score(X_val,y_val)
#         best_pipeline=model
#         best_classifier=i
# print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

# from sklearn.model_selection import GridSearchCV,RandomizedSearchCV


# pipe = Pipeline([('classifier',RandomForestClassifier())])
# # You can also try other classifiers by adding another dictionary to grid_params.
# grid_params = [{'classifier':[RandomForestClassifier()],
#                 'classifier__n_estimators' : [10,20,30,40,50,60,70,80],
#                 'classifier__criterion' : ["gini","entropy"],
#                 'classifier__max_depth' : [2,4,6,8,10],
#                 'classifier__min_samples_split':[5,4,6,7,8],
#                 'classifier__max_features':['auto', 'sqrt', 'log2']
#                 }]

# grid_search = GridSearchCV(pipe, grid_params, cv=5, verbose=0,n_jobs=-1,scoring="roc_auc")

# grid_search.fit(X_train,y_train)
# best_accuracy = grid_search.best_score_
# best_parameters = grid_search.best_params_
# print("Best ROC_AUC: {:.2f} %".format(best_accuracy*100))
# print("Best Parameters:", best_parameters)
