In [31]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/open-shopee-code-league-marketing-analytics/users.csv
/kaggle/input/open-shopee-code-league-marketing-analytics/sample_submission_0_1.csv
/kaggle/input/open-shopee-code-league-marketing-analytics/train.csv
/kaggle/input/open-shopee-code-league-marketing-analytics/test.csv


In [83]:
user_df = pd.read_csv('/kaggle/input/open-shopee-code-league-marketing-analytics/users.csv')
sample_submission_df = pd.read_csv('/kaggle/input/open-shopee-code-league-marketing-analytics/sample_submission_0_1.csv')
train_df = pd.read_csv('/kaggle/input/open-shopee-code-league-marketing-analytics/train.csv')
test_df = pd.read_csv('/kaggle/input/open-shopee-code-league-marketing-analytics/test.csv')

In [84]:
# Additional libraries
import re
import networkx
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, matthews_corrcoef, accuracy_score
from xgboost import XGBRFClassifier

# 1. Preprocessing
0. Utilities
1. User
2. Train
3. Test

## 1.0. Utilities

In [85]:
def is_never (string):
    if (re.search('Never',string)):
        return int(1)
    else: 
        return int(0)
    
def process_categorical(df, col_name, prefix):
    one_hot = pd.get_dummies(df[col_name],prefix=prefix,prefix_sep='_',dtype=int)
    return one_hot

def process_numerical_data(df, numerical_cols, drop_numerical_columns):
#     sacalar = MinMaxScaler()
    sacalar = StandardScaler() #if using StandardScaler
    scale_numerical_cols = list(set(numerical_cols)-set(drop_numerical_columns))
    df_numerical = sacalar.fit_transform(df[scale_numerical_cols])
    df_numerical = pd.DataFrame(df_numerical,columns=scale_numerical_cols)
    return df_numerical

def convert_rate10(value):
    return float(value/10)

def convert_rate30(value):
    return float(value/30)

def convert_rate60(value):
    return float(value/60)

def evaluate_model(model, X_valid, y_valid, metric=accuracy_score):
    y_pred = model.predict(X_valid)
    return metric(y_valid, y_pred)

## 1.1. User

In [86]:
user_df = pd.read_csv('/kaggle/input/open-shopee-code-league-marketing-analytics/users.csv')

In [87]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127886 entries, 0 to 127885
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  127886 non-null  int64  
 1   attr_1   78987 non-null   float64
 2   attr_2   127439 non-null  float64
 3   attr_3   127886 non-null  float64
 4   age      78987 non-null   float64
 5   domain   127886 non-null  object 
dtypes: float64(4), int64(1), object(1)
memory usage: 5.9+ MB


In [89]:
# attr_1 and attr_2: temporary drop
# user_df = user_df.drop(["attr_1","attr_2"],axis = 1)

# # attr_1
attr1_nan = user_df['attr_1'].isna()
user_df.loc[attr1_nan, 'attr_1'] = np.random.randint(2, size=attr1_nan.sum())
user_df['attr_1'] = user_df['attr_1'].astype('int64')
user_attr_1 = process_categorical(user_df, "attr_1", "attr_1")

# # attr_2
attr2_nan = user_df['attr_2'].isna()
user_df.loc[attr2_nan, 'attr_2'] = np.random.randint(2, size=attr2_nan.sum())
user_df['attr_2'] = user_df['attr_2'].astype('int64')
user_attr_2 = process_categorical(user_df, "attr_2", "attr_2")

# attr_3
user_attr_3 = process_categorical(user_df, "attr_3", "attr_3")

# age
age_nan = user_df['age'].isna()
user_df.loc[age_nan, 'age'] = np.round(np.random.uniform(15, 65, size=age_nan.sum()),0)

# domain
user_domain = process_categorical(user_df, "domain", "domain")

In [91]:
# merge into new user_df
new_user_df = pd.concat([
    user_df['user_id'],
    user_df['age'], 
#     user_attr_1, 
#     user_attr_2, 
    user_attr_3, 
    user_domain
                        ], axis=1)
new_user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127886 entries, 0 to 127885
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   user_id                 127886 non-null  int64  
 1   age                     127886 non-null  float64
 2   attr_3_0.0              127886 non-null  int64  
 3   attr_3_1.0              127886 non-null  int64  
 4   attr_3_2.0              127886 non-null  int64  
 5   attr_3_3.0              127886 non-null  int64  
 6   attr_3_4.0              127886 non-null  int64  
 7   domain_@163.com         127886 non-null  int64  
 8   domain_@gmail.com       127886 non-null  int64  
 9   domain_@hotmail.com     127886 non-null  int64  
 10  domain_@icloud.com      127886 non-null  int64  
 11  domain_@live.com        127886 non-null  int64  
 12  domain_@outlook.com     127886 non-null  int64  
 13  domain_@qq.com          127886 non-null  int64  
 14  domain_@rocketmail.c

## 1.2. Train

In [92]:
train_df = pd.read_csv('/kaggle/input/open-shopee-code-league-marketing-analytics/train.csv')

In [93]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73539 entries, 0 to 73538
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   country_code                 73539 non-null  int64 
 1   grass_date                   73539 non-null  object
 2   user_id                      73539 non-null  int64 
 3   subject_line_length          73539 non-null  int64 
 4   last_open_day                73539 non-null  object
 5   last_login_day               73539 non-null  object
 6   last_checkout_day            73539 non-null  object
 7   open_count_last_10_days      73539 non-null  int64 
 8   open_count_last_30_days      73539 non-null  int64 
 9   open_count_last_60_days      73539 non-null  int64 
 10  login_count_last_10_days     73539 non-null  int64 
 11  login_count_last_30_days     73539 non-null  int64 
 12  login_count_last_60_days     73539 non-null  int64 
 13  checkout_count_last_10_days  73

In [94]:
# country
train_df_country = process_categorical(train_df, "country_code", "country")

# convert grass_date and create email_sent_dayofweek
train_df['grass_date'] = pd.to_datetime(train_df['grass_date'])
train_df['email_sent_dayofweek'] = train_df['grass_date'].dt.dayofweek
train_df_weekday = process_categorical(train_df, "email_sent_dayofweek", "email_sent_weekday")

# never_open, never_login, never_checkout
train_df['never_open'] = train_df['last_open_day'].apply(is_never)
train_df['never_login'] = train_df['last_login_day'].apply(is_never)
train_df['never_checkout'] = train_df['last_checkout_day'].apply(is_never)

# replace never_open, never_login, never_checkout | last_open_day, last_login_day, last_checkout_day
train_df['last_open_day'] = train_df['last_open_day'].replace(['Never open'],'1600')
train_df['last_login_day'] = train_df['last_login_day'].replace(['Never login'],'36000')
train_df['last_checkout_day'] = train_df['last_checkout_day'].replace(['Never checkout'],'3000')

train_df['last_open_day'] = train_df['last_open_day'].astype('int64')
train_df['last_login_day'] = train_df['last_login_day'].astype('int64')
train_df['last_checkout_day'] = train_df['last_checkout_day'].astype('int64')

# open_count_last_10_days, open_count_last_30_days, open_count_last_60_days
train_df['open_count_rate_last_10_days'] = train_df['open_count_last_10_days'].apply(convert_rate10)
train_df['open_count_rate_last_30_days'] = train_df['open_count_last_30_days'].apply(convert_rate30)
train_df['open_count_rate_last_60_days'] = train_df['open_count_last_60_days'].apply(convert_rate60)

train_df['open_count_further_20_days'] = train_df['open_count_last_30_days'] - train_df['open_count_last_10_days']
train_df['open_count_further_30_days'] = train_df['open_count_last_60_days'] - train_df['open_count_last_30_days']

# login_count_last_10_days, login_count_last_30_days, login_count_last_60_days
train_df['login_count_rate_last_10_days'] = train_df['login_count_last_10_days'].apply(convert_rate10)
train_df['login_count_rate_last_30_days'] = train_df['login_count_last_30_days'].apply(convert_rate30)
train_df['login_count_rate_last_60_days'] = train_df['login_count_last_60_days'].apply(convert_rate60)

train_df['login_count_further_20_days'] = train_df['login_count_last_30_days'] - train_df['login_count_last_10_days']
train_df['login_count_further_30_days'] = train_df['login_count_last_60_days'] - train_df['login_count_last_30_days']

# checkout_count_last_10_days, checkout_count_last_30_days, checkout_count_last_60_days
train_df['checkout_count_rate_last_10_days'] = train_df['checkout_count_last_10_days'].apply(convert_rate10)
train_df['checkout_count_rate_last_30_days'] = train_df['checkout_count_last_30_days'].apply(convert_rate30)
train_df['checkout_count_rate_last_60_days'] = train_df['checkout_count_last_60_days'].apply(convert_rate60)

train_df['checkout_count_further_20_days'] = train_df['checkout_count_last_30_days'] - train_df['checkout_count_last_10_days']
train_df['checkout_count_further_30_days'] = train_df['checkout_count_last_60_days'] - train_df['checkout_count_last_30_days']

# row_id: drop
train_df = train_df.drop(["row_id"],axis = 1)

# open_flag: label
open_flag = train_df['open_flag']

In [95]:
_num = ['subject_line_length',
                'last_open_day', 'last_login_day', 'last_checkout_day',
                'open_count_last_10_days',#'open_count_last_30_days','open_count_last_60_days',
                'open_count_further_20_days','open_count_further_30_days',
                'login_count_last_10_days',#'login_count_last_30_days','login_count_last_60_days',
                'login_count_further_20_days','login_count_further_30_days',
                'checkout_count_last_10_days',#'checkout_count_last_30_days','checkout_count_last_60_days',
                'checkout_count_further_20_days','checkout_count_further_30_days',
               ]
drop_numerical_columns = []
train_df_numerical = process_numerical_data(train_df,_num,drop_numerical_columns)
# train_df_numerical

In [96]:
# merge with user_df
new_train_df = pd.concat([train_df_country, train_df_weekday, 
                          train_df[[
                              'user_id',
#                               'subject_line_length',
#                               'last_open_day','last_login_day','last_checkout_day',
                              'never_open','never_login','never_checkout',
                              'open_count_rate_last_10_days','open_count_rate_last_30_days','open_count_rate_last_60_days',
                              'login_count_rate_last_10_days','login_count_rate_last_30_days','login_count_rate_last_60_days',
                              'checkout_count_rate_last_10_days','checkout_count_rate_last_30_days','checkout_count_rate_last_60_days',
#                               'open_count_last_10_days','open_count_last_30_days','open_count_last_60_days',
#                               'open_count_further_20_days','open_count_further_30_days',
#                               'login_count_last_10_days','login_count_last_30_days','login_count_last_60_days',
#                               'login_count_further_20_days','login_count_further_30_days',
#                               'checkout_count_last_10_days','checkout_count_last_30_days','checkout_count_last_60_days',
#                               'checkout_count_further_20_days','checkout_count_further_30_days',
                                   ]],
                          train_df_numerical
                         ], axis=1)
new_train_df = new_train_df.merge(new_user_df, how='left', left_on='user_id', right_on='user_id')
new_train_df = new_train_df.drop(["user_id"], axis=1)
new_train_df = pd.concat([new_train_df, train_df['open_flag']], axis=1)
new_train_df = new_train_df.dropna(axis=0)

In [97]:
new_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73539 entries, 0 to 73538
Data columns (total 57 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   country_1                         73539 non-null  int64  
 1   country_2                         73539 non-null  int64  
 2   country_3                         73539 non-null  int64  
 3   country_4                         73539 non-null  int64  
 4   country_5                         73539 non-null  int64  
 5   country_6                         73539 non-null  int64  
 6   country_7                         73539 non-null  int64  
 7   email_sent_weekday_0              73539 non-null  int64  
 8   email_sent_weekday_1              73539 non-null  int64  
 9   email_sent_weekday_2              73539 non-null  int64  
 10  email_sent_weekday_3              73539 non-null  int64  
 11  email_sent_weekday_4              73539 non-null  int64  
 12  emai

In [98]:
print(new_train_df.open_flag.value_counts())

0    62083
1    11456
Name: open_flag, dtype: int64


In [99]:
# Divide by class
df_class_0 = new_train_df[new_train_df['open_flag'] == 0]
df_class_1 = new_train_df[new_train_df['open_flag'] == 1]

# Upsampling
class_1_upsampling = df_class_1.sample(20000, replace=True)
balanced_train_df = pd.concat([df_class_0, df_class_1, class_1_upsampling], axis=0)

In [100]:
train_features = new_train_df.iloc[:, :-1]
train_labels = new_train_df.iloc[:, -1]

## 1.3. Test

In [101]:
test_df = pd.read_csv('/kaggle/input/open-shopee-code-league-marketing-analytics/test.csv')

In [102]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55970 entries, 0 to 55969
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   country_code                 55970 non-null  int64 
 1   grass_date                   55970 non-null  object
 2   user_id                      55970 non-null  int64 
 3   subject_line_length          55970 non-null  int64 
 4   last_open_day                55970 non-null  object
 5   last_login_day               55970 non-null  object
 6   last_checkout_day            55970 non-null  object
 7   open_count_last_10_days      55970 non-null  int64 
 8   open_count_last_30_days      55970 non-null  int64 
 9   open_count_last_60_days      55970 non-null  int64 
 10  login_count_last_10_days     55970 non-null  int64 
 11  login_count_last_30_days     55970 non-null  int64 
 12  login_count_last_60_days     55970 non-null  int64 
 13  checkout_count_last_10_days  55

In [103]:
# country
test_df_country = process_categorical(test_df, "country_code", "country")

# convert grass_date and create email_sent_dayofweek
test_df['grass_date'] = pd.to_datetime(test_df['grass_date'])
test_df['email_sent_dayofweek'] = test_df['grass_date'].dt.dayofweek
test_df_weekday = process_categorical(test_df, "email_sent_dayofweek", "email_sent_weekday")

# never_open, never_login, never_checkout
test_df['never_open'] = test_df['last_open_day'].apply(is_never)
test_df['never_login'] = test_df['last_login_day'].apply(is_never)
test_df['never_checkout'] = test_df['last_checkout_day'].apply(is_never)

# replace never_open, never_login, never_checkout | last_open_day, last_login_day, last_checkout_day
test_df['last_open_day'] = test_df['last_open_day'].replace(['Never open'],'1600')
test_df['last_login_day'] = test_df['last_login_day'].replace(['Never login'],'36000')
test_df['last_checkout_day'] = test_df['last_checkout_day'].replace(['Never checkout'],'3000')

test_df['last_open_day'] = test_df['last_open_day'].astype('int64')
test_df['last_login_day'] = test_df['last_login_day'].astype('int64')
test_df['last_checkout_day'] = test_df['last_checkout_day'].astype('int64')

# open_count_last_10_days, open_count_last_30_days, open_count_last_60_days
test_df['open_count_rate_last_10_days'] = test_df['open_count_last_10_days'].apply(convert_rate10)
test_df['open_count_rate_last_30_days'] = test_df['open_count_last_30_days'].apply(convert_rate30)
test_df['open_count_rate_last_60_days'] = test_df['open_count_last_60_days'].apply(convert_rate60)

test_df['open_count_further_20_days'] = test_df['open_count_last_30_days'] - test_df['open_count_last_10_days']
test_df['open_count_further_30_days'] = test_df['open_count_last_60_days'] - test_df['open_count_last_30_days']

# login_count_last_10_days, login_count_last_30_days, login_count_last_60_days
test_df['login_count_rate_last_10_days'] = test_df['login_count_last_10_days'].apply(convert_rate10)
test_df['login_count_rate_last_30_days'] = test_df['login_count_last_30_days'].apply(convert_rate30)
test_df['login_count_rate_last_60_days'] = test_df['login_count_last_60_days'].apply(convert_rate60)

test_df['login_count_further_20_days'] = test_df['login_count_last_30_days'] - test_df['login_count_last_10_days']
test_df['login_count_further_30_days'] = test_df['login_count_last_60_days'] - test_df['login_count_last_30_days']

# checkout_count_last_10_days, checkout_count_last_30_days, checkout_count_last_60_days
test_df['checkout_count_rate_last_10_days'] = test_df['checkout_count_last_10_days'].apply(convert_rate10)
test_df['checkout_count_rate_last_30_days'] = test_df['checkout_count_last_30_days'].apply(convert_rate30)
test_df['checkout_count_rate_last_60_days'] = test_df['checkout_count_last_60_days'].apply(convert_rate60)

test_df['checkout_count_further_20_days'] = test_df['checkout_count_last_30_days'] - test_df['checkout_count_last_10_days']
test_df['checkout_count_further_30_days'] = test_df['checkout_count_last_60_days'] - test_df['checkout_count_last_30_days']

# row_id: drop
test_df = test_df.drop(["row_id"],axis = 1)

In [104]:
_num = ['subject_line_length',
                'last_open_day', 'last_login_day', 'last_checkout_day',
                'open_count_last_10_days',#'open_count_last_30_days','open_count_last_60_days',
                'open_count_further_20_days','open_count_further_30_days',
                'login_count_last_10_days',#'login_count_last_30_days','login_count_last_60_days',
                'login_count_further_20_days','login_count_further_30_days',
                'checkout_count_last_10_days',#'checkout_count_last_30_days','checkout_count_last_60_days',
                'checkout_count_further_20_days','checkout_count_further_30_days',
               ]
drop_numerical_columns = []
test_df_numerical = process_numerical_data(test_df,_num,drop_numerical_columns)

In [105]:
# merge with user_df
new_test_df = pd.concat([test_df_country, test_df_weekday,
                         test_df[[
                             'user_id',
#                              'subject_line_length',
#                              'last_open_day','last_login_day','last_checkout_day',
                             'never_open','never_login','never_checkout',
                             'open_count_rate_last_10_days','open_count_rate_last_30_days','open_count_rate_last_60_days',
                             'login_count_rate_last_10_days','login_count_rate_last_30_days','login_count_rate_last_60_days',
                             'checkout_count_rate_last_10_days','checkout_count_rate_last_30_days','checkout_count_rate_last_60_days',
#                              'open_count_last_10_days','open_count_last_30_days','open_count_last_60_days',
#                              'open_count_further_20_days','open_count_further_30_days',
#                              'login_count_last_10_days','login_count_last_30_days','login_count_last_60_days',
#                              'login_count_further_20_days','login_count_further_30_days',
#                              'checkout_count_last_10_days','checkout_count_last_30_days','checkout_count_last_60_days',
#                              'checkout_count_further_20_days','checkout_count_further_30_days',
                                 ]],
                         test_df_numerical
                         ], axis=1)
new_test_df = new_test_df.merge(new_user_df, how='left', left_on='user_id', right_on='user_id')
new_test_df = new_test_df.drop(["user_id"], axis=1)

In [106]:
test_x = new_test_df
test_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55970 entries, 0 to 55969
Data columns (total 56 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   country_1                         55970 non-null  int64  
 1   country_2                         55970 non-null  int64  
 2   country_3                         55970 non-null  int64  
 3   country_4                         55970 non-null  int64  
 4   country_5                         55970 non-null  int64  
 5   country_6                         55970 non-null  int64  
 6   country_7                         55970 non-null  int64  
 7   email_sent_weekday_0              55970 non-null  int64  
 8   email_sent_weekday_1              55970 non-null  int64  
 9   email_sent_weekday_2              55970 non-null  int64  
 10  email_sent_weekday_3              55970 non-null  int64  
 11  email_sent_weekday_4              55970 non-null  int64  
 12  emai

# 2. Confirmation
* Check all input feature for training and testing dataset
* Split train - validation set

In [108]:
# split the data into train and validation set
train_x, valid_x, train_y, valid_y = train_test_split(train_features,train_labels,test_size=0.2,random_state=27,shuffle=True)

# cross-validation using kfold
# kf = KFold(n_splits=10)

## Apply PCA

In [109]:
from sklearn.decomposition import PCA
pca = PCA(n_components=25)
train_x_pca = pd.DataFrame(pca.fit_transform(train_x))
valid_x_pca = pd.DataFrame(pca.transform(valid_x))
test_x_pca = pd.DataFrame(pca.transform(test_x))

# 3. Modelling

In [112]:
# Random Forest
rf_clf = RandomForestClassifier(max_features='auto', max_depth=20, random_state=27)
rf_clf.fit(train_x_pca, train_y)

y_true, y_pred = valid_y , rf_clf.predict(valid_x_pca)
matthews_score = evaluate_model(rf_clf, valid_x_pca, valid_y, metric=matthews_corrcoef)
print('Results on the test set:')
print(f'MCC Score = {matthews_score}')
print(classification_report(y_true, y_pred))

Results on the test set:
MCC Score = 0.49795656689008744
              precision    recall  f1-score   support

           0       0.90      0.97      0.93     12449
           1       0.70      0.44      0.54      2259

    accuracy                           0.89     14708
   macro avg       0.80      0.70      0.74     14708
weighted avg       0.87      0.89      0.87     14708



In [111]:
# Random Forest
rf_clf2 = RandomForestClassifier(max_features='auto', max_depth=25, random_state=27)
rf_clf2.fit(train_x, train_y)

y_true, y_pred = valid_y , rf_clf2.predict(valid_x)
matthews_score = evaluate_model(rf_clf2, valid_x, valid_y, metric=matthews_corrcoef)
print('Results on the test set:')
print(f'MCC Score = {matthews_score}')
print(classification_report(y_true, y_pred))

Results on the test set:
MCC Score = 0.5168542194824178
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     12449
           1       0.72      0.46      0.56      2259

    accuracy                           0.89     14708
   macro avg       0.82      0.71      0.75     14708
weighted avg       0.88      0.89      0.88     14708



In [113]:
# XGBoost
import xgboost as xgb
xgb_model1 = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model1.fit(train_x, train_y)#, early_stopping_rounds=5, eval_set=[(test_x, test_y)])

y_true, y_pred = valid_y , xgb_model1.predict(valid_x)
matthews_score = evaluate_model(xgb_model1, valid_x, valid_y, metric=matthews_corrcoef)
print('Results on the test set:')
print(f'MCC Score = {matthews_score}')
print(classification_report(y_true, y_pred))

Results on the test set:
MCC Score = 0.514058784352614
              precision    recall  f1-score   support

           0       0.91      0.96      0.94     12449
           1       0.70      0.47      0.56      2259

    accuracy                           0.89     14708
   macro avg       0.80      0.72      0.75     14708
weighted avg       0.88      0.89      0.88     14708



# 4. Add prediction as new column

In [74]:
train_x_pca['predicted'] = rf_clf.predict(train_x_pca)
valid_x_pca['predicted'] = rf_clf.predict(valid_x_pca)
test_x_pca['predicted'] = rf_clf.predict(test_x_pca)

In [115]:
train_x['predicted'] = rf_clf2.predict(train_x)
valid_x['predicted'] = rf_clf2.predict(valid_x)
test_x['predicted'] = rf_clf2.predict(test_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# 5. Modelling 2

In [None]:
# Bagging
bg_clf = BaggingClassifier(base_estimator=SVC(), n_estimators=20, random_state=10, max_features=10)
bg_clf.fit(train_x, train_y)

y_true, y_pred = test_y , bg_clf.predict(test_x)
matthews_score = evaluate_model(bg_clf, test_x, test_y, metric=matthews_corrcoef)
print('Results on the test set:')
print(f'MCC Score = {matthews_score}')
print(classification_report(y_true, y_pred))

In [None]:
# AdaBoost
ad_clf = AdaBoostClassifier(
    base_estimator=RandomForestClassifier(max_features='auto', max_depth=25, random_state=10), 
    n_estimators=50, 
    random_state=10, 
    learning_rate=0.1
)
ad_clf.fit(train_x, train_y)

y_true, y_pred = test_y , ad_clf.predict(test_x)
matthews_score = evaluate_model(ad_clf, test_x, test_y, metric=matthews_corrcoef)
print('Results on the test set:')
print(f'MCC Score = {matthews_score}')
print(classification_report(y_true, y_pred))

In [82]:
# MLP
input_size = train_x.shape[1]
NN_sgd = MLPClassifier(
    hidden_layer_sizes=(256, 512, 256,),
    solver='sgd',
    learning_rate='adaptive',
    momentum=0.8,
    max_iter=1000, 
    shuffle=True, 
    random_state=27,
    early_stopping=True,
    verbose=True)
NN_sgd.fit(train_x_pca,train_y)

y_true, y_pred = test_y , NN_sgd.predict(valid_x_pca)
matthews_score = evaluate_model(NN_sgd, valid_x_pca, valid_y, metric=matthews_corrcoef)
print('Results on the test set:')
print(f'MCC Score = {matthews_score}')
print(classification_report(y_true, y_pred))

Iteration 1, loss = 0.43221590
Validation score: 0.853501
Iteration 2, loss = 0.34700339
Validation score: 0.886642
Iteration 3, loss = 0.29780979
Validation score: 0.896669
Iteration 4, loss = 0.26358889
Validation score: 0.904827
Iteration 5, loss = 0.23320742
Validation score: 0.915194
Iteration 6, loss = 0.20165501
Validation score: 0.930659
Iteration 7, loss = 0.16948258
Validation score: 0.953093
Iteration 8, loss = 0.13851559
Validation score: 0.966859
Iteration 9, loss = 0.11190216
Validation score: 0.981985
Iteration 10, loss = 0.09193430
Validation score: 0.989803
Iteration 11, loss = 0.07837688
Validation score: 0.991672
Iteration 12, loss = 0.06955971
Validation score: 0.991842
Iteration 13, loss = 0.06366150
Validation score: 0.991842
Iteration 14, loss = 0.05973683
Validation score: 0.992012
Iteration 15, loss = 0.05679630
Validation score: 0.992012
Iteration 16, loss = 0.05482305
Validation score: 0.992012
Iteration 17, loss = 0.05327670
Validation score: 0.992012
Iterat

In [81]:
# MLP adam
input_size = train_x.shape[1]
NN_adam = MLPClassifier(
    hidden_layer_sizes=(256, 512, 256,),
    solver='adam',
    learning_rate='adaptive',
    learning_rate_init=0.01,
    max_iter=1000, 
    shuffle=True, 
    random_state=27,
    early_stopping=True,
    verbose=True)
NN_adam.fit(train_x_pca,train_y)

y_true, y_pred = test_y , NN_adam.predict(valid_x_pca)
matthews_score = evaluate_model(NN_adam, valid_x_pca, valid_y, metric=matthews_corrcoef)
print('Results on the test set:')
print(f'MCC Score = {matthews_score}')
print(classification_report(y_true, y_pred))

Iteration 1, loss = 0.10649851
Validation score: 0.990313
Iteration 2, loss = 0.04987715
Validation score: 0.992182
Iteration 3, loss = 0.04639496
Validation score: 0.991162
Iteration 4, loss = 0.04427040
Validation score: 0.992182
Iteration 5, loss = 0.04475644
Validation score: 0.992182
Iteration 6, loss = 0.04341453
Validation score: 0.992182
Iteration 7, loss = 0.04791464
Validation score: 0.990653
Iteration 8, loss = 0.04438403
Validation score: 0.992182
Iteration 9, loss = 0.04249351
Validation score: 0.988103
Iteration 10, loss = 0.04773133
Validation score: 0.990653
Iteration 11, loss = 0.05160993
Validation score: 0.989633
Iteration 12, loss = 0.04653751
Validation score: 0.990993
Iteration 13, loss = 0.04600788
Validation score: 0.990993
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Results on the test set:
MCC Score = 0.5040949991584631
              precision    recall  f1-score   support

           0       0.91      0.97     

In [119]:
# XGBoost
import xgboost as xgb
xgb_model2 = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model2.fit(train_x, train_y)#, early_stopping_rounds=5, eval_set=[(test_x, test_y)])

y_true, y_pred = valid_y , xgb_model2.predict(valid_x)
matthews_score = evaluate_model(xgb_model2, valid_x, valid_y, metric=matthews_corrcoef)
print('Results on the test set:')
print(f'MCC Score = {matthews_score}')
print(classification_report(y_true, y_pred))

Results on the test set:
MCC Score = 0.5165938924054588
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     12449
           1       0.72      0.46      0.56      2259

    accuracy                           0.89     14708
   macro avg       0.82      0.71      0.75     14708
weighted avg       0.88      0.89      0.88     14708



In [120]:
# Random Forest
rf_clf3 = RandomForestClassifier(max_features='auto', max_depth=None, random_state=27)
rf_clf3.fit(train_x, train_y)

y_true, y_pred = valid_y , rf_clf3.predict(valid_x)
matthews_score = evaluate_model(rf_clf3, valid_x, valid_y, metric=matthews_corrcoef)
print('Results on the test set:')
print(f'MCC Score = {matthews_score}')
print(classification_report(y_true, y_pred))

Results on the test set:
MCC Score = 0.5165938924054588
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     12449
           1       0.72      0.46      0.56      2259

    accuracy                           0.89     14708
   macro avg       0.82      0.71      0.75     14708
weighted avg       0.88      0.89      0.88     14708



# 6. Prediction

In [122]:
# Random forest
test_y_pred = xgb_model2.predict(test_x)
df_result = pd.DataFrame.from_dict(dict({'row_id':list(sample_submission_df['row_id']),
                                         'open_flag':test_y_pred}))
df_result

Unnamed: 0,row_id,open_flag
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
55965,55965,0
55966,55966,0
55967,55967,0
55968,55968,0


In [123]:
df_result['open_flag'].value_counts()

0    51048
1     4922
Name: open_flag, dtype: int64

In [124]:
df_result.to_csv('submission_xgb_model2.csv', index=False)