In [157]:
#import all the required libraries

import os
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from patsy import dmatrices
from scipy.stats import randint
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

In [None]:
# Read the train and test CSV files directly from the project's GitHub repository.
train = pd.read_csv(
    "https://raw.githubusercontent.com/iamyumang/CustomerSuccessBox/master/Train_Set.csv"
)
test = pd.read_csv(
    "https://raw.githubusercontent.com/iamyumang/CustomerSuccessBox/master/Test_Set.csv"
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Quick descriptive statistics of the training dataset.
train.describe()

In [None]:
# Convert the week column to datetimes.
train['week_start_date'] = pd.to_datetime(train['week_start_date'])

# Count of unique values: first the dates, then the accounts.
for key_column in ('week_start_date', 'account_id'):
    print(train[key_column].nunique())

In [None]:
# Label distribution among rows flagged by DataFrame.duplicated().
is_duplicate = train.duplicated()
duplicateDFRow = train.loc[is_duplicate]
print(duplicateDFRow["Label"].value_counts())


In [None]:
# Remove duplicated rows and renumber the index from zero.
train = train.drop_duplicates().reset_index(drop=True)

In [None]:
# Ratio of non-events (Label 0) to events (Label 1), plus a bar chart of the counts.
label_count = train["Label"].value_counts()
non_event_to_event = label_count[0] / label_count[1]
print('Proportion:', round(non_event_to_event, 4), ': 1')
label_count.plot(kind='bar', title='Count (Label)');

In [155]:
# Number of distinct values for the columns at positions 2..63.
# NOTE(review): this slice presumably selects the feature columns that follow
# account_id and week_start_date -- confirm against the CSV schema. It is only
# meaningful when run before any columns are removed from `train`.
col = train.columns[2:64].tolist()
for i in col:
    print(train[i].nunique())

52
28
123
7
191
26
19
178
550
162
1544
486
60
2


In [156]:
# Order rows by (account_id, week_start_date), then put the accounts with the
# most rows first. The mergesort argsort is stable, so the per-account date
# order is kept.
# 100583 has the maximum number of rows.
# This cell needs account_id and week_start_date to still be columns of `train`.

train = (
    train
    .set_index(['account_id', 'week_start_date'])
    .sort_index()
    .reset_index()
)
rows_per_account = train.groupby('account_id')['account_id'].transform('size')
largest_first = rows_per_account.mul(-1).argsort(kind='mergesort')
train = train.iloc[largest_first].reset_index(drop=True)
train.head()

KeyError: "None of ['account_id', 'week_start_date'] are in the columns"

In [None]:
# Per account: "sum" is the number of events (Label == 1 rows) and "count" is
# the number of Label values for that account.
account_id_count = (
    train
    .groupby('account_id')['Label']
    .agg(['sum', 'count'])
    .reset_index(drop=False)
)
account_id_count.head()

In [None]:
# Group all rows by account_id and sum every numeric column, so that "Label"
# holds the number of events for each account.
# For example: account_id 100000 has 44 counts and 0 events. account_id 100004
# has 89 counts and 2 events (see the "Label" column).
#
# numeric_only=True keeps the datetime week_start_date column out of the sum;
# summing datetimes is not defined and raises on pandas >= 2.0.
# groupby returns the accounts sorted by account_id, so no extra sort is needed.
train_grouped = (
    train
    .groupby("account_id")
    .sum(numeric_only=True)
    .reset_index(drop=False)
)

# Attach the per-account row count by matching on account_id instead of
# relying on both frames happening to share the same row order.
train_grouped["count"] = train_grouped["account_id"].map(
    account_id_count.set_index("account_id")["count"]
)

In [None]:
# Preview the per-account aggregates.
train_grouped.head()

In [None]:
# Put the accounts with the fewest events first and renumber the rows.
train_grouped = (
    train_grouped
    .sort_values(by="Label", ascending=True)
    .reset_index(drop=True)
)
train_grouped.head()

In [None]:
# List of account_ids whose summed Label is 0, i.e. accounts that only ever
# have non-event rows.
# A boolean mask replaces the manual counter loop, so the result no longer
# depends on train_grouped having a 0..n-1 index.
has_no_event = train_grouped["Label"] == 0
account_list_non_event = train_grouped.loc[has_no_event, "account_id"].tolist()
        

In [None]:
len(account_list_non_event)     # number of accounts with only '0' labels (2363 when originally run)

In [None]:
# The dataset is highly imbalanced and biased towards "0". To reduce the
# imbalance, every account that has only "0" in the Label column is removed.

only_non_event = train['account_id'].isin(account_list_non_event)
train = train[~only_non_event]
train["Label"].value_counts()

In [None]:
# Number of remaining rows for each account.
train["account_id"].value_counts()

**Feature Selection by VIF (Variance Inflation Factor)**

In [None]:
# Build the design matrix for Label ~ feature1 + ... + feature62.
# The formula is generated rather than typed out so the 62 terms cannot drift
# out of sync or contain a typo.
vif_formula = "Label ~ " + " + ".join("feature{}".format(k) for k in range(1, 63))
outcome, predictors = dmatrices(vif_formula, train, return_type='dataframe')

# Calculate the VIF for each column of the design matrix.
design_matrix = predictors.values  # hoisted: the array is the same on every iteration
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(design_matrix, idx) for idx in range(design_matrix.shape[1])]
vif["features"] = predictors.columns
vif

**ANOVA TEST**

In [None]:
# One-way ANOVA of Label against each candidate feature in `col`.
# Every ANOVA table is printed and kept in anova_test_list; features whose
# p-value is above 0.05 are deleted from `train` and recorded in del_col_names.
anova_test_list = []
del_col_names = []
for i in col:
    mod = ols("Label" + '~' + i, data=train).fit()
    aov_table = sm.stats.anova_lm(mod, typ=2)
    print(aov_table)
    anova_test_list.append(aov_table)
    # .iloc[0] reads the first row explicitly by position. Plain [0] on this
    # label-indexed column relies on pandas' deprecated integer fallback.
    if aov_table["PR(>F)"].iloc[0] > 0.05:
        del train[i]                    # delete the variable: p-value > 0.05
        del_col_names.append(i)         # remember which variables were deleted

In [None]:
# Features kept for modelling.
new_col = [
    "feature4", "feature15", "feature16", "feature17",
    "feature22", "feature28", "feature31", "feature36",
    "feature37", "feature40", "feature41", "feature43",
    "feature46", "feature53", "feature55", "feature60",
]

**Correlation analysis between features**

In [None]:
# Annotated heatmap of the pairwise correlations between the selected features.
f, ax = plt.subplots(figsize=(15, 8))
selected_feature_corr = train.loc[:, new_col].corr()
sns.heatmap(selected_feature_corr, annot=True)

In [None]:
# Variable reduction: drop feature22 together with the identifier and date
# columns, which are not used as model inputs.
# NOTE(review): feature22 is presumably removed because of the correlation
# heatmap -- confirm.
# `train` is rebound instead of mutated in place, and errors="ignore" makes the
# cell safe to re-run once the columns are already gone.
train = train.drop(columns=["feature22", "account_id", "week_start_date"], errors="ignore")

# Splitting into training and testing set 

In [None]:
# Features / target, then a stratified 80/20 train/test split.
# train_test_split is imported in the notebook's import cell.
X = train.drop(columns=["Label"])
y = train["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69, stratify=y)
# summarize the class counts in each split
train_0, train_1 = int((y_train == 0).sum()), int((y_train == 1).sum())
test_0, test_1 = int((y_test == 0).sum()), int((y_test == 1).sum())
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))
X_train.shape, X_test.shape, y_train.shape, y_test.shape, X.shape

# LOGISTIC REGRESSION WITHOUT SAMPLING

In [None]:
# Hyper-parameter grid: 15 values of C, log-spaced between 1e-5 and 1e8.
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# Logistic regression classifier tuned with 5-fold cross-validation.
LR = LogisticRegression()
LR_cv = GridSearchCV(LR, param_grid, cv=5)

# Tune on the training split only. Fitting the search on the full X, y lets
# the held-out test rows influence the chosen C (data leakage).
LR_cv.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(LR_cv.best_params_))
print("Best score is {}".format(LR_cv.best_score_))

In [None]:
# Logistic regression with the tuned C, evaluated on the held-out split.
LR = LogisticRegression(C=0.4393970560760795).fit(X_train, y_train)
LR_pred = LR.predict(X_test)
LR_proba = LR.predict_proba(X_test)[:, 1]   # probability of the positive class

# Confusion matrix as an Actual x Predicted cross-tabulation.
CM = pd.crosstab(pd.Series(y_test.values, name="Actual"), pd.Series(LR_pred, name="Predicted"))
print(CM)
print("accuracy_score:", accuracy_score(y_test, LR_pred)*100)

print("f1_score:", f1_score(y_test, LR_pred))
# ROC AUC needs scores, not hard 0/1 predictions; labels collapse the ROC curve
# to a single operating point.
print("roc_auc_score:", roc_auc_score(y_test, LR_proba))
print(classification_report(y_test, LR_pred))

# RF WITHOUT SAMPLING

In [None]:
# Hyper-parameter tuning for the random forest (randomized search, 5-fold CV).

n_estimators = [50, 100, 150, 200, 500]

# "auto" is left out: for classifiers it meant the same as "sqrt", so it only
# duplicated a candidate, and newer scikit-learn releases reject it.
max_features = ["sqrt", "log2"]
max_depth = [1, 20, 30, 40, 50, None]
min_samples_split = [2, 5, 10, 15, 20]
min_samples_leaf = [2, 5, 1, 15]

grid_param = {
    'n_estimators': n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
}

RF_model = RandomForestClassifier(random_state=1)
RF_random = RandomizedSearchCV(estimator=RF_model, param_distributions=grid_param, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)
# Fit the randomized search to the training split
RF_random.fit(X_train, y_train)
print(RF_random.best_params_)

In [None]:
# Random forest with fixed hyper-parameters, evaluated on the held-out split.
# random_state makes the forest, and therefore the reported metrics, repeatable.
RF_model = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=10, min_samples_leaf=2, max_features='sqrt', random_state=1).fit(X_train, y_train)
RF_pred = RF_model.predict(X_test)
RF_proba = RF_model.predict_proba(X_test)[:, 1]   # probability of the positive class

# Confusion matrix as an Actual x Predicted cross-tabulation.
CM = pd.crosstab(pd.Series(y_test.values, name="Actual"), pd.Series(RF_pred, name="Predicted"))
print(CM)
print("accuracy_score:", accuracy_score(y_test, RF_pred)*100)

print("f1_score:", f1_score(y_test, RF_pred))
# ROC AUC needs scores, not hard 0/1 predictions.
print("roc_auc_score:", roc_auc_score(y_test, RF_proba))
print(classification_report(y_test, RF_pred))

# XGBOOST WITHOUT SAMPLING

In [None]:
# XGBoost trained on the unsampled training split and evaluated on the held-out split.
XGB = XGBClassifier(max_depth=4, learning_rate=0.2, n_estimators=200, min_child_weight=1, random_state=18)
XGB.fit(X_train, y_train)
XGB_pred = XGB.predict(X_test)
XGB_proba = XGB.predict_proba(X_test)[:, 1]   # probability of the positive class

# Confusion matrix as an Actual x Predicted cross-tabulation.
CM = pd.crosstab(pd.Series(y_test.values, name="Actual"), pd.Series(XGB_pred, name="Predicted"))
print(CM)
print("accuracy_score:", accuracy_score(y_test, XGB_pred)*100)

print("f1_score:", f1_score(y_test, XGB_pred))
# ROC AUC needs scores, not hard 0/1 predictions.
print("roc_auc_score:", roc_auc_score(y_test, XGB_proba))
print(classification_report(y_test, XGB_pred))

# DOWN SAMPLING 

In [None]:
# Random under-sampling: keep every event row and draw an equally sized random
# subset of the non-event rows (without replacement).
minority_class_indices = train[train["Label"] == 1].index
majority_class_indices = train[train["Label"] == 0].index
minority_class_len = len(minority_class_indices)

# A seeded generator makes the draw, and every metric computed from it,
# reproducible across runs.
downsample_rng = np.random.RandomState(42)
random_majority_indices = downsample_rng.choice(majority_class_indices, minority_class_len, replace=False)

under_sample_indices = np.concatenate([minority_class_indices, random_majority_indices])
under_sample = train.loc[under_sample_indices]

In [None]:
# Class counts of the under-sampled frame.
under_sample["Label"].value_counts()

In [None]:
# Features / target from the under-sampled frame, then a stratified 80/20 split.
X = under_sample.drop(columns=["Label"])
y = under_sample["Label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=36, stratify=y
)
# summarize the class counts in each split
train_0, train_1 = int((y_train == 0).sum()), int((y_train == 1).sum())
test_0, test_1 = int((y_test == 0).sum()), int((y_test == 1).sum())
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

# LOGISTIC REGRESSION DOWNSAMPLING

In [None]:
# Logistic regression trained and evaluated on the under-sampled split.
LR_DS = LogisticRegression().fit(X_train, y_train)
LR_DS_pred = LR_DS.predict(X_test)
LR_DS_proba = LR_DS.predict_proba(X_test)[:, 1]   # probability of the positive class

# Confusion matrix as an Actual x Predicted cross-tabulation.
CM = pd.crosstab(pd.Series(y_test.values, name="Actual"), pd.Series(LR_DS_pred, name="Predicted"))
print(CM)
print("accuracy_score:", accuracy_score(y_test, LR_DS_pred)*100)

print("f1_score:", f1_score(y_test, LR_DS_pred))
# ROC AUC needs scores, not hard 0/1 predictions.
print("roc_auc_score:", roc_auc_score(y_test, LR_DS_proba))
print(classification_report(y_test, LR_DS_pred))

# RANDOM FOREST DOWNSAMPLING

In [None]:
# Random forest trained and evaluated on the under-sampled split.
# random_state makes the forest, and therefore the reported metrics, repeatable.
RF_DS = RandomForestClassifier(n_estimators=500, max_depth=30, min_samples_split=10, min_samples_leaf=1, max_features='log2', random_state=1).fit(X_train, y_train)
RF_DS_pred = RF_DS.predict(X_test)
RF_DS_proba = RF_DS.predict_proba(X_test)[:, 1]   # probability of the positive class

# Confusion matrix as an Actual x Predicted cross-tabulation.
CM = pd.crosstab(pd.Series(y_test.values, name="Actual"), pd.Series(RF_DS_pred, name="Predicted"))
print(CM)
print("accuracy_score:", accuracy_score(y_test, RF_DS_pred)*100)

print("f1_score:", f1_score(y_test, RF_DS_pred))
# ROC AUC needs scores, not hard 0/1 predictions.
print("roc_auc_score:", roc_auc_score(y_test, RF_DS_proba))
print(classification_report(y_test, RF_DS_pred))

# XGBOOST DOWNSAMPLING

In [None]:
# XGBoost with default settings, trained and evaluated on the under-sampled split.
XGB_DS = XGBClassifier()
XGB_DS.fit(X_train, y_train)
XGB_DS_pred = XGB_DS.predict(X_test)
XGB_DS_proba = XGB_DS.predict_proba(X_test)[:, 1]   # probability of the positive class

# Confusion matrix as an Actual x Predicted cross-tabulation.
CM = pd.crosstab(pd.Series(y_test.values, name="Actual"), pd.Series(XGB_DS_pred, name="Predicted"))
print(CM)
print("accuracy_score:", accuracy_score(y_test, XGB_DS_pred)*100)
print("f1_score:", f1_score(y_test, XGB_DS_pred))
# ROC AUC needs scores, not hard 0/1 predictions.
print("roc_auc_score:", roc_auc_score(y_test, XGB_DS_proba))
print(classification_report(y_test, XGB_DS_pred))

 

# UP SAMPLING USING SMOTE

In [None]:
# Back to the full (unsampled) frame: features / target and a stratified 75/25 split.
X = train.drop(columns=["Label"])
y = train["Label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=3, stratify=y
)
# summarize the class counts in each split
train_0, train_1 = int((y_train == 0).sum()), int((y_train == 1).sum())
test_0, test_1 = int((y_test == 0).sum()), int((y_test == 1).sum())
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

In [None]:
# Class balance of the training split before over-sampling.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# SMOTE is already imported in the notebook's import cell.
# The sampler is bound to `smote`, not `sm`: `sm` is the statsmodels alias used
# by the ANOVA cell, and overwriting it breaks that cell on re-run.
smote = SMOTE(random_state=2)
# fit_resample is the supported method name; fit_sample is no longer provided
# by recent imbalanced-learn releases. Only the training split is over-sampled.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train.to_numpy())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

# LR UPSAMPLING

In [None]:
# Logistic regression trained on the SMOTE-resampled training data and
# evaluated on the untouched test split.
LR_UP = LogisticRegression().fit(X_train_res, y_train_res)
LR_UP_pred = LR_UP.predict(X_test)
LR_UP_proba = LR_UP.predict_proba(X_test)[:, 1]   # probability of the positive class

# Confusion matrix as an Actual x Predicted cross-tabulation.
CM = pd.crosstab(pd.Series(y_test.values, name="Actual"), pd.Series(LR_UP_pred, name="Predicted"))
print(CM)
print("accuracy_score:", accuracy_score(y_test, LR_UP_pred)*100)

print("f1_score:", f1_score(y_test, LR_UP_pred))
# ROC AUC needs scores, not hard 0/1 predictions.
print("roc_auc_score:", roc_auc_score(y_test, LR_UP_proba))
print(classification_report(y_test, LR_UP_pred))

# RF UPSAMPLING

In [None]:
# Random forest trained on the SMOTE-resampled training data and evaluated on
# the untouched test split.
# random_state makes the forest, and therefore the reported metrics, repeatable.
RF_UP_model = RandomForestClassifier(n_estimators=500, max_depth=30, min_samples_split=10, min_samples_leaf=1, max_features='log2', random_state=1).fit(X_train_res, y_train_res)
RF_UP_pred = RF_UP_model.predict(X_test)
RF_UP_proba = RF_UP_model.predict_proba(X_test)[:, 1]   # probability of the positive class

# Confusion matrix as an Actual x Predicted cross-tabulation.
CM = pd.crosstab(pd.Series(y_test.values, name="Actual"), pd.Series(RF_UP_pred, name="Predicted"))
print(CM)
print("accuracy_score:", accuracy_score(y_test, RF_UP_pred)*100)

print("f1_score:", f1_score(y_test, RF_UP_pred))
# ROC AUC needs scores, not hard 0/1 predictions.
print("roc_auc_score:", roc_auc_score(y_test, RF_UP_proba))
print(classification_report(y_test, RF_UP_pred))

# Predict on the test dataset using XGBoost (trained without sampling), because it has the better precision and recall scores.


In [None]:
# Work on a copy so the original `test` frame stays untouched.
df_test= test.copy()

In [None]:
# Columns that remain in `train` after feature selection (still including "Label").
# NOTE: this rebinds `col`, which earlier held the list of candidate feature
# names iterated by the ANOVA cell.
col = train.columns

In [None]:
# Keep the same columns, in the same order, as the training frame, then
# separate the target from the features.
df_test = df_test[col].drop(columns=["Label"])
Label = test["Label"]
df_test.head()

In [None]:
# Score the provided test set with the XGBoost model fitted without sampling.
XGB_pred = XGB.predict(df_test)
XGB_test_proba = XGB.predict_proba(df_test)[:, 1]   # probability of the positive class

# Confusion matrix as an Actual x Predicted cross-tabulation.
CM = pd.crosstab(pd.Series(Label.values, name="Actual"), pd.Series(XGB_pred, name="Predicted"))
print(CM)
print("accuracy_score:", accuracy_score(Label, XGB_pred)*100)

print("f1_score:", f1_score(Label, XGB_pred))
# ROC AUC needs scores, not hard 0/1 predictions.
print("roc_auc_score:", roc_auc_score(Label, XGB_test_proba))
print(classification_report(Label, XGB_pred))

In [None]:
# Probability-based ROC AUC on the test set, followed by the precision-recall curve.
# NOTE(review): plot_precision_recall_curve is not available in recent
# scikit-learn releases (PrecisionRecallDisplay.from_estimator replaces it);
# confirm the installed version before re-running.
y_pred_proba = XGB.predict_proba(df_test)
positive_class_scores = y_pred_proba[:, 1]
print(roc_auc_score(Label, positive_class_scores))
plot_precision_recall_curve(XGB, df_test, Label)

In [None]:
# One-column frame holding the predicted labels for the test set.
Final_label = pd.DataFrame(XGB_pred, columns=["predicted_label"])

In [None]:
# Place each test account_id next to its predicted label (aligned on the row index).
output_columns = [test["account_id"], Final_label["predicted_label"]]
final_output = pd.concat(output_columns, axis=1)

In [None]:
# Display the account_id / predicted_label table.
final_output