following modules installed

conda install -c conda-forge imbalanced-learn - for balancing  
conda install -c conda-forge xgboost - Xgboost classifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from time import time
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, f1_score
from sklearn.metrics import fbeta_score, accuracy_score, make_scorer, mean_squared_error
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from math import sqrt
import itertools

# pd.set_option('display.max.columns', None)
# pd.set_option('display.max.rows', None)

In [2]:
pwd

'/Users/chandrakanth/neuefischer/capstone-ZEIT-2020-ds/02_ml_model'

In [3]:
df = pd.read_csv('/Users/chandrakanth/neuefischer/capstone-ZEIT-2020-ds/00_data/f_chtr_churn_traintable_nf.csv')

In [4]:
df.shape

(209043, 171)

In [5]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,auftrag_new_id,liefer_beginn_evt,kanal,objekt_name,aboform_name,zahlung_rhythmus_name,lesedauer,rechnungsmonat,zahlung_weg_name,...,openrate_zeitbrief_1w,clickrate_zeitbrief_1w,openrate_zeitbrief_1m,clickrate_zeitbrief_1m,openrate_zeitbrief_3m,clickrate_zeitbrief_3m,training_set,kuendigungs_eingangs_datum,churn,date_x
0,0,6BE5B590-07EA-4543-B422-F7ABB2AF9464,2013-10-02,andere,ZEIT Digital,Probeabo,jährlich,68,0,Bankeinzug,...,0.5,0.0,1.0,0.0,0.93,0.0,1,,0,2019-06-18 00:00:00
1,1,2271CE79-EAC2-42EE-9702-BF5CCCF4E1BE,2013-10-02,andere,ZEIT Digital,Probeabo,jährlich,68,0,Bankeinzug,...,0.5,0.0,1.0,0.0,0.93,0.0,1,,0,2019-06-18 00:00:00


In [6]:
df = df.drop("Unnamed: 0", axis=1)
df = df.drop("auftrag_new_id", axis=1)
df = df.drop("kuendigungs_eingangs_datum" , axis=1)
df = df.drop("avg_churn" , axis=1)
df = df.drop("training_set" , axis=1)

In [7]:
df.isna().sum().sort_values().tail()

opened_anzahl_3m      0
clicked_anzahl_1w     0
date_x                0
email_am_kunden      12
ort                  85
dtype: int64

In [8]:
Nan_subset = ['ort', 'email_am_kunden']
df.dropna(subset=Nan_subset, inplace=True)

In [9]:
df.shape

(208958, 166)

In [10]:
# declaring categorical varibale
category_features = ['kanal', 'objekt_name', 'aboform_name', 'zahlung_rhythmus_name',
                    'zahlung_weg_name', 'land_iso_code','anrede', 'titel','email_am_kunden']
df[category_features] = df[category_features].astype("category")

In [11]:
# columns with dates convert to datetype
df['liefer_beginn_evt']= pd.to_datetime(df['liefer_beginn_evt'])
df['abo_registrierung_min']= pd.to_datetime(df['abo_registrierung_min'])
df['nl_registrierung_min']= pd.to_datetime(df['nl_registrierung_min'])
df['date_x']= pd.to_datetime(df['date_x'])
date_features = ['liefer_beginn_evt', 'abo_registrierung_min', 'nl_registrierung_min',
                'kuendigungs_eingangs_datum']

In [12]:
df['liefer_beginn_evt_year'] = df['liefer_beginn_evt'].dt.strftime('%Y')
df['liefer_beginn_evt_month'] = df['liefer_beginn_evt'].dt.strftime('%m')
df['abo_registrierung_min_year'] = df['abo_registrierung_min'].dt.strftime('%Y')
df['abo_registrierung_min_month'] = df['abo_registrierung_min'].dt.strftime('%m')
df['nl_registrierung_min_year'] = df['nl_registrierung_min'].dt.strftime('%Y')
df['nl_registrierung_min_month'] = df['nl_registrierung_min'].dt.strftime('%m')
df['date_x_year'] = df['date_x'].dt.strftime('%Y')
df['date_x_month'] = df['date_x'].dt.strftime('%m')

In [13]:
df = df.drop("liefer_beginn_evt", axis=1)
df = df.drop("abo_registrierung_min", axis=1)
df = df.drop("nl_registrierung_min" , axis=1)
df = df.drop("date_x", axis=1)
df = df.drop("plz_1", axis=1)
df = df.drop("plz_2", axis=1)
df = df.drop("ort", axis=1)

In [14]:
df.drop(index=df[df['plz_3'] == 'xx'].index, inplace=True)
df['plz_3'] = df['plz_3'].astype("int")

In [15]:
# List of varibale for dummies
date_features = ['liefer_beginn_evt_year','liefer_beginn_evt_month','abo_registrierung_min_year',
                 'abo_registrierung_min_month','nl_registrierung_min_year','nl_registrierung_min_month',
                 'date_x_year','date_x_month']

In [16]:
dummy_df1 = pd.get_dummies(df[category_features], drop_first=True)

In [17]:
dummy_df2 = pd.get_dummies(df[date_features], drop_first=True)

In [18]:
df = df.drop(category_features,axis=1)

In [19]:
df = df.drop(date_features,axis=1)

In [20]:
df_model = pd.concat([df, dummy_df1,dummy_df2], axis=1)

In [21]:
df_model.info(verbose=1, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 185531 entries, 0 to 209042
Data columns (total 315 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   lesedauer                              185531 non-null  int64  
 1   rechnungsmonat                         185531 non-null  int64  
 2   studentenabo                           185531 non-null  int64  
 3   plz_3                                  185531 non-null  int64  
 4   metropole                              185531 non-null  int64  
 5   shop_kauf                              185531 non-null  int64  
 6   unterbrechung                          185531 non-null  int64  
 7   zon_che_opt_in                         185531 non-null  int64  
 8   zon_sit_opt_in                         185531 non-null  int64  
 9   zon_zp_grey                            185531 non-null  int64  
 10  zon_premium                            185531 non-null 

In [22]:
X = df_model.drop('churn',axis=1)
y = df_model['churn']

In [23]:
pd.set_option('display.max.columns', None)
X.head()

Unnamed: 0,lesedauer,rechnungsmonat,studentenabo,plz_3,metropole,shop_kauf,unterbrechung,zon_che_opt_in,zon_sit_opt_in,zon_zp_grey,zon_premium,zon_boa,zon_kommentar,zon_sonstige,zon_zp_red,zon_rawr,zon_community,zon_app_sonstige,zon_schach,zon_blog_kommentare,zon_quiz,cnt_abo,cnt_abo_diezeit,cnt_abo_diezeit_digital,cnt_abo_magazin,cnt_umwandlungsstatus2_dkey,nl_zeitbrief,nl_zeitshop,nl_zeitverlag_hamburg,nl_fdz_organisch,nl_blacklist_sum,nl_bounced_sum,nl_aktivitaet,nl_sperrliste_sum,nl_opt_in_sum,boa_reg,che_reg,sit_reg,sso_reg,received_anzahl_1w,received_anzahl_1m,received_anzahl_3m,received_anzahl_6m,opened_anzahl_1w,opened_anzahl_1m,opened_anzahl_3m,openedanzahl_6m,clicked_anzahl_1w,clicked_anzahl_1m,clicked_anzahl_3m,clicked_anzahl_6m,unsubscribed_anzahl_1w,unsubscribed_anzahl_1m,unsubscribed_anzahl_3m,unsubscribed_anzahl_6m,openrate_1w,clickrate_1w,openrate_1m,clickrate_1m,openrate_3m,clickrate_3m,received_anzahl_bestandskunden_1w,received_anzahl_bestandskunden_1m,received_anzahl_bestandskunden_3m,received_anzahl_bestandskunden_6m,opened_anzahl_bestandskunden_1w,opened_anzahl_bestandskunden_1m,opened_anzahl_bestandskunden_3m,openedanzahl_bestandskunden_6m,clicked_anzahl_bestandskunden_1w,clicked_anzahl_bestandskunden_1m,clicked_anzahl_bestandskunden_3m,clicked_anzahl_bestandskunden_6m,unsubscribed_anzahl_bestandskunden_1w,unsubscribed_anzahl_bestandskunden_1m,unsubscribed_anzahl_bestandskunden_3m,unsubscribed_anzahl_bestandskunden_6m,openrate_bestandskunden_1w,clickrate_bestandskunden_1w,openrate_bestandskunden_1m,clickrate_bestandskunden_1m,openrate_bestandskunden_3m,clickrate_bestandskunden_3m,received_anzahl_produktnews_1w,received_anzahl_produktnews_1m,received_anzahl_produktnews_3m,received_anzahl_produktnews_6m,opened_anzahl_produktnews_1w,opened_anzahl_produktnews_1m,opened_anzahl_produktnews_3m,openedanzahl_produktnews_6m,clicked_anzahl_produktnews_1w,clicked_anzahl_produktnews_1m,clicked_anzahl_produktnews_3m,clicked_anzahl_produktnews_6m,unsubscribed_anzahl_produktnews_1w,unsubscribed_anzahl_produktnews_1m,unsubscribed_anzahl_produktnews_3m,unsubscribed_anzahl_produktnews_6m,openrate_produktnews_1w,clickrate_produktnews_1w,openrate_produktnews_1m,clickrate_produktnews_1m,openrate_produktnews_3m,clickrate_produktnews_3m,received_anzahl_hamburg_1w,received_anzahl_hamburg_1m,received_anzahl_hamburg_3m,received_anzahl_hamburg_6m,opened_anzahl_hamburg_1w,opened_anzahl_hamburg_1m,opened_anzahl_hamburg_3m,openedanzahl_hamburg_6m,clicked_anzahl_hamburg_1w,clicked_anzahl_hamburg_1m,clicked_anzahl_hamburg_3m,clicked_anzahl_hamburg_6m,unsubscribed_anzahl_hamburg_1w,unsubscribed_anzahl_hamburg_1m,unsubscribed_anzahl_hamburg_3m,unsubscribed_anzahl_hamburg_6m,openrate_hamburg_1w,clickrate_hamburg_1w,openrate_hamburg_1m,clickrate_hamburg_1m,openrate_hamburg_3m,clickrate_hamburg_3m,received_anzahl_zeitbrief_1w,received_anzahl_zeitbrief_1m,received_anzahl_zeitbrief_3m,received_anzahl_zeitbrief_6m,opened_anzahl_zeitbrief_1w,opened_anzahl_zeitbrief_1m,opened_anzahl_zeitbrief_3m,openedanzahl_zeitbrief_6m,clicked_anzahl_zeitbrief_1w,clicked_anzahl_zeitbrief_1m,clicked_anzahl_zeitbrief_3m,clicked_anzahl_zeitbrief_6m,unsubscribed_anzahl_zeitbrief_1w,unsubscribed_anzahl_zeitbrief_1m,unsubscribed_anzahl_zeitbrief_3m,unsubscribed_anzahl_zeitbrief_6m,openrate_zeitbrief_1w,clickrate_zeitbrief_1w,openrate_zeitbrief_1m,clickrate_zeitbrief_1m,openrate_zeitbrief_3m,clickrate_zeitbrief_3m,kanal_B2B,kanal_E-Mailing,kanal_Eigenwerbung,kanal_Fremdwerbung,kanal_Mailings,kanal_SEA,kanal_Standwerbung,kanal_Telefonmarketing,kanal_andere,objekt_name_DIE ZEIT - CHRIST & WELT,objekt_name_ZEIT Digital,aboform_name_Geschenkabo,aboform_name_Negative Option,aboform_name_Probeabo,aboform_name_Prämienabo,zahlung_rhythmus_name_halbjährlich,zahlung_rhythmus_name_jährlich,zahlung_rhythmus_name_monatlich,zahlung_rhythmus_name_vierteljährlich,zahlung_rhythmus_name_zweijährlich,zahlung_weg_name_Kreditkarte,zahlung_weg_name_PayPal,zahlung_weg_name_Rechnung,land_iso_code_CH,land_iso_code_DE,land_iso_code_andere,anrede_Frau,anrede_Herr,anrede_unbekannt,titel_kein Titel,titel_sonstiger Titel,email_am_kunden_1.0,liefer_beginn_evt_year_2014,liefer_beginn_evt_year_2015,liefer_beginn_evt_year_2016,liefer_beginn_evt_year_2017,liefer_beginn_evt_year_2018,liefer_beginn_evt_year_2019,liefer_beginn_evt_month_02,liefer_beginn_evt_month_03,liefer_beginn_evt_month_04,liefer_beginn_evt_month_05,liefer_beginn_evt_month_06,liefer_beginn_evt_month_07,liefer_beginn_evt_month_08,liefer_beginn_evt_month_09,liefer_beginn_evt_month_10,liefer_beginn_evt_month_11,liefer_beginn_evt_month_12,abo_registrierung_min_year_1958,abo_registrierung_min_year_1959,abo_registrierung_min_year_1960,abo_registrierung_min_year_1961,abo_registrierung_min_year_1962,abo_registrierung_min_year_1963,abo_registrierung_min_year_1964,abo_registrierung_min_year_1965,abo_registrierung_min_year_1966,abo_registrierung_min_year_1967,abo_registrierung_min_year_1968,abo_registrierung_min_year_1969,abo_registrierung_min_year_1970,abo_registrierung_min_year_1971,abo_registrierung_min_year_1972,abo_registrierung_min_year_1973,abo_registrierung_min_year_1974,abo_registrierung_min_year_1975,abo_registrierung_min_year_1976,abo_registrierung_min_year_1977,abo_registrierung_min_year_1978,abo_registrierung_min_year_1979,abo_registrierung_min_year_1980,abo_registrierung_min_year_1981,abo_registrierung_min_year_1982,abo_registrierung_min_year_1983,abo_registrierung_min_year_1984,abo_registrierung_min_year_1985,abo_registrierung_min_year_1986,abo_registrierung_min_year_1987,abo_registrierung_min_year_1988,abo_registrierung_min_year_1989,abo_registrierung_min_year_1990,abo_registrierung_min_year_1991,abo_registrierung_min_year_1992,abo_registrierung_min_year_1993,abo_registrierung_min_year_1994,abo_registrierung_min_year_1995,abo_registrierung_min_year_1996,abo_registrierung_min_year_1997,abo_registrierung_min_year_1998,abo_registrierung_min_year_1999,abo_registrierung_min_year_2000,abo_registrierung_min_year_2001,abo_registrierung_min_year_2002,abo_registrierung_min_year_2003,abo_registrierung_min_year_2004,abo_registrierung_min_year_2005,abo_registrierung_min_year_2006,abo_registrierung_min_year_2007,abo_registrierung_min_year_2008,abo_registrierung_min_year_2009,abo_registrierung_min_year_2010,abo_registrierung_min_year_2011,abo_registrierung_min_year_2012,abo_registrierung_min_year_2013,abo_registrierung_min_year_2014,abo_registrierung_min_year_2015,abo_registrierung_min_year_2016,abo_registrierung_min_year_2017,abo_registrierung_min_year_2018,abo_registrierung_min_year_2019,abo_registrierung_min_month_02,abo_registrierung_min_month_03,abo_registrierung_min_month_04,abo_registrierung_min_month_05,abo_registrierung_min_month_06,abo_registrierung_min_month_07,abo_registrierung_min_month_08,abo_registrierung_min_month_09,abo_registrierung_min_month_10,abo_registrierung_min_month_11,abo_registrierung_min_month_12,nl_registrierung_min_year_2001,nl_registrierung_min_year_2002,nl_registrierung_min_year_2003,nl_registrierung_min_year_2004,nl_registrierung_min_year_2005,nl_registrierung_min_year_2006,nl_registrierung_min_year_2007,nl_registrierung_min_year_2008,nl_registrierung_min_year_2009,nl_registrierung_min_year_2010,nl_registrierung_min_year_2011,nl_registrierung_min_year_2012,nl_registrierung_min_year_2013,nl_registrierung_min_year_2014,nl_registrierung_min_year_2015,nl_registrierung_min_year_2016,nl_registrierung_min_year_2017,nl_registrierung_min_year_2018,nl_registrierung_min_year_2019,nl_registrierung_min_year_2020,nl_registrierung_min_month_02,nl_registrierung_min_month_03,nl_registrierung_min_month_04,nl_registrierung_min_month_05,nl_registrierung_min_month_06,nl_registrierung_min_month_07,nl_registrierung_min_month_08,nl_registrierung_min_month_09,nl_registrierung_min_month_10,nl_registrierung_min_month_11,nl_registrierung_min_month_12,date_x_year_2020,date_x_month_02,date_x_month_03,date_x_month_04,date_x_month_05,date_x_month_06,date_x_month_07,date_x_month_08,date_x_month_09,date_x_month_10,date_x_month_11,date_x_month_12
0,68,0,0,647,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,0,2,2,2,2,0,0,0,1,14,0,0,0,0,0,1,6,14,43,80,4,15,38,74,1,2,2,2,0,0,0,0,0.67,0.25,1.07,0.13,0.88,0.05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,2,5,14,26,1,5,13,25,0,0,0,0,0,0,0,0,0.5,0.0,1.0,0.0,0.93,0.0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,68,0,0,647,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,0,2,2,2,2,0,0,0,1,14,0,0,0,0,0,1,6,14,43,80,4,15,38,74,1,2,2,2,0,0,0,0,0.67,0.25,1.07,0.13,0.88,0.05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,2,5,14,26,1,5,13,25,0,0,0,0,0,0,0,0,0.5,0.0,1.0,0.0,0.93,0.0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,68,0,0,647,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,0,2,2,2,2,0,0,0,1,14,0,0,0,0,0,1,6,14,43,80,4,15,38,74,1,2,2,2,0,0,0,0,0.67,0.25,1.07,0.13,0.88,0.05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,2,5,14,26,1,5,13,25,0,0,0,0,0,0,0,0,0.5,0.0,1.0,0.0,0.93,0.0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,72,0,0,551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,14,0,0,0,0,0,1,11,35,95,183,9,20,29,46,0,1,1,1,0,0,0,0,0.82,0.0,0.57,0.05,0.31,0.03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,3,6,14,27,3,4,5,9,0,0,0,0,0,0,0,0,1.0,0.0,0.67,0.0,0.36,0.0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
5,75,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,16,0,0,0,0,0,1,9,33,83,178,1,3,8,31,0,0,0,1,0,0,0,0,0.11,0.0,0.09,0.0,0.1,0.0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,2,5,13,27,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0


In [24]:
# Splitting the data 
RSEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = RSEED)

In [25]:
pipe_lr = Pipeline([('minmax', MinMaxScaler()), ('lr', LogisticRegression())])

In [26]:
pipe_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('minmax', MinMaxScaler()), ('lr', LogisticRegression())])

In [27]:
score = pipe_lr.score(X_test, y_test)
print("logistic regression pipeline accuracy =", score)

logistic regression pipeline accuracy = 0.7411230522980433


In [28]:
# defining a function for prediction
def predict(X_train, X_test, y_train, y_test, model):
    
    '''
    inputs:
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
       - model: the model algorithm to be trained and predicted on
    '''
    
    results = {}
    
    # Fit the learner to the training data 
    start = time() # Get start time
    model = model.fit(X_train ,y_train)
    end = time() # Get end time
      
    # Calculate the training time
    results['train_time'] = end - start
        
    # Get the predictions on the test set and training set,
    start = time() # Get start time
    predictions_test = model.predict(X_test)
    predictions_train = model.predict(X_train)
    predictions_test_prob = model.predict_proba(X_test)
    predictions_train_prob = model.predict_proba(X_train)
    end = time() # Get end time
    
    '''
    Evaluation through different parameters
    '''
    
    # Calculate the total prediction time
    results['pred_time'] = end - start
    
    # Compute accuracy on the train set
    results['acc_train'] = accuracy_score(y_train,predictions_train)
        
    # Compute accuracy on test set
    results['acc_test'] = accuracy_score(y_test,predictions_test)
    
    # Compute Precision_score on the train set
    results['Precision_train'] = precision_score(y_train, predictions_train)
    
    # Compute Precision_score on the test set
    results['Precision_test'] = precision_score(y_test, predictions_test)
    
    # Compute Recall_score on the train set
    results['Recall_train'] = recall_score(y_train ,predictions_train)
    
    # Compute Recall_score on the test set
    results['Recall_test'] = recall_score(y_test, predictions_test)
    
    # Final results
    print ("{} trained .".format(model.__class__.__name__))
    
    # Return the results
    return results

In [29]:
# Initialize the two models
model_A = GaussianNB(var_smoothing=1e-09)

model_B = RandomForestClassifier(n_estimators=500, min_samples_split = 2, 
                               max_leaf_nodes = 50, max_depth = 25, 
                               bootstrap = True, max_features = 'auto',   
                               n_jobs=-1, verbose = 1, random_state=RSEED)

model_C = XGBClassifier(n_estimators = 200, gamma = 100, 
                      learning_rate = 0.01, max_depth = 12, booster = 'gbtree',
                      scale_pos_weight = 1.5, objective='binary:logistic')

model_D = LogisticRegression()


results = {}
for model in [model_A, model_B, model_C, model_D]:
    model_name = model.__class__.__name__
    results[model_name] = {}
    results[model_name] = \
    predict(X_train, X_test, y_train, y_test, model)

GaussianNB trained .


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   18.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.9s finis

RandomForestClassifier trained .
XGBClassifier trained .


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression trained .


In [30]:
# Displaying the results of predictions
for i in results.items():
    print (i[0])
    display(pd.DataFrame.from_dict(i[1], orient='index').rename(columns={0:'uncleaned data'}))

GaussianNB


Unnamed: 0,uncleaned data
train_time,1.260647
pred_time,3.244216
acc_train,0.528466
acc_test,0.527717
Precision_train,0.377679
Precision_test,0.376286
Recall_train,0.803741
Recall_test,0.799061


RandomForestClassifier


Unnamed: 0,uncleaned data
train_time,18.471453
pred_time,3.409663
acc_train,0.725747
acc_test,0.726211
Precision_train,0.769314
Precision_test,0.771344
Recall_train,0.165075
Recall_test,0.164433


XGBClassifier


Unnamed: 0,uncleaned data
train_time,194.963103
pred_time,4.263537
acc_train,0.744033
acc_test,0.740845
Precision_train,0.600985
Precision_test,0.594102
Recall_train,0.519379
Recall_test,0.514402


LogisticRegression


Unnamed: 0,uncleaned data
train_time,5.711427
pred_time,0.640368
acc_train,0.710671
acc_test,0.710221
Precision_train,0.647604
Precision_test,0.641822
Recall_train,0.146889
Recall_test,0.144915


In [64]:
# function for plotting the results and recall scores
def evaluate(results):
    """
    Visualization code to display results of various learners.
    
    inputs:
      - learners: a list of supervised learners
      - stats: a list of dictionaries of the statistic results from 'train_predict()'
      - accuracy: The score for the naive predictor
      - f1: The score for the naive predictor
    """
  
    # Create figure
    fig, ax = plt.subplots(2, 4, figsize = (12,10))

    # Constants
    bar_width = 0.3
    colors = ['#A00000','#00A0A0','#00A000','#00A000']
    
    # Super loop to plot four panels of data
    for k, model in enumerate(results.keys()):
        for j, metric in enumerate(['train_time', 'acc_train', 'Precision_train','Recall_train',
                                    'pred_time', 'acc_test', 'Precision_test','Recall_test']):
            for i in np.arange(4):
                
                # Creative plot code
                ax[j//4, j%4].bar(i+k*bar_width, results[model][i][metric], width = bar_width, color = colors[k])
                ax[j//4, j%4].set_xticks([0.45, 1.45, 2.45, 3.45])
                ax[j//4, j%4].set_xticklabels(["base model data"])
                ax[j//4, j%4].set_xlabel("Training Set Size")
                ax[j//4, j%4].set_xlim((-0.1, 3.0))
    
    ax[0, 0].bar(results[''])
    
    # Add unique y-labels
    ax[0, 0].set_ylabel("Time (in seconds)")
    ax[0, 1].set_ylabel("Accuracy Score")
    ax[0, 2].set_ylabel("Precision")
    ax[0, 3].set_ylabel("Recall")
    ax[1, 0].set_ylabel("Time (in seconds)")
    ax[1, 1].set_ylabel("Accuracy Score")
    ax[1, 2].set_ylabel("Precision")
    ax[1, 3].set_ylabel("Recall")
    
    # Add titles
    ax[0, 0].set_title("Model Training")
    ax[0, 1].set_title("Accuracy Score on Training Subset")
    ax[0, 2].set_title("Precision on Training Subset")
    ax[0, 3].set_title("Recall on Training Subset")
    ax[1, 0].set_title("Model Predicting")
    ax[1, 1].set_title("Accuracy Score on Testing Set")
    ax[1, 2].set_title("Precision on Testing Set")
    ax[1, 3].set_title("Recall on Testing Set")
    
    # Add horizontal lines for naive predictors
    #ax[0, 1].axhline(y = acuuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    #ax[1, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    #ax[0, 2].axhline(y = precision, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    #ax[1, 2].axhline(y = precision, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    #ax[0, 3].axhline(y = recall, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    #ax[1, 3].axhline(y = precision, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    
    # Set y-limits for score panels
    ax[0, 1].set_ylim((0, 1))
    ax[0, 2].set_ylim((0, 1))
    ax[0, 3].set_ylim((0, 1))
    ax[1, 1].set_ylim((0, 1))
    ax[1, 2].set_ylim((0, 1))
    ax[1, 3].set_ylim((0, 1))
    

    # Create patches for the legend
    patches = []
    for i, learner in enumerate(results.keys()):
        patches.append(mpatches.Patch(color = colors[i], label = learner))
    plt.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
               loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'x-large')
    
    # Aesthetics
    plt.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, y = 1.10)
    plt.tight_layout()
    plt.show()
    

In [71]:
results_df = pd.DataFrame(results)

In [74]:
results_df = results_df.transpose()

In [76]:
results_df.head(15)

Unnamed: 0,train_time,pred_time,acc_train,acc_test,Precision_train,Precision_test,Recall_train,Recall_test
GaussianNB,1.260647,3.244216,0.528466,0.527717,0.377679,0.376286,0.803741,0.799061
RandomForestClassifier,18.471453,3.409663,0.725747,0.726211,0.769314,0.771344,0.165075,0.164433
XGBClassifier,194.963103,4.263537,0.744033,0.740845,0.600985,0.594102,0.519379,0.514402
LogisticRegression,5.711427,0.640368,0.710671,0.710221,0.647604,0.641822,0.146889,0.144915
