Data preprocessing from other group (Baseline_Model.ipynb). Balanced Data

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import math
import IPython


# visualization
import seaborn as sns
sns.set(style="white")  
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from pandas.plotting import scatter_matrix

# warnings handler
import warnings
warnings.filterwarnings("ignore")

# Machine Learning Libraries
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import fbeta_score, accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer 
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA

# Pipeline
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
import statsmodels.api as sm

In [43]:
df = pd.read_csv("/Users/kevintomas/Desktop/nf-sep-20/capstone kram/Data/no_dupli_and_outl.csv")

In [44]:
df.drop("Unnamed: 0", axis=1, inplace=True)

In [45]:
print(df.shape)
df.drop(["avg_churn", "ort", "date_x", "kuendigungs_eingangs_datum", "abo_registrierung_min", "training_set", "liefer_beginn_evt", "nl_registrierung_min"], axis=1, inplace=True)
print(df.shape)

(143053, 169)
(143053, 161)


In [46]:
df=df.dropna()

In [49]:
df.churn.value_counts()

0    99701
1    43340
Name: churn, dtype: int64

**Balancing**

In [47]:
count_class_0, count_class_1 = df.churn.value_counts()

In [48]:
churn_class_0 = df[df['churn'] == 0]
churn_class_1 = df[df['churn'] == 1]

**Upsampling**

In [9]:
churn_class_1_up = churn_class_1.sample(count_class_0, replace=True)

In [10]:
df = pd.concat([churn_class_1_up, churn_class_0], axis=0)

In [11]:
df.churn.value_counts()

1    99701
0    99701
Name: churn, dtype: int64

**Downsampling**

In [50]:
churn_class_1_down = churn_class_0.sample(count_class_1, replace=True)

In [51]:
df = pd.concat([churn_class_1_down, churn_class_1], axis=0)

In [52]:
df.churn.value_counts()

1    43340
0    43340
Name: churn, dtype: int64

In [53]:
y = df['churn']
X = df.drop('churn', axis = 1)
X.shape

(86680, 160)

**Dummies**

In [54]:
other = ["zon_che_opt_in", 
 "zon_sit_opt_in",
 "zon_zp_grey",
 "zon_premium",
 "zon_boa",
 "zon_kommentar",
 "zon_sonstige",
 "zon_zp_red",
 "zon_rawr",
 "zon_community",
 "zon_app_sonstige",
 "zon_schach",
 "zon_blog_kommentare",
 "zon_quiz",
 "nl_zeitbrief",
 "nl_zeitshop",
 "nl_zeitverlag_hamburg",
 "nl_fdz_organisch"]

In [55]:
categoric_features = list(X.columns[X.dtypes==object]) + other

In [56]:
dummies = pd.get_dummies(X, columns=categoric_features, drop_first=True)

**Scaling**

In [57]:
numeric_features = ['lesedauer',
 'shop_kauf',
 'cnt_abo',
 'cnt_abo_diezeit',
 'cnt_abo_diezeit_digital',
 'cnt_abo_magazin',
 'cnt_umwandlungsstatus2_dkey',
 'nl_blacklist_dum',
 'nl_bounced_dum',
 'nl_aktivitaet',
 'nl_sperrliste_dum',
 'nl_opt_in_dum',
 'received_anzahl_1w',
 'received_anzahl_1m',
 'received_anzahl_3m',
 'received_anzahl_6m',
 'opened_anzahl_1w',
 'opened_anzahl_1m',
 'opened_anzahl_3m',
 'openedanzahl_6m',
 'clicked_anzahl_1w',
 'clicked_anzahl_1m',
 'clicked_anzahl_3m',
 'clicked_anzahl_6m',
 'unsubscribed_anzahl_1w',
 'unsubscribed_anzahl_1m',
 'unsubscribed_anzahl_3m',
 'unsubscribed_anzahl_6m',
 'openrate_1w',
 'clickrate_1w',
 'openrate_1m',
 'clickrate_1m',
 'openrate_3m',
 'clickrate_3m',
 'received_anzahl_bestandskunden_1w',
 'received_anzahl_bestandskunden_1m',
 'received_anzahl_bestandskunden_3m',
 'received_anzahl_bestandskunden_6m',
 'opened_anzahl_bestandskunden_1w',
 'opened_anzahl_bestandskunden_1m',
 'opened_anzahl_bestandskunden_3m',
 'openedanzahl_bestandskunden_6m',
 'clicked_anzahl_bestandskunden_1w',
 'clicked_anzahl_bestandskunden_1m',
 'clicked_anzahl_bestandskunden_3m',
 'clicked_anzahl_bestandskunden_6m',
 'unsubscribed_anzahl_bestandskunden_1w',
 'unsubscribed_anzahl_bestandskunden_1m',
 'unsubscribed_anzahl_bestandskunden_3m',
 'unsubscribed_anzahl_bestandskunden_6m',
 'openrate_bestandskunden_1w',
 'clickrate_bestandskunden_1w',
 'openrate_bestandskunden_1m',
 'clickrate_bestandskunden_1m',
 'openrate_bestandskunden_3m',
 'clickrate_bestandskunden_3m',
 'received_anzahl_produktnews_1w',
 'received_anzahl_produktnews_1m',
 'received_anzahl_produktnews_3m',
 'received_anzahl_produktnews_6m',
 'opened_anzahl_produktnews_1w',
 'opened_anzahl_produktnews_1m',
 'opened_anzahl_produktnews_3m',
 'openedanzahl_produktnews_6m',
 'clicked_anzahl_produktnews_1w',
 'clicked_anzahl_produktnews_1m',
 'clicked_anzahl_produktnews_3m',
 'clicked_anzahl_produktnews_6m',
 'unsubscribed_anzahl_produktnews_1w',
 'unsubscribed_anzahl_produktnews_1m',
 'unsubscribed_anzahl_produktnews_3m',
 'unsubscribed_anzahl_produktnews_6m',
 'openrate_produktnews_1w',
 'clickrate_produktnews_1w',
 'openrate_produktnews_1m',
 'clickrate_produktnews_1m',
 'openrate_produktnews_3m',
 'clickrate_produktnews_3m',
 'received_anzahl_hamburg_1w',
 'received_anzahl_hamburg_1m',
 'received_anzahl_hamburg_3m',
 'received_anzahl_hamburg_6m',
 'opened_anzahl_hamburg_1w',
 'opened_anzahl_hamburg_1m',
 'opened_anzahl_hamburg_3m',
 'openedanzahl_hamburg_6m',
 'clicked_anzahl_hamburg_1w',
 'clicked_anzahl_hamburg_1m',
 'clicked_anzahl_hamburg_3m',
 'clicked_anzahl_hamburg_6m',
 'unsubscribed_anzahl_hamburg_1w',
 'unsubscribed_anzahl_hamburg_1m',
 'unsubscribed_anzahl_hamburg_3m',
 'unsubscribed_anzahl_hamburg_6m',
 'openrate_hamburg_1w',
 'clickrate_hamburg_1w',
 'openrate_hamburg_1m',
 'clickrate_hamburg_1m',
 'openrate_hamburg_3m',
 'clickrate_hamburg_3m',
 'received_anzahl_zeitbrief_1w',
 'received_anzahl_zeitbrief_1m',
 'received_anzahl_zeitbrief_3m',
 'received_anzahl_zeitbrief_6m',
 'opened_anzahl_zeitbrief_1w',
 'opened_anzahl_zeitbrief_1m',
 'opened_anzahl_zeitbrief_3m',
 'openedanzahl_zeitbrief_6m',
 'clicked_anzahl_zeitbrief_1w',
 'clicked_anzahl_zeitbrief_1m',
 'clicked_anzahl_zeitbrief_3m',
 'clicked_anzahl_zeitbrief_6m',
 'unsubscribed_anzahl_zeitbrief_1w',
 'unsubscribed_anzahl_zeitbrief_1m',
 'unsubscribed_anzahl_zeitbrief_3m',
 'unsubscribed_anzahl_zeitbrief_6m',
 'openrate_zeitbrief_1w',
 'clickrate_zeitbrief_1w',
 'openrate_zeitbrief_1m',
 'clickrate_zeitbrief_1m',
 'openrate_zeitbrief_3m',
 'clickrate_zeitbrief_3m']

In [58]:
scaler = StandardScaler()

In [59]:
X_scaled = scaler.fit_transform(dummies[numeric_features])

In [60]:
X_preprocessed = np.concatenate([X_scaled, dummies.drop(numeric_features, axis=1)], axis=1)

**Splitting**

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size = .3, random_state=25)

In [62]:
X_train.shape

(60676, 991)

**PCA**

In [63]:
pca = PCA(n_components=None)

In [64]:
X_train_scaled_pca  = pca.fit(X_train)

In [65]:
X_train_trans_pca = pca.transform(X_train)

In [66]:
pca_explained_var = pca.explained_variance_ratio_

cum_explaiend_var = pca_explained_var.cumsum()

In [69]:
sum_variance, component_count = 0, 0
while sum_variance < 0.85:
    sum_variance += pca.explained_variance_ratio_[component_count]
    component_count += 1
    
print('Number of Principal Components that explain >=85% of Variance: ', component_count)
print('Total Variance Explained by '+str(component_count)+' components:', str(sum_variance*100)+'%')

Number of Principal Components that explain >=85% of Variance:  39
Total Variance Explained by 39 components: 85.18678069240588%


**Modelling with 39 components explaining 85% of the variance**

In [70]:
pca = PCA(n_components=38)
pca.fit(X_train)

PCA(n_components=38)

In [71]:
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [72]:
X_train_pca_cst = sm.add_constant(X_train_pca)
X_test_pca_cst = sm.add_constant(X_test_pca)

Training

In [73]:
model = LogisticRegression().fit(X_train_pca_cst, y_train)

In [74]:
y_pred_train = model.predict(X_train_pca_cst)
y_pred_test = model.predict(X_test_pca_cst)

In [75]:
train_score = accuracy_score(y_train, y_pred_train)*100
test_score = accuracy_score(y_test, y_pred_test)*100
recall_train = recall_score(y_train, y_pred_train)*100
recall_test = recall_score(y_test, y_pred_test)*100
precision_train = precision_score(y_train, y_pred_train)*100
precision_test = precision_score(y_test, y_pred_test)*100


print("Training Set Accuracy:",str(train_score)+'%')
print("Testing Set Accuracy:",str(test_score)+'%')
print("Training Set Recall:",str(recall_train)+'%')
print("Testing Set Recall:",str(recall_test)+'%')
print("Training Set Precision:",str(precision_train)+'%')
print("Testing Set Precision:",str(precision_test)+'%')

Training Set Accuracy: 65.48388160063287%
Testing Set Accuracy: 65.40147669589294%
Training Set Recall: 68.66999636159163%
Testing Set Recall: 68.67322804608224%
Training Set Precision: 64.41114420451724%
Testing Set Precision: 64.79268643823784%


**Upsampled**<br/>
Training Set Accuracy: 65.59130540689635%<br/>
Testing Set Accuracy: 65.23294495244146%<br/>
Training Set Recall: 69.09255429162357%<br/>
Testing Set Recall: 68.70366060444859%<br/>
Training Set Precision: 64.47181494089581%<br/>
Testing Set Precision: 64.47627071047458%<br/>

**Downsampled**<br/>
Training Set Accuracy: 65.48388160063287%<br/>
Testing Set Accuracy: 65.40147669589294%<br/>
Training Set Recall: 68.66999636159163%<br/>
Testing Set Recall: 68.67322804608224%<br/>
Training Set Precision: 64.41114420451724%<br/>
Testing Set Precision: 64.79268643823784%<br/>

**Modelling with 74 components explaining 95% of the variance**

In [76]:
pca = PCA(n_components=74)
pca.fit(X_train)

PCA(n_components=74)

In [77]:
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [78]:
X_train_pca_cst = sm.add_constant(X_train_pca)
X_test_pca_cst = sm.add_constant(X_test_pca)

Training

In [79]:
model = LogisticRegression().fit(X_train_pca_cst, y_train)

In [80]:
y_pred_train = model.predict(X_train_pca_cst)
y_pred_test = model.predict(X_test_pca_cst)

In [81]:
train_score = accuracy_score(y_train, y_pred_train)*100
test_score = accuracy_score(y_test, y_pred_test)*100
recall_train = recall_score(y_train, y_pred_train)*100
recall_test = recall_score(y_test, y_pred_test)*100
precision_train = precision_score(y_train, y_pred_train)*100
precision_test = precision_score(y_test, y_pred_test)*100


print("Training Set Accuracy:",str(train_score)+'%')
print("Testing Set Accuracy:",str(test_score)+'%')
print("Training Set Recall:",str(recall_train)+'%')
print("Testing Set Recall:",str(recall_test)+'%')
print("Training Set Precision:",str(precision_train)+'%')
print("Testing Set Precision:",str(precision_test)+'%')

Training Set Accuracy: 66.87817258883248%
Testing Set Accuracy: 66.88201815105369%
Training Set Recall: 67.67439552806536%
Testing Set Recall: 67.67376211184863%
Training Set Precision: 66.46309771309772%
Testing Set Precision: 66.96866742166856%


**Upsampled**<br/>
Training Set Accuracy: 67.08721101009449%<br/>
Testing Set Accuracy: 66.62877584794637%<br/>
Training Set Recall: 67.93203493048374%<br/>
Testing Set Recall: 67.44688632509892%<br/>
Training Set Precision: 66.70004230714991%<br/>
Testing Set Precision: 66.60319127979513%<br/>

**Downsampled**<br/>
Training Set Accuracy: 66.87817258883248%<br/>
Testing Set Accuracy: 66.88201815105369%<br/>
Training Set Recall: 67.67439552806536%<br/>
Testing Set Recall: 67.67376211184863%<br/>
Training Set Precision: 66.46309771309772%<br/>
Testing Set Precision: 66.96866742166856%<br/>

**Random Forest**

In [82]:
model = RandomForestClassifier().fit(X_train_pca_cst, y_train)

In [83]:
y_pred_train = model.predict(X_train_pca_cst)
y_pred_test = model.predict(X_test_pca_cst)

In [84]:
train_score = accuracy_score(y_train, y_pred_train)*100
test_score = accuracy_score(y_test, y_pred_test)*100
recall_train = recall_score(y_train, y_pred_train)*100
recall_test = recall_score(y_test, y_pred_test)*100
precision_train = precision_score(y_train, y_pred_train)*100
precision_test = precision_score(y_test, y_pred_test)*100


print("Training Set Accuracy:",str(train_score)+'%')
print("Testing Set Accuracy:",str(test_score)+'%')
print("Training Set Recall:",str(recall_train)+'%')
print("Testing Set Recall:",str(recall_test)+'%')
print("Training Set Precision:",str(precision_train)+'%')
print("Testing Set Precision:",str(precision_test)+'%')

Training Set Accuracy: 99.98846331333641%
Testing Set Accuracy: 70.6391324411629%
Training Set Recall: 99.99007706810438%
Testing Set Recall: 66.47592889295795%
Training Set Precision: 99.98676986174505%
Testing Set Precision: 72.88773632256985%


**Upsampled**<br/>
Training Set Accuracy: 99.98567140226822%<br/>
Testing Set Accuracy: 87.27537152504972%<br/>
Training Set Recall: 99.98994599563369%<br/>
Testing Set Recall: 87.51205239884297%<br/>
Training Set Precision: 99.98132988654316%<br/>
Testing Set Precision: 87.22205653312125%<br/>

**Downsampled**<br/>
Training Set Accuracy: 99.98846331333641%<br/>
Testing Set Accuracy: 70.6391324411629%<br/>
Training Set Recall: 99.99007706810438%<br/>
Testing Set Recall: 66.47592889295795%<br/>
Training Set Precision: 99.98676986174505%<br/>
Testing Set Precision: 72.88773632256985%<br/>
