# **Churn Prediction 'Die ZEIT' - Logistic Regression**

#### **Please install the following packages in your virtual environment**

**balancing:** <br/>
conda install -c conda-forge imbalanced-learn <br/>
**XGBoost classifier:** <br/>
conda install -c conda-forge xgboost <br/>

### **Used Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Library for timing
from time import time

# Ignore warnings while plotting
import warnings
warnings.filterwarnings("ignore")

# Feature Engineering & Selection modules
from sklearn.preprocessing import MinMaxScaler, StandardScaler
#from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# SUPERVISED LEARNING
# Libraries for classification issues
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Libraries for classification and regression issues
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Libraries for splitting data, hyperparameter tuning & Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

# Libraries for data balancing
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Libraries for model evaluation
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, f1_score
from sklearn.metrics import fbeta_score, accuracy_score, make_scorer, mean_squared_error

# Display the whole content of the data frame
pd.set_option('display.max_columns', None)   # Displays all columns
pd.set_option('display.max_rows', None)      # Displays all rows

# Define decimal places shown in the notebook
pd.options.display.float_format = '{:,.2f}'.format

# Visualization
%matplotlib inline
sns.set_style('whitegrid')

### **Read Data**

In [3]:
pwd

'/Users/chandrakanth/neuefischer/capstone-ZEIT-2020-ds/02_ml_model'

In [5]:
# Load the training table through a path relative to this notebook's folder
# (02_ml_model), so the notebook runs on any checkout of the repository and
# not only on the original author's machine.
df = pd.read_csv('../00_data/f_chtr_churn_traintable_nf.csv')

In [None]:
# Preview the first eight rows of the loaded table
df.head(n=8)

### **Data insight**
#### **Overview**

In [None]:
# Size of the raw table and a count of numeric vs. object-typed columns
shape_init_rows, shape_init_cols = df.shape

print(f'The shape of the data set: {shape_init_rows} rows, {shape_init_cols} columns')
print('   ')

cont_f = df.select_dtypes(include=['float64', 'int64'])
print(f'The number of numeric features: {cont_f.shape[1]}')

cat_f = df.select_dtypes(include="object")
print(f'The number of objectlike features: {cat_f.shape[1]}')

#### **Correlations**

In [None]:
# Visualize correlations with the target variable.
# Only numeric/boolean columns can be correlated. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where corrwith() no longer drops
# non-numeric (object) columns silently and raises an error instead.
numeric_df = df.select_dtypes(include=['number', 'bool'])
numeric_df.drop(columns='churn').corrwith(df['churn']).sort_values().plot(kind='barh', figsize=(10, 50));

## **1 - Data Preprocessing for Modeling**
### **Drop Irrelevant Features**

In [None]:
# Columns this notebook treats as irrelevant for modeling
irrelevant_columns = [
    'Unnamed: 0',
    'auftrag_new_id',
    'kuendigungs_eingangs_datum',
    'avg_churn',
    'training_set',
]
df = df.drop(columns=irrelevant_columns)

### **Drop Redundant Features**

In [None]:
# Columns this notebook treats as redundant
redundant_columns = ['ort', 'plz_1', 'plz_2']
df = df.drop(columns=redundant_columns)

### **Dealing with Missing Values**

In [None]:
# The five columns with the most missing values, in ascending order
missing_per_column = df.isna().sum()
missing_per_column.sort_values().tail()

In [None]:
# Keep only the rows that have a value in 'email_am_kunden'
df = df.dropna(subset=['email_am_kunden'])

In [None]:
# Table size after the missing-value handling, and the share of rows lost
# relative to the initially loaded table
shape_a_nan_rows, shape_a_nan_cols = df.shape
removed_pct = round(((shape_init_rows - shape_a_nan_rows) / shape_init_rows) * 100, ndigits=3)

print(f'The shape of the data set (after dealing with missing values): {shape_a_nan_rows} rows, {shape_a_nan_cols} columns')
print(f'By dropping the rows containing missing values, we removed {removed_pct} % of the rows \n(compared to the inital data set).')

### **Summary Preprocessing**

In [None]:
# Table size and dtype split after the column drops and NaN removal
n_rows_clean, n_cols_clean = df.shape
print(f'The shape of the data set (after dropping columns and removing missing values): {n_rows_clean} rows, {n_cols_clean} columns')
print('   ')

cont_f = df.select_dtypes(include=['float64', 'int64'])
print(f'The number of numeric features (after dropping): {cont_f.shape[1]}')

cat_f = df.select_dtypes(include="object")
print(f'The number of object features (after dropping): {cat_f.shape[1]}')

### **Dealing with DTypes and Dummies**

In [None]:
#df.info(verbose=1, null_counts=True)

#### **DType Conversions**

In [None]:
# Columns holding categorical information; each one is cast to the pandas
# 'category' dtype
category_features = [
    'kanal',
    'objekt_name',
    'aboform_name',
    'zahlung_rhythmus_name',
    'zahlung_weg_name',
    'land_iso_code',
    'anrede',
    'titel',
]

df = df.astype({column: 'category' for column in category_features})

# Removed temporarily: 'email_am_kunden'

In [None]:
# Parse the date-like columns into pandas datetime values
for date_column in ('liefer_beginn_evt',
                    'abo_registrierung_min',
                    'nl_registrierung_min',
                    'date_x'):
    df[date_column] = pd.to_datetime(df[date_column])

In [None]:
# For every date column add a '<name>_year' and a '<name>_month' column
# holding the formatted year ('%Y') and month ('%m') as strings
for date_column in ('liefer_beginn_evt',
                    'abo_registrierung_min',
                    'nl_registrierung_min',
                    'date_x'):
    df[f'{date_column}_year'] = df[date_column].dt.strftime('%Y')
    df[f'{date_column}_month'] = df[date_column].dt.strftime('%m')

In [None]:
# The year/month columns replace the original date columns, so remove those
original_date_columns = [
    'liefer_beginn_evt',
    'abo_registrierung_min',
    'nl_registrierung_min',
    'date_x',
]
df = df.drop(columns=original_date_columns)

In [None]:
# Foreign countries carry the placeholder 'xx'; map it to '000' so that the
# whole column can be cast to integers
plz_3_clean = df['plz_3'].replace('xx', '000')
df['plz_3'] = plz_3_clean.astype('int')

#### **Dummy Creation**

In [None]:
# Year/month columns derived from the date features; these are the variables
# to be converted into dummies
date_dum_features = [
    f'{date_column}_{part}'
    for date_column in ('liefer_beginn_evt',
                        'abo_registrierung_min',
                        'nl_registrierung_min',
                        'date_x')
    for part in ('year', 'month')
]

In [None]:
# Dummy-encode both feature groups; drop_first=True leaves out the first
# level of every encoded feature
dummy_df1, dummy_df2 = (
    pd.get_dummies(df[feature_group], drop_first=True)
    for feature_group in (category_features, date_dum_features)
)

In [None]:
# Remove the columns whose information now lives in the dummy columns
df.drop(columns=category_features + date_dum_features, inplace=True)

#### **Create Clean df for Modeling (Concatenate Dummies to df)**

In [None]:
# Put the remaining features and both dummy frames side by side
model_frames = [df, dummy_df1, dummy_df2]
df_model = pd.concat(model_frames, axis='columns')

In [None]:
#df_model.info(verbose=1, null_counts=True)

## **2 - Modeling**
### **Data Selection**
+ y = Target variable -->> ```churn```
+ X = Predictors -->> ```remaining columns```

In [None]:
# Target (y) is the 'churn' column; predictors (X) are all other columns
y = df_model['churn']
X = df_model.drop(columns='churn')

In [None]:
# Report how many rows and columns go into the modeling step
n_samples, n_predictors = X.shape
print(f'The target variable (y) has {len(y)} rows.')
print('   ')
print(f'The predictor variables (X) have {n_samples} rows and {n_predictors} columns.')

### **Train-Test-Split**

In [None]:
# Splitting the data
test_size = 0.3
RSEED = 42

# Stratify on the target so that the train and the test set keep the same
# churn ratio; a purely random split can shift the class balance between them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=RSEED, stratify=y)

In [None]:
# Report the split ratio and the resulting row counts
print(f'The test size is {test_size*100} % of the preprocessed data set.')
print('   ')
print(f'The train set has {len(X_train)} rows.')
print('   ')
print(f'The test set has {len(X_test)} rows.')

### **Function for: Predicting the Target Value ('churn') & Evaluating the Model**

In [None]:
# defining a function for prediction
def predict(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training data and score it on both splits.

    Args:
        X_train: features of the training set
        X_test: features of the testing set
        y_train: target values ('churn') of the training set
        y_test: target values ('churn') of the testing set
        model: the estimator to be trained and predicted with; it has to
            provide ``fit`` and ``predict``

    Returns:
        dict with the keys 'train_time', 'pred_time', 'acc_train',
        'acc_test', 'Precision_train', 'Precision_test', 'Recall_train'
        and 'Recall_test'. Both times are wall-clock seconds.
    """
    results = {}

    # Fit the learner to the training data and measure how long it takes
    start = time()
    model = model.fit(X_train, y_train)
    results['train_time'] = time() - start

    # Time only the class predictions that the metrics below actually use.
    # Class probabilities are not needed for any of them, so they are not
    # computed here (they would only inflate the measured prediction time).
    start = time()
    predictions_test = model.predict(X_test)
    predictions_train = model.predict(X_train)
    results['pred_time'] = time() - start

    # Accuracy on the train and the test set
    results['acc_train'] = accuracy_score(y_train, predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # Precision on the train and the test set
    results['Precision_train'] = precision_score(y_train, predictions_train)
    results['Precision_test'] = precision_score(y_test, predictions_test)

    # Recall on the train and the test set
    results['Recall_train'] = recall_score(y_train, predictions_train)
    results['Recall_test'] = recall_score(y_test, predictions_test)

    # Progress message
    print ("{} trained .".format(model.__class__.__name__))

    # Return the results
    return results

In [None]:
# Initialize the five models to compare
model_NB = GaussianNB(var_smoothing=1e-09)

model_LG = LogisticRegression()

model_KNN = KNeighborsClassifier(n_neighbors=5, metric='euclidean')

# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was
# deprecated in scikit-learn 1.1 and is rejected by later releases.
model_RF = RandomForestClassifier(n_estimators=500, min_samples_split=2,
                                  max_leaf_nodes=50, max_depth=25,
                                  bootstrap=True, max_features='sqrt',
                                  n_jobs=-1, verbose=1, random_state=RSEED)

model_XGB = XGBClassifier(n_estimators=200, gamma=100,
                          learning_rate=0.01, max_depth=12, booster='gbtree',
                          scale_pos_weight=1.5, objective='binary:logistic')


# Train and evaluate every model; the metrics are keyed by the class name
results = {}
for model in [model_NB, model_LG, model_KNN, model_RF, model_XGB]:
    model_name = model.__class__.__name__
    results[model_name] = predict(X_train, X_test, y_train, y_test, model)

In [None]:
# Show one metrics table per trained model, headed by the model's name
for name, scores in results.items():
    print (name)
    scores_table = pd.DataFrame.from_dict(scores, orient='index')
    display(scores_table.rename(columns={0:'uncleaned data'}))

In [None]:
#print(classification_report(y_test, y_pred_knn))

#print(results['KNeighborsClassifier'])

In [None]:
# plot heatmap
#        conf_mat = pd.crosstab(np.ravel(y_train), np.ravel(y_train_pred),
#                               colnames=["Predicted"], rownames=["Actual"])
    
#        sns.heatmap(conf_mat/np.sum(conf_mat), annot=True, cmap="Blues", fmt=".2%")
#        plt.show()
#        plt.close()

######################

#print("\nResults on test data:")
#print(classification_report(y_test, y_test_pred))

In [None]:
# Collect all metrics in one table: one row per model, one column per metric
results_df = pd.DataFrame(results).T