# Import libraries

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load data

Dataset source: Telco Customer Churn on Kaggle — https://www.kaggle.com/blastchar/telco-customer-churn#WA_Fn-UseC_-Telco-Customer-Churn.csv

In [None]:
# Load the Telco customer-churn table from the local CSV copy.
df = pd.read_csv('Data/telco_customer_churn.csv', sep=',')

# In the public Kaggle export, TotalCharges is parsed as text because a few rows
# hold a blank string instead of a number; left as-is, the feature matrix built
# from it cannot be converted to floats when the model is fitted.
# Coerce the column to numeric and treat the blanks as 0.0. This is a no-op when
# the local copy already stores the column as clean numbers.
# NOTE(review): confirm that 0.0 is the desired fill value for blank entries.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0.0)

In [None]:
# Column labels of the raw table, to see which fields are available.
df.columns

In [None]:
# (rows, columns) of the raw table.
df.shape

# Data preprocessing / Feature engineering

#### Select variables

### Initial look at the data

In [None]:
# Peek at the first five raw rows (five is the default for head()).
df.head()

In [None]:
# Preview the model inputs: every column except customerID and Churn.
#
# Alternative kept for reference: pick the inputs with an explicit whitelist.
# selected_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
#                      'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
#                      'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
#                      'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
#                      'MonthlyCharges', 'TotalCharges']
#
# df[selected_features].head(5)

df.drop(['customerID', 'Churn'], axis=1).head()

In [None]:
# Feature frame: the raw table minus the customerID column and the Churn column
# (Churn is read separately as the prediction target).
df_new = df.drop(['customerID', 'Churn'], axis=1)

In [None]:
# (rows, columns) of the feature frame before encoding.
df_new.shape

#### One-hot encoding

In [None]:
# Columns treated as categories; each is replaced by one indicator column per
# distinct value (get_dummies drops the original column itself).
categorical_columns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
                       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                       'Contract', 'PaperlessBilling', 'PaymentMethod']

# Encode only the listed columns that are still present. df_new is overwritten
# with the encoded frame, so without this filter a second run of the cell (or
# removing a column from df_new beforehand) raises a KeyError.
columns_to_encode = [col for col in categorical_columns if col in df_new.columns]

# dtype=float yields 0.0/1.0 indicators on every pandas version. Recent pandas
# defaults to bool indicators, and mixing bool with numeric columns turns
# df_new.values into an object array.
df_new = pd.get_dummies(df_new, columns=columns_to_encode, dtype=float)

In [None]:
# (rows, columns) after one-hot encoding; the column count grows with the indicators.
df_new.shape

In [None]:
# Column labels after encoding; this is the feature order the model is trained on.
df_new.columns

In [None]:
# First five rows of the encoded feature frame (five is the default for head()).
df_new.head()

# Split into train and test set

In [None]:
# Feature matrix from the encoded frame, and target vector from the raw Churn
# column, both as NumPy arrays.
X, y = df_new.values, df['Churn'].values

In [None]:
# Inspect the feature matrix.
X

In [None]:
# Inspect the target labels.
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Shapes of the full matrix, then of the training and test portions.
for matrix in (X, X_train, X_test):
    print(matrix.shape)

# Build and train model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Earlier candidates, kept for comparison:
# model = DecisionTreeClassifier()
# model = DecisionTreeClassifier(max_depth=5, min_samples_split=4)

# Forest of 10 trees, each limited to depth 5. Only the 'Yes' label is given an
# explicit class weight (1.1); random_state pins the forest's randomness.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=5,
    class_weight={'Yes': 1.1},
    random_state=10,
)

In [None]:
# Train on the training portion only; the test portion stays unseen for evaluation.
model.fit(X_train, y_train) 

In [None]:
# Echo the depth limit, the split threshold and the class labels of the fitted model.
for attribute in ('max_depth', 'min_samples_split', 'classes_'):
    print(getattr(model, attribute))

# Make predictions

In [None]:
# Feature order reference for building a hand-written input row.
df_new.columns

In [None]:
# One hand-written customer row (46 values). Because the model is trained on
# df_new.values, the entries must follow the column order of df_new.columns.
# The first three entries are the numeric features (presumably tenure,
# MonthlyCharges, TotalCharges — verify against df_new.columns); the remaining
# 43 are 0/1 indicators from the one-hot encoding.
test_input = [2.0, 24.4, 1548.65, 
              0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0,
              0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0,
              1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0,
              1.0, 0.0, 0.0, 0.0]

In [None]:
# predict() takes a batch of rows, so the single customer is wrapped in a list.
single_row_batch = [test_input]
prediction = model.predict(single_row_batch)
print(prediction)

In [None]:
# Predicted labels for every row of the held-out test portion.
y_pred = model.predict(X_test)

In [None]:
# Inspect the predicted labels.
y_pred

# Performance metrics

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [None]:
# First 12 true labels, then the first 12 predicted labels, for a quick visual comparison.
for labels in (y_test, y_pred):
    print(labels[:12])

In [None]:
# Counts of true vs. predicted labels on the test portion. In scikit-learn's
# layout, rows are the true classes and columns the predicted classes, in sorted
# label order.
confusion_matrix(y_test, y_pred)

In [None]:
# Precision and recall are reported for the 'Yes' label as the positive class.
positive_label = 'Yes'

acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=positive_label)
recall = recall_score(y_test, y_pred, pos_label=positive_label)

# The '.4' format spec prints four significant digits.
print(f'Accuracy: {acc:.4}')
print(f'Precision: {precision:.4}')
print(f'Recall/Sensitivity: {recall:.4}')

# Visualize

In [None]:
from sklearn.tree import plot_tree

# Draw the first tree of the fitted forest, labelling splits with the encoded
# feature names.
plt.figure(figsize=(15,9))
single_tree = model.estimators_[0]
# feature_names is passed as a plain list: scikit-learn releases that validate
# this argument as a list reject a pandas Index.
plot_tree(single_tree, filled=True, fontsize=12, feature_names=list(df_new.columns))
plt.show()

In [None]:
# Alternative/interactive plotting: export the first tree of the forest as
# Graphviz DOT source. With out_file=None the DOT text is returned as a string.
from sklearn.tree import export_graphviz
single_tree = model.estimators_[0]
# feature_names and class_names are passed as plain lists: scikit-learn releases
# that validate these arguments as lists reject a pandas Index or a NumPy array.
dot_str = export_graphviz(single_tree, out_file=None, filled=True, rounded=True, special_characters=True,
                          impurity=False, feature_names=list(df_new.columns),
                          class_names=list(model.classes_))

In [None]:
# Uncomment the next line to print the DOT source, then paste the output into the viewer linked below.
# print(dot_str)

Online Graphviz viewer for the DOT output: https://dreampuf.github.io/GraphvizOnline/

# Øvelser - Fine tune model

- Prøv at træne modellen med forskellige værdier for ```max_depth``` og ```min_samples_split```, og se hvordan det påvirker performance metrics.
- Prøv at fjerne kolonner i træningsdata og træn modellen igen. Se hvordan det påvirker performance metrics.
- Test forskellige kombinationer af de to ovenstående punkter, og se hvor god performance man kan få.
- Importer en Random Forest model og lav prædiktioner med denne på samme måde som ovenstående.

  ```from sklearn.ensemble import RandomForestClassifier```
- Afprøv andre parametre i Random Forest modellen fx ```n_estimators``` (antal Decision Trees).

# Exercises - Fine-tune the model

- Try training the model with different values for ```max_depth``` and ```min_samples_split```, and see how it affects the performance metrics.
- Try removing columns from the training data and retrain the model. See how it affects the performance metrics.
- Test different combinations of the methods in the two previous bullets, and see how high performance you can get.
- Import a Random Forest classification model and make predictions in the same way as above.
  
  ```from sklearn.ensemble import RandomForestClassifier```
- Test other parameters in the Random Forest model e.g. ```n_estimators``` (number of Decision Trees).