In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
%pip install openpyxl

In [None]:
# Load the 'E_Comm' sheet of the Excel workbook and persist a CSV copy of it
pd.read_excel('e-commerce-dataset.xlsx', sheet_name='E_Comm').to_csv(
    'e-commerce_churn.csv', index=False
)

# Continue the analysis from the CSV copy
df = pd.read_csv('e-commerce_churn.csv')

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Keep only complete rows: every row with at least one missing value is dropped
df = df.dropna()

In [None]:
# Store the target column with a categorical dtype
df = df.astype({'Churn': 'category'})

# Columns with object dtype are treated as the non-numeric features
non_numeric_cols = df.select_dtypes(include='object').columns

# One-hot encode those columns; drop_first=True omits the first level of each
df = pd.get_dummies(df, drop_first=True, columns=non_numeric_cols)

## Exercise T8.3 a)

Your colleague proposes training the model on the entire dataset and tuning the `n_estimators`
and `max_features` parameters of `sklearn.ensemble.RandomForestClassifier` until the training
accuracy is maximized. Do you agree? If not, which issues can you identify with this approach?

What is the accuracy of your random forest model on the training dataset?


In [None]:
# Features and target are taken from the complete dataset (no hold-out split)
X = df.drop(columns=["Churn"])
y = df["Churn"]

# Fit a small random forest on all rows
train_model = RandomForestClassifier(
    n_estimators=5,
    max_features=3,
    random_state=2023+2024,
)
train_model.fit(X, y)

# Score the model on the same rows it was fitted on (training accuracy)
pred = train_model.predict(X)
error_rate = (y != pred).mean()
print("Error rate:", error_rate)
print("Accuracy:", accuracy_score(y, pred))

## Exercise T8.3 c)

Perform training, 4-fold cross-validation, and testing with a 60-20-20 % split in Python. Use
precision as the metric for model selection. Build a confusion matrix for the test set and report precision,
accuracy, and recall.

In [None]:
# Model selection with 4-fold cross-validation on an 80 % training partition.
# Each fold fits on 3/4 of train_df (60 % of all rows) and validates on the
# remaining 1/4 (20 %); test_df (20 %) is only created here, never fitted on.

# One seed for the split and every candidate forest
RANDOM_STATE = 2023 + 2024

# Train-test split, stratified on the target so both parts keep the class ratio
train_df, test_df = train_test_split(
    df, test_size=0.20, stratify=df['Churn'], random_state=RANDOM_STATE
)

# Check label balancing
print("Label Balancing in Train Set:\n", train_df['Churn'].value_counts(normalize=True))
print("Label Balancing in Test Set:\n", test_df['Churn'].value_counts(normalize=True))

# Candidate models: they differ only in n_estimators / max_features
train_model_1 = RandomForestClassifier(n_estimators=5,
                                       max_features=3,
                                       random_state=RANDOM_STATE)

train_model_2 = RandomForestClassifier(n_estimators=8,
                                       max_features=3,
                                       random_state=RANDOM_STATE)

train_model_3 = RandomForestClassifier(n_estimators=5,
                                       max_features=10,
                                       random_state=RANDOM_STATE)

# Split features and labels (training partition only)
X = train_df.drop(columns=["Churn"])
y = train_df["Churn"]

# Cross-validate every candidate and keep the one with the highest average
# precision across the folds. The loop variable is deliberately NOT called
# `train_model`, so the model fitted in the T8.3 a) cell is not overwritten.
# On a tie (>=) the later candidate wins.
best_model = None
best_score = -1
for i, candidate in enumerate([train_model_1, train_model_2, train_model_3]):
    print("train_model_"+str(i+1)+":")
    score = np.mean(cross_val_score(candidate, X, y, cv=4, scoring="precision"))
    print("Average score across all folds:", score)
    if score >= best_score:
        best_score = score
        best_model = candidate


# Refit the selected model on the whole training partition (no cross-validation)
print("\nbest_model:", best_model.n_estimators, best_model.max_features)
best_model.fit(X, y)

# These scores are computed on the data the model was just fitted on, so they
# are training scores and are labelled as such; the unbiased estimate comes
# from the held-out test set.
y_pred = best_model.predict(X)
print("\nTrain-Precision:", precision_score(y, y_pred))
print("Train-Accuracy:", accuracy_score(y, y_pred))
print("Train-Recall:", recall_score(y, y_pred))

In [None]:
# Variable importance plot: one bar per feature of the fitted best_model
importance_values = best_model.feature_importances_
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importance_values})
imp_plot = importance_df.plot(kind='bar', x='Feature', y='Importance', legend=False)
# (the former `imp_plot.plot()` call drew nothing and has been removed)
imp_plot.set_title('Random forest feature importance')
imp_plot.set_ylabel('Importance')
plt.tight_layout()  # keep the rotated feature names inside the figure
plt.show()

# Apply on test set; the feature matrix is built once and reused
X_test = test_df.drop(columns=['Churn'])
test_predictions = best_model.predict(X_test)
test_probabilities = best_model.predict_proba(X_test)

# predict_proba columns follow best_model.classes_; indices 0 and 1 are taken
# as Churn=0 and Churn=1 -- assumes classes_ is [0, 1], TODO confirm.
test_predictions_df = pd.DataFrame({'Churn': test_df['Churn'],
                                    'Predicted_Churn': test_predictions,
                                    'Probability_Churn=0': test_probabilities[:, 0],
                                    'Probability_Churn=1': test_probabilities[:, 1]})

print(test_predictions_df)

In [None]:
# True labels of the held-out test set
y_test = test_df['Churn']

# Confusion matrix of true labels against the test predictions
conf_matrix = confusion_matrix(y_test, test_predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

# Precision, accuracy and recall on the test set, printed in that order
for label, metric in (
    ("\nTest-Precision:", precision_score),
    ("Test-Accuracy:", accuracy_score),
    ("Test-Recall:", recall_score),
):
    print(label, metric(y_test, test_predictions))