# Machine Learning Modeling

---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import joblib

# shows plots in jupyter notebook
%matplotlib inline

# set plot style (set_theme is the preferred name; sns.set is only an alias for it)
sns.set_theme(color_codes=True)

In [2]:
# load the processed client data; the path is relative to the notebook's working directory
clients_df = pd.read_csv('../data/processed_data.csv')
print('Shape of the dataset: ', clients_df.shape)
# show the first rows as the cell output
clients_df.head()

Shape of the dataset:  (14606, 18)


Unnamed: 0,cons_12m,cons_gas_12m,off_peak_forecast_energy,off_peak_forecast_power,imp_cons,margin_net_pow_ele,net_margin,pow_max,churn,off_peak_mean_energy,off_peak_mean_power,off_peak_diff_energy,off_peak_diff_power,origin,channel,discount_energy,active_products,antiquity
0,0,54946,0.114481,40.606701,0.0,25.44,678.99,43.648,1,0.124787,40.942265,0.020057,3.700961,lxidpiddsbxsbosboudacockeimpuepw,foosdfpfkusacimwkcsosbicdxkicaua,NO_DISCOUNT,MULTIPLE,RECENT
1,4660,0,0.145711,44.311378,0.0,16.38,18.89,13.8,0,0.149609,44.311375,-0.003767,0.177779,kamkkxfxxuwbdslkwifmmcsiusiuosws,MISSING,NO_DISCOUNT,ONE,OLD
2,544,0,0.165794,44.311378,0.0,28.6,6.6,13.856,0,0.170512,44.38545,-0.00467,0.177779,kamkkxfxxuwbdslkwifmmcsiusiuosws,foosdfpfkusacimwkcsosbicdxkicaua,NO_DISCOUNT,ONE,OLD
3,1584,0,0.146694,44.311378,0.0,30.22,25.46,13.2,0,0.15121,44.400265,-0.004547,0.177779,kamkkxfxxuwbdslkwifmmcsiusiuosws,lmkebamcaaclubfxadlmueccxoimlema,NO_DISCOUNT,ONE,OLD
4,4425,0,0.1169,40.606701,52.32,44.91,47.98,19.8,0,0.124174,40.688156,-0.006192,0.162916,kamkkxfxxuwbdslkwifmmcsiusiuosws,MISSING,NO_DISCOUNT,ONE,OLD


---

## Data Preprocessing

In [3]:
def split_data(X, y, test_size=0.2, random_state=42):
    """
    Split the data into stratified train and test sets.

    Parameters
    ----------
    X : feature table, one row per sample.
    y : target labels aligned with X; also used to stratify the split so the
        train and test sets keep the same class proportions.
    test_size : share (or absolute number) of samples held out for testing,
        forwarded to train_test_split. Defaults to 0.2.
    random_state : seed forwarded to train_test_split so the split is
        reproducible. Defaults to 42.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [4]:
# separate the features from the target column
X = clients_df.drop(columns='churn')
y = clients_df['churn']

# apply log transformation: log1p(x) == log(1 + x), but it stays accurate for
# values close to zero; the column list is named once instead of repeated
log_columns = ['cons_12m', 'cons_gas_12m', 'imp_cons', 'pow_max']
X[log_columns] = np.log1p(X[log_columns])

# split data into train and test sets
X_train, X_test, y_train, y_test = split_data(X, y)
print('Shape of the train set: ', X_train.shape, y_train.shape)
print('Shape of the test set: ', X_test.shape, y_test.shape)

Shape of the train set:  (11684, 17) (11684,)
Shape of the test set:  (2922, 17) (2922,)


In [5]:
# list the dtype of every feature column (numeric vs. object columns)
X.dtypes

cons_12m                    float64
cons_gas_12m                float64
off_peak_forecast_energy    float64
off_peak_forecast_power     float64
imp_cons                    float64
margin_net_pow_ele          float64
net_margin                  float64
pow_max                     float64
off_peak_mean_energy        float64
off_peak_mean_power         float64
off_peak_diff_energy        float64
off_peak_diff_power         float64
origin                       object
channel                      object
discount_energy              object
active_products              object
antiquity                    object
dtype: object

In [6]:
# build the pieces of the preprocessor: one-hot encoding for the categorical
# features and standardization for the numerical ones
categories = X.select_dtypes(include=['object']).columns.tolist()
# select every numeric dtype, not only float64, so that an integer feature is
# not silently left out of the column lists
numerical = X.select_dtypes(include='number').columns.tolist()

# dense output; categories unseen during fit are encoded as all zeros instead of raising
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
standard_scaler = StandardScaler()

---

## Baseline Model


In [12]:
def print_scores(y_true, y_pred):
    """
    Print accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels, aligned with y_true.

    Each metric is printed on its own line, rounded to two decimals.
    zero_division=0 makes an undefined precision/recall/F1 (e.g. no positive
    predictions) print as 0.00 instead of emitting a warning.
    """
    print(f'Accuracy: {accuracy_score(y_true, y_pred):.2f}')
    print(f'Precision: {precision_score(y_true, y_pred, zero_division=0):.2f}')
    print(f'Recall: {recall_score(y_true, y_pred, zero_division=0):.2f}')
    print(f'F1: {f1_score(y_true, y_pred, zero_division=0):.2f}')

In [15]:
# first baseline: random guesses, predicting churn (1) with probability 0.1
# a seeded generator keeps the baseline scores identical on every run
baseline_1_guesses = np.random.default_rng(42).choice([0, 1], size=len(clients_df), p=[.9, .1])
print_scores(clients_df.churn, baseline_1_guesses)

Accuacy: 0.82
Precision: 0.11
Recall: 0.11
F1: 0.11


## Logistic Regression


In [20]:
def try_model(model):
    """
    Fit ``model`` on the training split and report its scores.

    The estimator is fitted in place on the notebook-level ``X_train`` /
    ``y_train``, then used to predict both the train and the test features.
    The scores for each split are printed with ``print_scores``, train first.

    Parameters
    ----------
    model : estimator or pipeline exposing ``fit`` and ``predict``.
    """
    model.fit(X_train, y_train)

    # predict both splits up front (train first, then test)
    train_predictions, test_predictions = (
        model.predict(features) for features in (X_train, X_test)
    )

    print('Train scores:')
    print_scores(y_train, train_predictions)
    print('\r\n')
    print('Test scores:')
    print_scores(y_test, test_predictions)

In [21]:
# logistic regression behind a preprocessing step: categorical columns are
# one-hot encoded and numerical columns are standardized before the classifier
lreg = Pipeline(steps=[
    (
        'preprocessor',
        ColumnTransformer(transformers=[
            ('one_hot_encoder', one_hot_encoder, categories),
            ('standard_scaler', standard_scaler, numerical),
        ]),
    ),
    ('classifier', LogisticRegression(max_iter=500, random_state=42)),
])

# fit on the training split and print the train/test scores
try_model(lreg)

Train scores:
Accuacy: 0.90
Precision: 0.36
Recall: 0.01
F1: 0.01


Test scores:
Accuacy: 0.90
Precision: 0.00
Recall: 0.00
F1: 0.00
