In [2]:
# import utility modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# import ml tools for prediction
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
# load dataset and print counts
df = pd.read_csv('data/train8(all_dummy).csv')
cols_to_drop = ['id', 'ni.age', 'len.at.res', 'premium']
df = df.drop(cols_to_drop, axis=1)
print(df['cancel'].value_counts())
df.loc[(df['cancel'] == 2)] = 1 # replace all 2's w/ 1's
print(df['cancel'].value_counts())

0    713637
2    222979
1     73486
Name: cancel, dtype: int64
0    713637
1    296465
Name: cancel, dtype: int64


In [18]:
# split and train w/o balancing
X = df.drop('cancel', axis=1).values
y = df['cancel'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [19]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)
pred = model.predict(X_test)
# compare_df = pd.DataFrame({'Actual':y_test, 'Predicted': pred})
print("--UNBALANCED DATA--")
print(classification_report(y_test, pred))

--UNBALANCED DATA--
              precision    recall  f1-score   support

           0       0.91      1.00      0.95    213737
           1       1.00      0.75      0.86     89294

    accuracy                           0.93    303031
   macro avg       0.95      0.88      0.90    303031
weighted avg       0.93      0.93      0.92    303031



In [20]:
model.score(X_test, y_test)

0.9262187696968297

In [21]:
# print confusion matrix
cm = confusion_matrix(y_test, pred)
print(cm)

[[213506    231]
 [ 22127  67167]]


In [22]:
print("Actual count of categories in 'cancel' w/o balancing:")
print("category '0': {}".format(sum(y_train == 0)))
print("category '1 or 2': {}".format(sum(y_train == 1)))

Actual count of categories in 'cancel' w/o balancing:
category '0': 499900
category '1 or 2': 207171


In [23]:
# split and train using oversampling
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

In [24]:
print("New count of categories in 'cancel' w/ balancing:")
print("category '0': {}".format(sum(y_train_res == 0)))
print("category '1 or 2': {}".format(sum(y_train_res == 1)))

New count of categories in 'cancel' w/ balancing:
category '0': 499900
category '1 or 2': 499900


In [25]:
print("--BALANCED DATA--")
balanced_model = LogisticRegression()
balanced_model.fit(X_train_res, y_train_res.ravel())
pred = balanced_model.predict(X_test)
print(classification_report(y_test, pred))

--BALANCED DATA--


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.91      0.95      0.93    213737
           1       0.87      0.76      0.81     89294

    accuracy                           0.90    303031
   macro avg       0.89      0.86      0.87    303031
weighted avg       0.89      0.90      0.89    303031



In [26]:
# print confusion matrix
cm = confusion_matrix(y_test, pred)
print(cm)

[[203379  10358]
 [ 21186  68108]]


In [27]:
model.score(X_test, y_test)

0.9262187696968297