# Supervised Machine Learning Logistic Regression Model

In [9]:
# import dependencies
from getpass import getpass
import sqlalchemy
from sqlalchemy import Column, Integer, String, ForeignKey, create_engine, text
import psycopg2

import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

# Establish connection with Database and read in table

In [2]:
# SQL Alchemy connection from RDS
secret = getpass('Enter the secret value: ')

args ={
    'host':"ogdataset.c11hekhsylui.us-west-1.rds.amazonaws.com",
    'port':'5432',
    'database':"og_dataset",
    'user':"attritionProject",
    'password':secret
}
engine = create_engine("postgresql://{user}:{password}@{host}:{port}/{database}".format(**args))
connection = engine.connect()

Enter the secret value: ········


In [4]:
# read in table from database
attrition_df = pd.read_sql('SELECT * FROM encoded_data', connection)
attrition_df

Unnamed: 0,Age,Attrition,Distance from Home,Monthly Income,Number Companies Worked,Percent Salary Hike,Total Working Years,Training Times Last Year,Years at Company,Years In Current Role,...,Stock Option Level_1,Stock Option Level_2,Stock Option Level_3,Education Level_1,Education Level_2,Education Level_3,Education Level_4,Education Level_5,Performance Rating_3,Performance Rating_4
0,41,1,1,5993,8,11,8,0,6,4,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,49,0,8,5130,1,23,10,3,10,7,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,37,1,2,2090,6,15,7,3,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33,0,3,2909,1,11,8,3,8,7,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,27,0,2,3468,9,12,6,3,2,2,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,23,2571,4,17,17,3,5,2,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1466,39,0,6,9991,4,15,9,5,7,7,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1467,27,0,4,6142,1,20,6,0,6,2,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1468,49,0,2,5390,2,14,17,3,9,6,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


# Evaluate discrepancy between attrition values

In [5]:
# determine how many yes/no (1/0) values we have
attrition_df["Attrition"].value_counts()

0    1233
1     237
Name: Attrition, dtype: int64

# Split data into training and test groups

In [10]:
# Split our preprocessed data into our features and target arrays
X = attrition_df.drop("Attrition",1).values
y = attrition_df["Attrition"].values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

  


# Combination Sampling using Random Oversampling

In [11]:
# Using random oversampling increase the number of minority class (Yes values)
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [12]:
# Count the resampled classes
Counter(y_resampled)

Counter({1: 924, 0: 924})

# Create and Train Model

In [13]:
# create a LogisticRegression instance
model = LogisticRegression(solver="lbfgs", max_iter=150)

In [14]:
# fit/train the model to classify Attrition using the resampled data set
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=150)

# Evaluation Metrics

In [15]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[221,  88],
       [ 14,  45]])

In [16]:
# Assess the Model's Accuracy w/ the Balanced Accuracy Score
balanced_accuracy_score(y_test, y_pred)

0.7389611101969173

In [17]:
# Print the imbalanced classification report to evaluate the precision and sensitivity of the model
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.72      0.76      0.81      0.74      0.54       309
          1       0.34      0.76      0.72      0.47      0.74      0.55        59

avg / total       0.84      0.72      0.76      0.76      0.74      0.54       368

