In [5]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
import sqlalchemy
import psycopg2

---

## Split the Data into Training and Testing Sets

In [6]:
#create sql engine
engine = sqlalchemy.create_engine("postgresql+psycopg2://postgres:postgres@localhost/project4")

In [7]:
#read in sql data
with engine.connect() as conn:
    insurance_df = pd.read_sql('select * from healthinsurance;', conn)
insurance_df.head()

Unnamed: 0,age,sex,weight,bmi,no_of_dependents,smoker,bloodpressure,diabetes,regular_ex,bin_claim
0,60.0,0.0,64,24.3,1,0,72,0,0,medium-high
1,49.0,1.0,75,22.6,1,0,78,1,1,medium-low
2,32.0,1.0,64,17.8,2,1,88,1,1,high
3,61.0,1.0,53,36.4,1,1,72,1,0,high
4,19.0,1.0,50,20.6,0,0,82,1,0,low


### Step 2: Create the labels set (`y`)  from the “claim” column, and then create the features (`X`) DataFrame from the remaining columns.

In [8]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = insurance_df["bin_claim"]

# Separate the X variable, the features
X = insurance_df.drop(columns="bin_claim")

In [9]:
# Review the y variable Series
print(y)

0        medium-high
1         medium-low
2               high
3               high
4                low
            ...     
13643           high
13644     medium-low
13645            low
13646     medium-low
13647           high
Name: bin_claim, Length: 13648, dtype: object


In [10]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,age,sex,weight,bmi,no_of_dependents,smoker,bloodpressure,diabetes,regular_ex
0,60.0,0.0,64,24.3,1,0,72,0,0
1,49.0,1.0,75,22.6,1,0,78,1,1
2,32.0,1.0,64,17.8,2,1,88,1,1
3,61.0,1.0,53,36.4,1,1,72,1,0
4,19.0,1.0,50,20.6,0,0,82,1,0


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [11]:
# Check the balance of our target values
print(y.value_counts())

medium-low     3422
low            3420
high           3406
medium-high    3400
Name: bin_claim, dtype: int64


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [12]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

---

## Create a Logistic Regression Model with the Original Data

Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [13]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [14]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
4301,medium-low,medium-low
9881,high,high
3313,high,high
3494,low,low
12702,high,high
...,...,...
6260,medium-high,medium-high
5314,medium-high,medium-low
5634,medium-high,medium-low
2331,medium-high,medium-high


Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [15]:
# Print the balanced_accuracy score of the model
print( f'The balanced accuracy score of the model is : {balanced_accuracy_score(y_test, predictions)}')

The balanced accuracy score of the model is : 0.7333553943333612


In [16]:
# Generate a confusion matrix for the model
test_matrix = confusion_matrix(y_test, predictions)
test_matrix

array([[578, 107,  96,  70],
       [  3, 695,   0, 157],
       [ 43,  49, 680,  78],
       [ 15,  44, 248, 549]])

In [17]:
# Print the classification report for the model
testing_report = classification_report(y_test, predictions)
print(testing_report)

              precision    recall  f1-score   support

        high       0.90      0.68      0.78       851
         low       0.78      0.81      0.79       855
 medium-high       0.66      0.80      0.73       850
  medium-low       0.64      0.64      0.64       856

    accuracy                           0.73      3412
   macro avg       0.75      0.73      0.73      3412
weighted avg       0.75      0.73      0.73      3412



---

## Predict a Logistic Regression Model with Resampled Training Data

Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [18]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
random_over_sampler = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = random_over_sampler.fit_resample(X_train, y_train)

In [19]:
# Count the distinct values of the resampled labels data
print(y_resampled.value_counts())

medium-low     2566
high           2566
low            2566
medium-high    2566
Name: bin_claim, dtype: int64


Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [20]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the resampled training data
classifier.fit(X_resampled, y_resampled)

# Make a prediction using the testing data
pred_resampled = classifier.predict(X_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [21]:
# Print the balanced_accuracy score of the model 
print( f'The balanced accuracy score of the resampled model is : {balanced_accuracy_score(y_resampled, pred_resampled)}')

The balanced accuracy score of the resampled model is : 0.7451286048324239


In [22]:
# Generate a confusion matrix for the model
Conf_matrix = confusion_matrix(y_resampled, pred_resampled)
Conf_matrix

array([[1757,  246,  330,  233],
       [   3, 2210,    0,  353],
       [ 151,  121, 1968,  326],
       [  29,  223,  601, 1713]])

In [23]:
# Print the classification report for the model
resampled_report = classification_report(y_resampled, pred_resampled)
print(resampled_report)

              precision    recall  f1-score   support

        high       0.91      0.68      0.78      2566
         low       0.79      0.86      0.82      2566
 medium-high       0.68      0.77      0.72      2566
  medium-low       0.65      0.67      0.66      2566

    accuracy                           0.75     10264
   macro avg       0.76      0.75      0.75     10264
weighted avg       0.76      0.75      0.75     10264

