<a href="https://colab.research.google.com/github/juma-paul/deep_learning/blob/main/Logisitc_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression Classification

In [1]:
# Mount Google Drive at /content/drive (Colab-only); the dataset path used
# when loading the data lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Import necessary packages
import pandas as pd
import numpy as np

In [3]:
# Read the CSV stored at path_to_data into the `insurance` DataFrame
path_to_data = '/content/drive/MyDrive/Datasets/DataScienceCaseStudy/final_clean_data.csv'
insurance = pd.read_csv(filepath_or_buffer=path_to_data)

In [4]:
# Preview the first rows of the freshly loaded data
insurance.head()

Unnamed: 0,pet_name,service,check up,dental cleaning,lab test,surgery,vaccination,x-ray,auth_status
0,Sophie,vaccination,0.0,0.0,0.0,0.0,1162.17,0.0,Approved
1,Sophie,vaccination,0.0,0.0,0.0,0.0,921.42,0.0,Approved
2,Sophie,vaccination,0.0,0.0,0.0,0.0,1295.25,0.0,Approved
3,Harper,lab test,0.0,0.0,1712.08,0.0,0.0,0.0,Approved
4,Harper,lab test,0.0,0.0,1070.22,0.0,0.0,0.0,Approved


In [5]:
# Keep the service column, the six per-service numeric columns and the
# auth_status target; .copy() detaches the result from the original frame.
insurance = insurance.loc[
    :,
    ['service', 'check up', 'dental cleaning', 'lab test',
     'surgery', 'vaccination', 'x-ray', 'auth_status'],
].copy()

In [6]:
# Preview the first rows after the column selection
insurance.head()

Unnamed: 0,service,check up,dental cleaning,lab test,surgery,vaccination,x-ray,auth_status
0,vaccination,0.0,0.0,0.0,0.0,1162.17,0.0,Approved
1,vaccination,0.0,0.0,0.0,0.0,921.42,0.0,Approved
2,vaccination,0.0,0.0,0.0,0.0,1295.25,0.0,Approved
3,lab test,0.0,0.0,1712.08,0.0,0.0,0.0,Approved
4,lab test,0.0,0.0,1070.22,0.0,0.0,0.0,Approved


### 1. Preprocess and split the data into training and test sets

In [7]:
# Treat every Pending status as Denied, then list the remaining distinct labels
insurance['auth_status'] = insurance['auth_status'].replace(to_replace='Pending', value='Denied')
np.unique(insurance['auth_status'])

array(['Approved', 'Denied'], dtype=object)

In [8]:
# Encode Approved as 1 and Denied as 0.
# An explicit map + astype yields an int64 column without relying on `replace`
# silently downcasting an object result (deprecated since pandas 2.2).
status_codes = {'Approved': 1, 'Denied': 0}
if not pd.api.types.is_numeric_dtype(insurance['auth_status']):
    # The guard keeps the cell idempotent: re-running it must not re-map 0/1.
    # A label missing from status_codes becomes NaN and makes astype raise,
    # so an unexpected status fails loudly instead of leaking into the target.
    insurance['auth_status'] = insurance['auth_status'].map(status_codes).astype('int64')
np.unique(insurance['auth_status'])

array([0, 1])

In [9]:
# Spot-check a few encoded rows; the seed keeps the displayed rows stable
# across Restart & Run All.
insurance.sample(3, random_state=42)

Unnamed: 0,service,check up,dental cleaning,lab test,surgery,vaccination,x-ray,auth_status
433,check up,332.33,0.0,0.0,0.0,0.0,0.0,1
862,x-ray,0.0,0.0,0.0,0.0,0.0,177.19,0
683,surgery,0.0,0.0,0.0,29.84,0.0,0.0,1


In [10]:
# Separate the label column (y) from the remaining feature columns (X)
y = insurance['auth_status']
X = insurance.drop('auth_status', axis=1)

In [11]:
# Inspect the last rows of the feature frame
X.tail()

Unnamed: 0,service,check up,dental cleaning,lab test,surgery,vaccination,x-ray
926,lab test,0.0,0.0,427.07,0.0,0.0,0.0
927,lab test,0.0,0.0,653.31,0.0,0.0,0.0
928,lab test,0.0,0.0,1254.74,0.0,0.0,0.0
929,lab test,0.0,0.0,540.97,0.0,0.0,0.0
930,lab test,0.0,0.0,1683.93,0.0,0.0,0.0


In [12]:
# Spot-check a few labels; the seed keeps the displayed rows stable
# across Restart & Run All.
y.sample(3, random_state=42)

Unnamed: 0,auth_status
738,0
49,1
760,1


In [13]:
# Show each feature's dtype to see which columns need encoding or scaling
X.dtypes

Unnamed: 0,0
service,object
check up,float64
dental cleaning,float64
lab test,float64
surgery,float64
vaccination,float64
x-ray,float64


In [33]:
# Extract the names of the numeric (float64/int64) and categorical (object) columns
numeric_data = X.select_dtypes(include=['float64', 'int64']).columns
categorical_data = X.select_dtypes(include=['object']).columns

In [15]:
# Encode all categorical variables using one-hot encoding.
# The column list must be passed as `columns=`: the second positional
# parameter of pd.get_dummies is `prefix`, so a positional argument would be
# interpreted as prefix labels rather than as the columns to encode.
# drop_first=True drops one level per encoded column.
X = pd.get_dummies(X, columns=list(categorical_data), drop_first=True)

In [16]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Scale the numeric features: fit the scaler on the training set only, then
# apply the same fitted scaler to the test set.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Work on explicit copies so the assignments below never write into a frame
# derived from another one by train_test_split (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Always read the unscaled values from X (the split keeps X's row labels), so
# re-running this cell does not scale already-scaled data a second time.
X_train[numeric_data] = scaler.fit_transform(X.loc[X_train.index, numeric_data])
X_test[numeric_data] = scaler.transform(X.loc[X_test.index, numeric_data])

In [18]:
# Display the scaled and encoded training features
X_train

Unnamed: 0,check up,dental cleaning,lab test,surgery,vaccination,x-ray,service_dental cleaning,service_lab test,service_surgery,service_vaccination,service_x-ray
54,-0.423748,-0.377512,-0.235114,-0.297273,-0.330528,-0.410106,False,True,False,False,False
299,3.550615,-0.377512,-0.314120,-0.297273,-0.330528,-0.410106,False,False,False,False,False
501,-0.423748,1.080678,-0.314120,-0.297273,-0.330528,-0.410106,True,False,False,False,False
810,1.314285,-0.377512,-0.314120,-0.297273,-0.330528,-0.410106,False,False,False,False,False
394,-0.423748,-0.377512,-0.314120,-0.297273,-0.330528,2.583926,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
106,-0.423748,-0.377512,-0.314120,3.337276,-0.330528,-0.410106,False,False,True,False,False
270,3.063117,-0.377512,-0.314120,-0.297273,-0.330528,-0.410106,False,False,False,False,False
860,-0.423748,-0.377512,-0.314120,-0.297273,-0.330528,1.822302,False,False,False,False,True
435,0.702760,-0.377512,-0.314120,-0.297273,-0.330528,-0.410106,False,False,False,False,False


In [19]:
# Display the test features after applying the scaler fitted on the training set
X_test

Unnamed: 0,check up,dental cleaning,lab test,surgery,vaccination,x-ray,service_dental cleaning,service_lab test,service_surgery,service_vaccination,service_x-ray
828,-0.423748,-0.377512,-0.314120,4.570187,-0.330528,-0.410106,False,False,True,False,False
70,-0.423748,-0.377512,-0.314120,3.158068,-0.330528,-0.410106,False,False,True,False,False
630,-0.423748,-0.377512,-0.314120,-0.225214,-0.330528,-0.410106,False,False,True,False,False
506,0.337885,-0.377512,-0.314120,-0.297273,-0.330528,-0.410106,False,False,False,False,False
842,-0.423748,-0.377512,-0.314120,1.173311,-0.330528,-0.410106,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
2,-0.423748,-0.377512,-0.314120,-0.297273,3.153059,-0.410106,False,False,False,True,False
834,-0.423748,-0.377512,-0.314120,3.599202,-0.330528,-0.410106,False,False,True,False,False
735,3.335062,-0.377512,-0.314120,-0.297273,-0.330528,-0.410106,False,False,False,False,False
544,-0.423748,-0.377512,-0.314120,-0.297273,1.499602,-0.410106,False,False,False,True,False


### 2. Train a logistic regression model on the training data.

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [21]:
# Fit a logistic regression classifier with default settings on the training split
model = LogisticRegression()

model.fit(X=X_train, y=y_train)

In [22]:
# Predict labels for the held-out test features
y_pred = model.predict(X=X_test)

### 3. Evaluate your model's performance using appropriate metrics

In [23]:
# Accuracy Score: share of test cases whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.82


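The model classifies approved and denied cases with an overall accuracy of 82% on the held-out test set. For reference, always predicting the majority class (approved, 113 of the 187 test cases) would score about 60%, so this is fairly good but can be improved.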

In [24]:
# Confusion Matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[54 20]
 [14 99]]


The model correctly identifies 99 of the 113 approved cases and 54 of the 74 denied cases. However, there are 20 false positives (denied cases incorrectly classified as approved) and 14 false negatives (approved cases incorrectly classified as denied).

In [25]:
# Classification Report: per-class precision, recall, F1 and support
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:", class_report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.73      0.76        74
           1       0.83      0.88      0.85       113

    accuracy                           0.82       187
   macro avg       0.81      0.80      0.81       187
weighted avg       0.82      0.82      0.82       187



For the approved class, precision is 83% (83% of the cases predicted as approved were actually approved) and recall is 88% (the model captured 88% of the actual approved cases). For the denied class, precision is 79% (79% of the cases predicted as denied were actually denied) and recall is 73% (the model identified 73% of all actual denied cases).

# Multi-layer Perceptron Classifier


In [26]:
from sklearn.neural_network import MLPClassifier

In [27]:
# Fix the seed so weight initialisation and mini-batch shuffling are
# reproducible across Restart & Run All (same seed as the train/test split).
mlp = MLPClassifier(random_state=42)

In [28]:
# Train the multi-layer perceptron on the same training split as the logistic regression
mlp.fit(X=X_train, y=y_train)



In [29]:
# Predict labels for the held-out test features with the MLP
mlp_pred = mlp.predict(X=X_test)

In [30]:
# Accuracy Score of the MLP predictions (overwrites the earlier `accuracy` value)
accuracy = accuracy_score(y_true=y_test, y_pred=mlp_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.82


In [31]:
# Confusion Matrix of the MLP predictions (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=mlp_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[54 20]
 [14 99]]


In [32]:
# Classification Report of the MLP predictions: per-class precision, recall, F1 and support
class_report = classification_report(y_true=y_test, y_pred=mlp_pred)
print("Classification Report:", class_report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.73      0.76        74
           1       0.83      0.88      0.85       113

    accuracy                           0.82       187
   macro avg       0.81      0.80      0.81       187
weighted avg       0.82      0.82      0.82       187



### Conclusion

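Both the logistic regression model and the multi-layer perceptron demonstrate good overall performance: each accurately predicts 82% of the test cases, and their confusion matrices are identical. Both exhibit a stronger capability in identifying approved cases compared to denied cases, as evidenced by the higher precision and recall values for the approved class.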