### Implement logistic regression for binary classification.

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Read the diabetes dataset from CSV and preview its first rows. The preview
# is printed before encoding, so smoking_history still shows its text labels.
data = pd.read_csv('Dataset/diabetes_prediction_dataset.csv')
print(data.head())

# Integer code assigned to each smoking_history category.
smoking_codes = {
    'ever': 0,
    'never': 1,
    'former': 2,
    'current': 3,
    'not current': 4,
    'No Info': 5,
}
data['smoking_history'] = data['smoking_history'].map(smoking_codes)

# Count missing values per column. Series.map yields NaN for any category
# absent from the dict, so an unmapped label would show up in this report.
missing_per_column = data.isnull().sum()
print('Null Values\n', missing_per_column)

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
Null Values
 gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [3]:
# Extract features and label.
# Columns are selected by name rather than by position so the feature set is
# explicit and does not silently shift if the CSV's column order changes.
# These five columns sit at positions 2-6 of the loaded frame.
# NOTE(review): gender, age and blood_glucose_level are not used as features;
# confirm this omission is intentional.
FEATURE_COLUMNS = ['hypertension', 'heart_disease', 'smoking_history', 'bmi', 'HbA1c_level']
TARGET_COLUMN = 'diabetes'  # last column of the frame

X = data[FEATURE_COLUMNS].to_numpy()
y = data[TARGET_COLUMN].to_numpy()

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Echo the training feature matrix (one row per sample, one column per feature).
X_train

array([[ 0.  ,  0.  ,  2.  , 24.77,  3.5 ],
       [ 0.  ,  0.  ,  1.  , 24.6 ,  5.7 ],
       [ 0.  ,  0.  ,  0.  , 24.33,  4.  ],
       ...,
       [ 0.  ,  0.  ,  1.  , 26.14,  5.8 ],
       [ 0.  ,  0.  ,  1.  , 24.96,  6.2 ],
       [ 0.  ,  0.  ,  1.  , 27.99,  5.  ]])

In [6]:
# Echo the training labels that correspond row-for-row to X_train.
y_train

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [7]:
# Echo the held-out feature matrix used for evaluation.
X_test

array([[ 0.  ,  0.  ,  5.  , 20.82,  5.8 ],
       [ 0.  ,  0.  ,  5.  , 21.  ,  5.  ],
       [ 0.  ,  0.  ,  2.  , 25.32,  3.5 ],
       ...,
       [ 0.  ,  0.  ,  1.  , 26.51,  4.8 ],
       [ 0.  ,  1.  ,  5.  , 27.32,  6.6 ],
       [ 0.  ,  0.  ,  1.  , 23.86,  5.8 ]])

In [8]:
# Echo the held-out labels that correspond row-for-row to X_test.
y_test

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [13]:
# Build a logistic regression estimator with scikit-learn's default settings.
model = LogisticRegression()

# Fit it on the training split. The call is left as the cell's last expression
# so the notebook echoes the fitted estimator.
model.fit(X=X_train, y=y_train)

LogisticRegression()

In [14]:
# Predict a class label for every held-out row, then echo the resulting array.
y_pred = model.predict(X=X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [15]:
# Score the held-out predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# Print each metric under its own heading, separated by blank lines.
report = (
    ("\nAccuracy:", accuracy),
    ("\nPrecision:", precision),
    ("\nRecall:", recall),
    ("\nF1-score:", f1),
    ("\nConfusion Matrix:\n", cm),
)
for heading, value in report:
    print(heading, value)
print()


Accuracy: 0.9432

Precision: 0.848780487804878

Recall: 0.40749414519906324

F1-score: 0.5506329113924051

Confusion Matrix:
 [[18168   124]
 [ 1012   696]]

