# **Binary Classification with Both Algorithms**

# **Importing Libraries**

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# **Loading The Dataset**

In [4]:
diabetes = pd.read_csv (r"C:\Users\bbuser\Desktop\NumPy\DataScience-Brain-Bytes\from_deena\data\diabetes.csv")
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# **Exploring The Dataset**

In [10]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [13]:
# Outcome = 0 (No Diabetes), 1 (Diabetes)
diabetes['Outcome'].value_counts() # Target column

Outcome
0    500
1    268
Name: count, dtype: int64

# **Split Features & Labels**

In [17]:
X = diabetes.drop('Outcome', axis=1)  # features
y = diabetes['Outcome']               # target label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# **Train Logistic Regression**

In [20]:
logistic_regression = LogisticRegression(max_iter=500)
logistic_regression.fit(X_train, y_train)
y_predicted_logistic = logistic_regression.predict(X_test)

# **Train K-Nearest Neighbors**

In [21]:
knn = KNeighborsClassifier(n_neighbors=5)  # try k=5
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# **Evaluate Both Models**

In [23]:
# Logistic Regression Metrics
print("Logistic Regression Results:")
print(classification_report(y_test, y_predicted_logistic))

print("--------------------------------------------------------\n")

# KNN Metrics
print("KNN Results:")
print(classification_report(y_test, y_pred_knn))

Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.76      0.82      0.79       100
           1       0.61      0.52      0.56        54

    accuracy                           0.71       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.71      0.71      0.71       154

--------------------------------------------------------

KNN Results:
              precision    recall  f1-score   support

           0       0.73      0.77      0.75       100
           1       0.53      0.48      0.50        54

    accuracy                           0.67       154
   macro avg       0.63      0.63      0.63       154
weighted avg       0.66      0.67      0.66       154

