In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression

In [5]:
data = pd.read_csv('diabetes_prediction_dataset.csv')

In [6]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [8]:
data.dtypes

gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

In [9]:
data.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
99995     True
99996    False
99997    False
99998    False
99999    False
Length: 100000, dtype: bool

In [10]:
data.duplicated().sum()

3854

In [11]:
data.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [12]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [13]:
data['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [14]:
data['heart_disease'].value_counts()

heart_disease
0    96058
1     3942
Name: count, dtype: int64

In [15]:
data['gender'].value_counts()

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64

In [16]:
# data.hist(figsize=(8,8), color='purple')
# plt.show()

In [17]:
encoder = LabelEncoder()

data['gender'] = encoder.fit_transform(data['gender'])

data['smoking_history'] = encoder.fit_transform(data['smoking_history'])


In [18]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [19]:
X = data.drop('diabetes', axis=1)
y = data['diabetes']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
X_train.shape, X_test.shape, X.shape

((67000, 8), (33000, 8), (100000, 8))

In [22]:
lr = LogisticRegression(max_iter=3000)
lr.fit(X_train, y_train)

In [23]:
y_pred = lr.predict(X_test)

In [24]:
# plt.scatter(y_test, y_pred)
# plt.show()

In [27]:
cdf = pd.DataFrame(lr.coef_.reshape(-1), index=X.columns, columns=["Coefficient"])
cdf

Unnamed: 0,Coefficient
gender,0.243025
age,0.046474
hypertension,0.72334
heart_disease,0.709467
smoking_history,0.093997
bmi,0.087496
HbA1c_level,2.338993
blood_glucose_level,0.033422


In [28]:
print(accuracy_score(y_test, y_pred))
print('\n')
print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

0.9594545454545454


[[29960   256]
 [ 1082  1702]]


              precision    recall  f1-score   support

           0       0.97      0.99      0.98     30216
           1       0.87      0.61      0.72      2784

    accuracy                           0.96     33000
   macro avg       0.92      0.80      0.85     33000
weighted avg       0.96      0.96      0.96     33000



In [29]:
test_array = np.array([[1,45,1,0,2,28.5,7.2,180]])
print(test_array)

[[  1.   45.    1.    0.    2.   28.5   7.2 180. ]]


In [30]:
pred = lr.predict(test_array)
pred = "Positive" if pred == 1 else "Negative"
print("Diabetes Prediction:", pred)

Diabetes Prediction: Positive


