In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('diabetes_prediction_dataset.csv')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
df.shape

(100000, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [5]:
for obj_col in df[['gender','smoking_history']]:
    print(df[obj_col].unique())

['Female' 'Male' 'Other']
['never' 'No Info' 'current' 'former' 'ever' 'not current']


In [6]:
from sklearn.preprocessing import LabelEncoder
le_columns = ['gender','smoking_history']
le = LabelEncoder()
for cols in df[le_columns]:
    df[cols] = le.fit_transform(df[cols])  
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [7]:
x = df.drop(columns=['diabetes'])
y = df['diabetes']

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.20,random_state=42)

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [10]:
x_train = scaler.fit_transform(x_train)

In [11]:
x_test = scaler.transform(x_test)

In [12]:
from sklearn.linear_model import LogisticRegression
regressor = LogisticRegression()

In [13]:
regressor.fit(x_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [14]:
regressor.score(x_test,y_test)

0.95865

In [15]:
regressor.predict(x_test)

array([0, 0, 0, ..., 0, 0, 0], shape=(20000,))

In [16]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [17]:
df['smoking_history'].value_counts()

smoking_history
0    35816
4    35095
3     9352
1     9286
5     6447
2     4004
Name: count, dtype: int64

In [18]:
pred_data = pd.DataFrame([['0','54','0','1','4','20.14','5.0','80']],
                        columns=['gender','age','hypertension','heart_disease','smoking_history','bmi','HbA1c_level','blood_glucose_level'])
pred_data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,0,54,0,1,4,20.14,5.0,80


In [19]:
scaler.transform(pred_data)

array([[-0.84221718,  0.53418034, -0.28630923,  4.9125881 ,  0.96268037,
        -1.08636167, -0.4915737 , -1.42515661]])

In [20]:
regressor.predict(pred_data)



array([1])

In [22]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
print("Accuracy:", acc)

Accuracy: 0.97035


In [21]:
import pickle as pk

pk.dump(regressor,open("regressor.pkl",'wb'))
pk.dump(scaler,open("scaler.pkl",'wb'))