# Diabetes Detection
## Author: Jainam Shah
### Algorithms: Random Forest, Logistic Regression, kNN

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,blood pressure,skin thickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1.0
1,,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0.0
2,,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1.0
3,,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769 entries, 0 to 768
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                0 non-null      float64
 1   Pregnancies               768 non-null    float64
 2   Glucose                   768 non-null    float64
 3   blood pressure            768 non-null    float64
 4   skin thickness            768 non-null    float64
 5   Insulin                   768 non-null    float64
 6   BMI                       768 non-null    float64
 7   DiabetesPedigreeFunction  768 non-null    float64
 8   Age                       768 non-null    float64
 9   Outcome                   768 non-null    float64
dtypes: float64(10)
memory usage: 60.2 KB


In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,blood pressure,skin thickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,0.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
df.drop(["Unnamed: 0"], axis=1, inplace=True)
df.head()

Unnamed: 0,Pregnancies,Glucose,blood pressure,skin thickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


In [6]:
df.isnull().sum()

Pregnancies                 1
Glucose                     1
blood pressure              1
skin thickness              1
Insulin                     1
BMI                         1
DiabetesPedigreeFunction    1
Age                         1
Outcome                     1
dtype: int64

In [7]:
dfc = df.dropna()
dfc

Unnamed: 0,Pregnancies,Glucose,blood pressure,skin thickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,0.0
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,0.0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,0.0
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,1.0


In [8]:
dfc.isnull().sum()

Pregnancies                 0
Glucose                     0
blood pressure              0
skin thickness              0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
X = dfc.iloc[:,:-1]
y = dfc.iloc[:,-1]

# Machine learning Modelling

#### Splitting of Data

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=25, random_state=0)

### Random Forest Classification

In [11]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=6, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score
acc_rfc = round(accuracy_score(y_pred, y_test), 2)*100
print("accuracy of rfc: ", acc_rfc)

accuracy of rfc:  88.0


### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, r2_score, classification_report
logreg = LogisticRegression(solver = 'lbfgs',max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
acc_lr = round(accuracy_score(y_pred, y_test), 2)*100
print("Accuracy of Log Reg classifier is: ",acc_lr)

Accuracy of Log Reg classifier is:  96.0


### kNN Classification

In [14]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
from sklearn.metrics import accuracy_score
acc_knn = round(accuracy_score(y_pred, y_test), 2)*100
print("Accuracy score of KNN is: ",acc_knn)

Accuracy score of KNN is:  88.0
