In [119]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier

%matplotlib inline
# sns.set(rc={'figure.figsize': [18, 18]}, font_scale=1.4)

In [120]:
# Load the diabetes symptom survey from the CSV that sits next to the notebook.
csv_path = './diabetes_data_upload.csv'
df = pd.read_csv(csv_path)

In [121]:
# Show the freshly loaded frame; the notebook repr prints only the first and
# last rows, with an elided block in between.
df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,Positive
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative


In [122]:
# List the column labels: Age, Gender, the symptom flags, and the `class` target
# as the last column.
df.columns

Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')

In [123]:
# Count rows that exactly repeat an earlier row (True) versus first occurrences (False).
is_repeat_row = df.duplicated()
is_repeat_row.value_counts()

True     269
False    251
dtype: int64

In [124]:
# Same duplicate check with Age left out, i.e. rows that share gender, every
# symptom flag and the class label.
without_age = df.drop(columns=['Age'])
without_age.duplicated().value_counts()

True     305
False    215
dtype: int64

In [125]:
# Duplicate check with both Age and the class label left out, so rows are
# compared on gender and symptom flags only.
without_age_and_label = df.drop(columns=['Age', 'class'])
without_age_and_label.duplicated().value_counts()

True     306
False    214
dtype: int64

In [126]:
# Frequency table for every column: the column name, its value counts, then two
# blank lines as a separator.
cols = df.columns

for col in cols:
    frequencies = df[col].value_counts()
    print(col, frequencies, "", "", sep="\n")

Age
35    30
48    28
30    25
43    25
40    24
55    22
47    21
38    20
53    20
45    18
58    18
50    18
54    16
39    16
57    15
60    15
68    10
28     9
42     9
72     9
66     9
36     8
61     8
56     8
46     8
67     8
62     7
37     7
49     7
44     7
27     6
34     6
65     6
32     5
69     5
51     5
70     5
64     5
33     4
59     4
41     4
52     4
63     3
31     3
90     2
85     2
25     2
79     1
16     1
29     1
26     1
Name: Age, dtype: int64


Gender
Male      328
Female    192
Name: Gender, dtype: int64


Polyuria
No     262
Yes    258
Name: Polyuria, dtype: int64


Polydipsia
No     287
Yes    233
Name: Polydipsia, dtype: int64


sudden weight loss
No     303
Yes    217
Name: sudden weight loss, dtype: int64


weakness
Yes    305
No     215
Name: weakness, dtype: int64


Polyphagia
No     283
Yes    237
Name: Polyphagia, dtype: int64


Genital thrush
No     404
Yes    116
Name: Genital thrush, dtype: int64


visual blurring
No     287
Yes    2

In [127]:
# Missing-value audit: for every column, count null (True) versus present
# (False) entries, followed by two blank lines as a separator.
cols = df.columns

for col in cols:
    null_flags = df[col].isna()
    print(col)
    print(null_flags.value_counts(), end="\n\n\n")

Age
False    520
Name: Age, dtype: int64


Gender
False    520
Name: Gender, dtype: int64


Polyuria
False    520
Name: Polyuria, dtype: int64


Polydipsia
False    520
Name: Polydipsia, dtype: int64


sudden weight loss
False    520
Name: sudden weight loss, dtype: int64


weakness
False    520
Name: weakness, dtype: int64


Polyphagia
False    520
Name: Polyphagia, dtype: int64


Genital thrush
False    520
Name: Genital thrush, dtype: int64


visual blurring
False    520
Name: visual blurring, dtype: int64


Itching
False    520
Name: Itching, dtype: int64


Irritability
False    520
Name: Irritability, dtype: int64


delayed healing
False    520
Name: delayed healing, dtype: int64


partial paresis
False    520
Name: partial paresis, dtype: int64


muscle stiffness
False    520
Name: muscle stiffness, dtype: int64


Alopecia
False    520
Name: Alopecia, dtype: int64


Obesity
False    520
Name: Obesity, dtype: int64


class
False    520
Name: class, dtype: int64




In [128]:
# Encode Gender as an integer flag: Male -> 0, Female -> 1.
gender_map = {'Male': 0, 'Female': 1}

# Series.map() turns every value that is not a key of the mapping into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# wipe the column. Only encode while the column still holds the raw labels.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map(gender_map)

In [129]:
# Encode the target column: Positive -> 1, Negative -> 0.
class_map = {'Positive': 1, 'Negative': 0}
# Original name of this mapping (it maps class labels, not ages); kept as an
# alias in case other cells still refer to it.
age_map = class_map

# Series.map() turns every value that is not a key of the mapping into NaN, so
# re-running this cell on the already-encoded column would wipe the target.
# Only encode while the column still holds the raw labels.
if not pd.api.types.is_numeric_dtype(df['class']):
    df['class'] = df['class'].map(class_map)

 

In [130]:
# Encode the Yes/No answers as 1/0 in every column. replace() leaves values
# that are not keys of the mapping untouched, so numeric columns pass through
# unchanged and the cell is safe to re-run.
yes_no_map = {'No': 0, 'Yes': 1}

for col in df.columns:
    # infer_objects() makes the object -> integer conversion explicit instead
    # of relying on replace()'s implicit downcasting, which pandas 2.2
    # deprecates (FutureWarning).
    df[col] = df[col].replace(yes_no_map).infer_objects()

### Split

In [131]:
# Features are every column except the last; the target is the last column
# (`class` in this frame).
target_pos = df.shape[1] - 1
X = df.iloc[:, :target_pos]
y = df.iloc[:, target_pos]

In [132]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): df.duplicated() reported 269 repeated rows earlier in this
# notebook. A random split can put copies of the same row in both train and
# test, which would inflate test accuracy — consider de-duplicating first.
splits = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = splits

## Models

### KNN

In [133]:
# Re-display the frame after the in-place encoding cells: Gender, the Yes/No
# columns and `class` now show integer codes instead of strings.
df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,0,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,1,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,48,1,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,58,1,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,32,1,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0


In [134]:
from sklearn.neighbors import KNeighborsClassifier

In [135]:
# Candidate neighbour counts for KNN: the odd numbers 1, 3, ..., 99
# (presumably odd so that a two-class vote cannot tie).
k_first, k_stop, k_step = 1, 100, 2
mrange = np.arange(k_first, k_stop, k_step)
mrange

array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33,
       35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
       69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99])

In [136]:
# Search space for the grid search: only the neighbour count is tuned.
param_grid = dict(n_neighbors=mrange)

# Base estimator with library defaults; n_neighbors is overridden per candidate.
knn = KNeighborsClassifier()

In [137]:
# Score every candidate in param_grid with 5-fold cross-validation on the
# training split only; the test split is not touched here.
grid_search_kn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid_search_kn.fit(X_train, y_train)

In [138]:
# Report the winning hyper-parameters and their mean cross-validated score.
best_knn_params = grid_search_kn.best_params_
best_cv_score = grid_search_kn.best_score_
print(f"Best Parameters: {best_knn_params} with best score {best_cv_score}")

Best Parameters: {'n_neighbors': 1} with best score 0.9066590563165906


In [139]:
# Fit KNN on the whole training split with the neighbour count selected by the
# grid search, then report hold-out accuracy in percent.
# Reading best_params_ instead of hard-coding n_neighbors=1 keeps this cell in
# sync with the search whenever the data, the grid or the CV setup changes.
knn = KNeighborsClassifier(**grid_search_kn.best_params_)
# fit() returns the estimator itself, so `nb` is just another name for `knn`;
# the name is kept in case other cells refer to it.
nb = knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred) * 100

93.58974358974359