In [92]:
import numpy as np
from numpy import nan
import pandas as pd
from pandas import DataFrame
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
names = ["graviditeter", "glukose", "blodtrykk", "hud", "insulin", "bmi", "diabetes-arv", "alder", "diabetes-diagnose"]


In [66]:
# Reading dataset and looking at shape of dataset
df = pd.read_csv("diabetes.csv", names=names)

print(df.shape)

(768, 9)


In [67]:
df.head()

Unnamed: 0,graviditeter,glukose,blodtrykk,hud,insulin,bmi,diabetes-arv,alder,diabetes-diagnose
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [68]:
num_missing = (df[["glukose","blodtrykk","hud","insulin","bmi", "diabetes-arv", "alder"]] == 0).sum()
print(num_missing)

glukose           5
blodtrykk        35
hud             227
insulin         374
bmi              11
diabetes-arv      0
alder             0
dtype: int64


In [69]:
df[["glukose","blodtrykk","hud","insulin","bmi", "diabetes-arv", "alder"]] = df[["glukose","blodtrykk","hud","insulin","bmi", "diabetes-arv", "alder"]].replace(0, nan)

In [70]:
df.dropna(inplace=True)
print(df.shape)

(392, 9)


In [71]:
df.head()

Unnamed: 0,graviditeter,glukose,blodtrykk,hud,insulin,bmi,diabetes-arv,alder,diabetes-diagnose
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
13,1,189.0,60.0,23.0,846.0,30.1,0.398,59,1


In [72]:
dataset = DataFrame(df)

In [73]:
print(dataset)

     graviditeter  glukose  blodtrykk   hud  insulin   bmi  diabetes-arv  \
3               1     89.0       66.0  23.0     94.0  28.1         0.167   
4               0    137.0       40.0  35.0    168.0  43.1         2.288   
6               3     78.0       50.0  32.0     88.0  31.0         0.248   
8               2    197.0       70.0  45.0    543.0  30.5         0.158   
13              1    189.0       60.0  23.0    846.0  30.1         0.398   
..            ...      ...        ...   ...      ...   ...           ...   
753             0    181.0       88.0  44.0    510.0  43.3         0.222   
755             1    128.0       88.0  39.0    110.0  36.5         1.057   
760             2     88.0       58.0  26.0     16.0  28.4         0.766   
763            10    101.0       76.0  48.0    180.0  32.9         0.171   
765             5    121.0       72.0  23.0    112.0  26.2         0.245   

     alder  diabetes-diagnose  
3       21                  0  
4       33             

In [74]:
y = df["diabetes-diagnose"]
x = df.drop(columns = ["diabetes-diagnose"])

In [75]:
y2 = df["diabetes-diagnose"]
x2 = df.drop(columns = ["diabetes-diagnose"])

In [76]:
scaler = MinMaxScaler()
scaler.fit(x)
x = scaler.transform(x)

In [77]:
x.shape , y.shape

((392, 8), (392,))

In [78]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, test_size = 0.2, random_state = 2)
print(x_train)
print(y_train)

[[0.05882353 0.52816901 0.46511628 ... 0.11247444 0.13019272 0.        ]
 [0.17647059 0.17605634 0.72093023 ... 0.19018405 0.09464668 0.01666667]
 [0.11764706 0.99295775 0.53488372 ... 0.25153374 0.03126338 0.53333333]
 ...
 [0.52941176 0.70422535 0.72093023 ... 0.32924335 0.47280514 0.35      ]
 [0.17647059 0.22535211 0.39534884 ... 0.13496933 0.07794433 0.01666667]
 [0.         0.76760563 0.60465116 ... 0.60736196 0.0745182  0.08333333]]
392    0
368    0
8      1
348    0
507    0
      ..
575    0
52     0
152    1
32     0
335    0
Name: diabetes-diagnose, Length: 313, dtype: int64


# KNN Regression

In [79]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=5)
regressor.fit(x_train, y_train)

KNeighborsRegressor()

In [80]:
y_pred = regressor.predict(x_test)

In [81]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'mae: {mae}')
print(f'mse: {mse}')
print(f'rmse: {rmse}')

mae: 0.2683544303797468
mse: 0.15088607594936707
rmse: 0.3884405693917244


In [93]:
accuracy_score(y_test, y_pred)

0.7088607594936709

# Naive Bayes

In [83]:
def transform_digit(x):
    if x <= 0.25:
        return 0
    elif x <= 0.50:
        return 1
    elif x <= 0.75:
        return 2
    else:
        return 3
x2_train_transformed = x2_train.transform([transform_digit])
x2_test_transformed = x2_test.transform([transform_digit])
x2_test_transformed

Unnamed: 0_level_0,graviditeter,glukose,blodtrykk,hud,insulin,bmi,diabetes-arv,alder
Unnamed: 0_level_1,transform_digit,transform_digit,transform_digit,transform_digit,transform_digit,transform_digit,transform_digit,transform_digit
120,0,3,3,3,3,3,3,3
147,3,3,3,3,3,3,3,3
609,3,3,3,3,3,3,0,3
360,3,3,3,3,3,3,2,3
597,3,3,3,3,3,3,2,3
...,...,...,...,...,...,...,...,...
369,3,3,3,3,3,3,0,3
591,3,3,3,3,3,3,0,3
136,0,3,3,3,3,3,2,3
704,3,3,3,3,3,3,0,3


In [84]:
from sklearn.model_selection import train_test_split
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, train_size = 0.8, test_size = 0.2, random_state = 2)


In [85]:
from sklearn.naive_bayes import CategoricalNB
from sklearn import metrics
clf = CategoricalNB(min_categories=4)
clf.fit(x2_train_transformed, y_train)
y_pred = clf.predict(x2_test_transformed)
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.7088607594936709
[[55  0]
 [23  1]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        55
           1       1.00      0.04      0.08        24

    accuracy                           0.71        79
   macro avg       0.85      0.52      0.45        79
weighted avg       0.79      0.71      0.60        79



# Logistic Regression

In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

model = LogisticRegression(solver='liblinear', random_state=0).fit(x_test, y_test)

In [87]:
model.intercept_
model.coef_

array([[ 0.6306049 ,  1.27000425, -0.13285411,  0.51202067,  0.01254295,
         0.13389507,  0.35099756,  0.9717137 ]])

In [88]:
model.predict(x_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [89]:
model.score(x_test, y_test)

0.7215189873417721