In [2]:
import numpy as np
import pandas as pd
import requests
from sklearn.neighbors import KNeighborsClassifier

DEV_KEY = "M.L. - Maromba Learning"
URL = "https://aydanomachado.com/mlclass/01_Preprocessing.php"

data_train = pd.read_csv("diabetes_dataset.csv")
data_app = pd.read_csv("diabetes_app.csv")

feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

def showNulls(data):
  print(data.isnull().sum())


def getDataFrameFromArray(arr):
  l, c = arr.shape
  df = None
  if c == 8:
    df = pd.DataFrame(arr, columns=feature_cols)
    df["Outcome"] = data_train["Outcome"]
  else:
    df = pd.DataFrame(arr, columns=data_train.columns)
  df["Outcome"] = df["Outcome"].astype(int)
  return df

In [3]:
def getKNNModel(data):
  X = data[feature_cols]
  y = data.Outcome

  neigh = KNeighborsClassifier(n_neighbors=3)
  neigh.fit(X, y)

  return neigh

def getPrediction(knn):
  y_pred = knn.predict(data_app[feature_cols])
  return y_pred

def sendData(y_pred):
  data = {'dev_key':DEV_KEY,
        'predictions':pd.Series(y_pred).to_json(orient='values')}
  r = requests.post(url = URL, data = data)
  pastebin_url = r.text
  print(" - Resposta do servidor:\n", r.text, "\n")


def sendToProfessor(data):
  knn = getKNNModel(data)
  y_pred = getPrediction(knn)
  sendData(y_pred)



## Verify null values

In [None]:
data_train.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

## Approach 1: drop null values

In [None]:
not_null = data_train.dropna()

print(not_null.isnull().sum())

sendToProfessor(not_null)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
 - Resposta do servidor:
 {"status":"success","dev_key":"M.L. - Maromba Learning","accuracy":0.5612244897959183,"old_accuracy":0.63265306122449} 



## Approach 2: fill null values

### Fill with 0

In [None]:
zero_filled = data_train.fillna(0)

sendToProfessor(zero_filled)

 - Resposta do servidor:
 {"status":"success","dev_key":"M.L. - Maromba Learning","accuracy":0.5561224489795918,"old_accuracy":0.63265306122449} 



### Forward fill

In [None]:
# this propagates the previous value forward
forward_fill = data_train.fillna(method="ffill")

showNulls(forward_fill)


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     3
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


The first 3 insuline values are null, so we will use  back-fill to fill them

In [None]:
forward_fill = forward_fill.fillna(method="bfill")
showNulls(forward_fill)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [None]:
sendToProfessor(forward_fill)

 - Resposta do servidor:
 {"status":"success","dev_key":"M.L. - Maromba Learning","accuracy":0.5510204081632653,"old_accuracy":0.63265306122449} 



### Back-fill

In [None]:
back_fill = data_train.fillna(method="bfill")

showNulls(back_fill)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     2
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


The last two values of Insulin are null, so we will forward-fill them

In [None]:
back_fill = back_fill.fillna(method="ffill")

showNulls(back_fill)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [None]:
sendToProfessor(back_fill)

 - Resposta do servidor:
 {"status":"success","dev_key":"M.L. - Maromba Learning","accuracy":0.576530612244898,"old_accuracy":0.63265306122449} 



## Approach 3: Imputation

In [6]:
from sklearn.impute import SimpleImputer

### Replace null with mean

In [7]:
imp_mean = SimpleImputer(strategy="mean")
imputed_mean = imp_mean.fit_transform(data_train)
data_imputed_mean = getDataFrameFromArray(imputed_mean)

In [None]:
sendToProfessor(data_imputed_mean)

 - Resposta do servidor:
 {"status":"success","dev_key":"M.L. - Maromba Learning","accuracy":0.5714285714285714,"old_accuracy":0.63265306122449} 



### Replace with median

In [None]:
imp_median = SimpleImputer(strategy="median")
imputed_median = imp_median.fit_transform(data_train)
data_imputed_median = getDataFrameFromArray(imputed_median)

In [None]:
sendToProfessor(data_imputed_median)

 - Resposta do servidor:
 {"status":"success","dev_key":"M.L. - Maromba Learning","accuracy":0.576530612244898,"old_accuracy":0.63265306122449} 



### Replace with the most frequent value

In [8]:
imp_mostFrequent = SimpleImputer(strategy="most_frequent")
imputed_mf = imp_mostFrequent.fit_transform(data_train)
data_imputed_mf = getDataFrameFromArray(imputed_mf)

In [None]:
sendToProfessor(data_imputed_mf)

 - Resposta do servidor:
 {"status":"success","dev_key":"M.L. - Maromba Learning","accuracy":0.5867346938775511,"old_accuracy":0.63265306122449} 



## Approach 4: Scalling

In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled = scaler.fit_transform(data_imputed_mf[feature_cols])

scaled_df = getDataFrameFromArray(scaled)
scaled_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.670968,0.456522,0.304348,0.122253,0.393862,0.243892,0.483333,1
1,0.058824,0.264516,0.391304,0.239130,0.122253,0.214834,0.121279,0.166667,0
2,0.470588,0.896774,0.369565,0.271739,0.122253,0.130435,0.263883,0.183333,1
3,0.000000,0.600000,0.108696,0.304348,0.208791,0.636829,0.981786,0.200000,1
4,0.294118,0.464516,0.478261,0.271739,0.122253,0.189258,0.054642,0.150000,0
...,...,...,...,...,...,...,...,...,...
567,0.529412,0.290323,0.347826,0.271739,0.122253,0.109974,0.028432,0.200000,0
568,0.117647,0.503226,0.434783,0.217391,0.122253,0.475703,0.116393,0.100000,0
569,0.294118,0.496774,0.456522,0.173913,0.131868,0.204604,0.074189,0.150000,0
570,0.058824,0.529032,0.326087,0.271739,0.122253,0.304348,0.120391,0.433333,1


In [10]:
sendToProfessor(scaled_df)

 - Resposta do servidor:
 {"status":"success","dev_key":"M.L. - Maromba Learning","accuracy":0.6428571428571429,"old_accuracy":0.63265306122449} 



## Approach 5: Normalization

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled = scaler.fit_transform(data_imputed_mf[feature_cols])
scaled_df = getDataFrameFromArray(scaled)

In [14]:
sendToProfessor(scaled_df)

 - Resposta do servidor:
 {"status":"success","dev_key":"M.L. - Maromba Learning","accuracy":0.6428571428571429,"old_accuracy":0.64285714285714} 

