In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv("diabetes.csv") # this is just for reference to look at
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
X = df[["Glucose", "BloodPressure", "Insulin", "Outcome"]]
X

Unnamed: 0,Glucose,BloodPressure,Insulin,Outcome
0,148,72,0,1
1,85,66,0,0
2,183,64,0,1
3,89,66,94,0
4,137,40,168,1
...,...,...,...,...
763,101,76,180,0
764,122,70,0,0
765,121,72,112,0
766,126,60,0,1


In [4]:
y = df["Outcome"]
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

The publisher publishes records in S continuously. When it reaches the end of S, it continues to send from the beginning again.

The subscriber continuously receives the data. For each latest record(r) received, apply the 3NN classification to the last 5 records before r, and compare the classification result with the Outcome label in r.

    Repeat this for 1000 times, and report the number of correct classifications.

    Question: "apply the 3NN classification to the last 5 records before r" is ambigous, is it from 0 - (len(list) - 5) or (len(list) - 5) - len(list)
    One intepretation would only ever test with the five latest records before r, the other would take all the records - the last 5 before r
    one: 5, the other is ever increasing, and ever more acurate, though also ever more computational expensive
    I must assume it would be the later, you want to train a model on new data in addition to what you already have to improve your model

In [5]:
def calculate_3nn_with_train_test(X, y):
    # We create a 2-D array to store all accuracy values
    accuracy_row_list = []
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
        
    # Model intializing
    knn = KNeighborsClassifier(n_neighbors=3)

    # Training (very light compared to an eager learner)
    knn.fit(X_train, y_train)

    # Predicting on train and test set
    y_pred_train = knn.predict(X_train)
    y_pred_test = knn.predict(X_test)

    #print("KNN with k={}:".format(k))
        
    # Get the accuracy from metrics
    accuracy_train = accuracy_score(y_train, y_pred_train)
    #print("Training accuracy: {}".format(accuracy_train))
    accuracy_test = accuracy_score(y_test, y_pred_test)
    #print("Test accuracy: {}\r\n".format(accuracy_test))

    accuracy_row_list.append({"k": 3, "Training accuracy": accuracy_train, "Test accuracy": accuracy_test})

    return pd.DataFrame(accuracy_row_list)

In [6]:
def calculate_3nn(X, y):
    # We create a 2-D array to store all accuracy values
    accuracy_row_list = []

    # Model intializing
    knn = KNeighborsClassifier(n_neighbors=3)

    # Training (very light compared to an eager learner)
    knn.fit(X, y)
    y_pred = knn.predict(X)
        
    # Get the accuracy from metrics
    accuracy_test = accuracy_score(y, y_pred)
    print("Test accuracy: {}\r\n".format(accuracy_test))

    correct_predictions = np.sum(y == y_pred)
    print("correct prediction count: {}\r\n".format(correct_predictions))

    accuracy_row_list.append({"k": 3, "Test accuracy": accuracy_test, "Number of correct classifications": correct_predictions})

    return pd.DataFrame(accuracy_row_list)

In [7]:
import paho.mqtt.client as mqtt # you only train on 5, the five nearest neighbors
from io import StringIO

# This function defines what to do when we connect to the broker
def on_connect(client, userdata, flags, rc):
    print("Connected with result code " + str(rc))
    # We subscribe to this particular data. There may be other data published by the same subscriber or broker.
    client.subscribe("myroom_super_S/S")

# This function defines what to do when we receives a message from the brokder
def on_message(client, userdata, msg):
    global my_list_s
    if(len(my_list_s.index) < 1000):
        temp_s = msg.payload.decode('utf-8')
        #print("Received myroom_super_S/S: ", temp_s)
        
        df_from_json = pd.read_json(StringIO(temp_s), orient='index')
    
        new_row = pd.DataFrame(df_from_json)
    
        my_list_s = pd.concat([my_list_s, new_row], ignore_index=True)
    
        #print(my_list_s[-6:])
        
        if(len(my_list_s.index) > 5 and len(my_list_s.index) % 10 == 1): # train every 5 new data, could then probably just send the 5 data as one piece as well
            #print("My list: ", my_list_s)
            #print("End of list!")
    
            x = my_list_s[["Glucose", "BloodPressure", "Insulin"]]
            y = my_list_s["Outcome"]
            y = y.astype(int) # it was interpreted as a wrong type, it thought it was float but it is an int, 0 or 1
            
            result = calculate_3nn(x, y)
            global my_list_classification
            my_list_classification = pd.concat([my_list_classification, result], ignore_index=True)
            print(result)   
    else:
        global task_complete
        if(task_complete == -1):
            task_complete = 0

    if(task_complete == 0):
        task_complete = 1
        print("We are DONE")
        print()
        print("The result: ", my_list_classification)
        print("Total number of correct classifications: ", my_list_classification["Number of correct classifications"].max())
        best_row = my_list_classification[my_list_classification["Test accuracy"].max() == my_list_classification["Test accuracy"]]
        print("Best accuracy score: ", best_row)
        

# We create a client as the data subscriber and specify its actions for particular events
mqttc = mqtt.Client()
mqttc.on_connect = on_connect 
mqttc.on_message = on_message
my_list_s = pd.DataFrame(columns=["Glucose", "BloodPressure", "Insulin", "Outcome"])
my_list_classification = pd.DataFrame(columns=["k", "Test accuracy", "Number of correct classifications"])
task_complete = -1

# Now, we connect to the data broker.
mqttc.connect("mqtt.eclipseprojects.io", 1883, 60)

# As a simple example, we just keep the data listening/receiving on and on...
mqttc.loop_forever()

Connected with result code 0


  my_list_s = pd.concat([my_list_s, new_row], ignore_index=True)


Test accuracy: 0.7272727272727273

correct prediction count: 8

   k  Test accuracy  Number of correct classifications
0  3       0.727273                                  8


  my_list_classification = pd.concat([my_list_classification, result], ignore_index=True)


Test accuracy: 0.6666666666666666

correct prediction count: 14

   k  Test accuracy  Number of correct classifications
0  3       0.666667                                 14
Test accuracy: 0.7096774193548387

correct prediction count: 22

   k  Test accuracy  Number of correct classifications
0  3       0.709677                                 22
Test accuracy: 0.7560975609756098

correct prediction count: 31

   k  Test accuracy  Number of correct classifications
0  3       0.756098                                 31
Test accuracy: 0.7843137254901961

correct prediction count: 40

   k  Test accuracy  Number of correct classifications
0  3       0.784314                                 40
Test accuracy: 0.8032786885245902

correct prediction count: 49

   k  Test accuracy  Number of correct classifications
0  3       0.803279                                 49
Test accuracy: 0.7605633802816901

correct prediction count: 54

   k  Test accuracy  Number of correct classifications
0  3 

KeyboardInterrupt: 