<a href="https://colab.research.google.com/github/kkrish39/Machine-learning-notebooks/blob/main/Meal_or_No_Meal_Prediction_from_glucose_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import pickle
import glob
import scipy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as lm
import plotly.graph_objs as go
from sklearn.model_selection import KFold

In [2]:
def kfoldCrossValidation(trainedModel, X, y):
    cv = KFold(n_splits=5, random_state=3, shuffle=True)
    scores = []
    for train_index, test_index in cv.split(X):
        X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        trainedModel.fit(X_train, np.ravel(y_train))
        scores.append(trainedModel.score(X_test, y_test))

    print("KFold Scores",scores)
    print("Kfold Mean Score -->",np.mean(scores))

In [3]:
def classificationReport(y_test, predictedLabels):
    from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

    print(confusion_matrix(y_test,predictedLabels.round()))
    print(classification_report(y_test,predictedLabels.round()))
    print(accuracy_score(y_test, predictedLabels.round())) 

In [4]:
def RandomForestClassifier(X, y, X_train, X_test, y_train, y_test):
    print('***************','RandomForestClassifier','***************')
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(
        bootstrap=True,
        max_depth=5,
        max_features='sqrt',
        min_samples_leaf=4,
        min_samples_split=10,
        n_estimators=800
    )
    rf = clf.fit(X, np.ravel(y))
    score = rf.score(X_test, y_test)
    y_pred = rf.predict(X_test) 

    #Accuracy Report
    classificationReport(y_test, y_pred)               
    
    #KFold Cross Validation
    kfoldCrossValidation(rf, X, y)
    trainedModelFilename = "trainedModel.pkl"
    with open(trainedModelFilename, 'wb') as file:
        pickle.dump(rf, file)

In [5]:
def DecisionTreeClassifier(X, y, X_train, X_test, y_train, y_test):
    print('***************','DecisionTreeClassifier','***************')
    from sklearn.tree import DecisionTreeClassifier
    clf = DecisionTreeClassifier(max_depth=1)
    clf = clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    
    predictedLabels = clf.predict(X_test)
    classificationReport(y_test, predictedLabels)

    kfoldCrossValidation(clf, X, y)
    trainedModelFilename = "trainedModel.pkl"
    with open(trainedModelFilename, 'wb') as file:
        pickle.dump(clf, file)

In [6]:
def MultiLayerPerceptron(X, y, X_train, X_test, y_train, y_test):
    print('***************','MultiLayerPerceptron','***************')
    from sklearn.neural_network import MLPClassifier
    clf = MLPClassifier(hidden_layer_sizes=1000,activation='logistic',alpha=1,learning_rate='invscaling')
    clf.fit(X_train, np.ravel(y_train))
    score = clf.score(X_test, y_test)

    predictedLabels = clf.predict(X_test)
    classificationReport(y_test, predictedLabels)

    kfoldCrossValidation(clf, X, y)
    trainedModelFilename = "trainedModel.pkl"
    with open(trainedModelFilename, 'wb') as file:
        pickle.dump(clf, file)

In [7]:
def PCA(featureMatrix):
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    featureMatrix = StandardScaler().fit_transform(featureMatrix)
    pca = PCA(n_components=6)
    principalComponents = pca.fit(featureMatrix)
    transformedPrincipalComponent = pca.fit_transform(featureMatrix)

    finalPrincipalComponents = pd.DataFrame(data=transformedPrincipalComponent, columns = ['PC1', 'PC2','PC3', 'PC4','PC5', 'PC6'])

    return finalPrincipalComponents

In [12]:
mealDataCsv = sorted(glob.glob("./MealNoMealData/*.csv"))
cols = list(range(0,31)) #Assuming the max of columns will be 30
labelColumn = []
mealNoMealDataFrame = pd.DataFrame()

In [13]:
temp = pd.DataFrame()
    
# Constructing the dataset in such a way that there will not be bias while KFold
# 5 No-meal 5 Meal continuosly
for i in range(0, 50):
    for file in mealDataCsv:
        temp = pd.DataFrame(pd.read_csv(file, names=cols))
        mealNoMealDataFrame = mealNoMealDataFrame.append(pd.DataFrame(np.array(temp.iloc[[i]])))
mealNoMealDataFrameArray = mealNoMealDataFrame.to_numpy()

In [14]:
 # reversing the arrays
mealNoMealDataFrameArray = mealNoMealDataFrameArray[...,::-1]

#removing the first column of the array since it holds some NaN values
mealNoMealDataFrameArray = mealNoMealDataFrameArray[:,1:]
mealNoMealDataFrame = pd.DataFrame(mealNoMealDataFrameArray)

In [15]:
(row, col) = mealNoMealDataFrame.shape
    
#Appending the label rows for the given data frame. it will be arranged in the following type
labelColumn = [0,0,0,0,0,1,1,1,1,1] * (int(row/10)+1)
labelColumn = labelColumn[0:row]
mealNoMealDataFrame['label'] = labelColumn
mealNoMealDataFrameArray = mealNoMealDataFrame.to_numpy()

In [16]:
#removing null values
nanValueMapping = np.argwhere(np.isnan(mealNoMealDataFrameArray))
rowsToBeDeleted = np.unique(nanValueMapping[:,0])
mealNoMealDataFrameArray = np.delete(mealNoMealDataFrameArray,rowsToBeDeleted,0)
mealNoMealDataFrame = pd.DataFrame(mealNoMealDataFrameArray)

In [17]:
labelColumn = mealNoMealDataFrame[[30]]
mealNoMealDataFrame = mealNoMealDataFrame.drop(labels=30,axis=1)
row, col = np.shape(mealNoMealDataFrameArray)

mealNoMealDataFrameArray = mealNoMealDataFrame.to_numpy()

In [18]:
#Matrix to store all the derived features
featureMatrix = pd.DataFrame()

In [19]:
#Expanding Window Statistics
tempFrame = pd.DataFrame(mealNoMealDataFrameArray)
expandingWindow = tempFrame.expanding(min_periods=2, axis=1).mean()
featureMatrix = pd.concat([expandingWindow.mean(axis=1),expandingWindow.min(axis=1), expandingWindow.max(axis=1),
(expandingWindow.max(axis=1) - expandingWindow.min(axis=1))/2, expandingWindow.kurtosis(axis=1)], axis=1)
featureMatrix.columns = ['exp_row_mean', 'exp_row_min', 'exp_row_max', 'exp_min_max_average', 'kurtosis'] 

In [20]:
#Skewness and Standard-Deviation
skewAndStdFrame = pd.DataFrame()

tempFrame = pd.DataFrame(mealNoMealDataFrameArray)
skewAndStdFrame = pd.concat([tempFrame.skew(skipna=True, axis=1), tempFrame.std(skipna=True, axis=1)], axis=1)
skewAndStdFrame.columns = ['std','skewness']
featureMatrix = pd.concat([featureMatrix, skewAndStdFrame], axis=1)

In [21]:
#Finding moving average
cgmMovingAverage = pd.DataFrame(mealNoMealDataFrameArray)
window = 7 #for Every 35 minutes
movingAverage = pd.DataFrame()
rowVal, colVal = cgmMovingAverage.shape

for i in range(0,colVal-window,window):
  featureMatrix['mean_'+str((i/window)*35+35)+'min'] = cgmMovingAverage.iloc[:,i:i+window].mean(axis = 1)

In [22]:
#Finding Windowed velocity
window  = 6 #Taking the velocity for every half hour window
cgmWindowVelocity = pd.DataFrame()

row, col = mealNoMealDataFrameArray.shape
mealNoMealDataFrameArray = pd.DataFrame(mealNoMealDataFrameArray)
for i in range(0,col-window):
    cgmWindowVelocity['moving_velocity_'+str(i)] = mealNoMealDataFrameArray[:][i+window]-mealNoMealDataFrameArray[:][i]
featureMatrix['max_window_velocity'] = cgmWindowVelocity.max(axis = 1, skipna=True) 

In [23]:
#Entropy
def get_entropy(series):
    series_counts = series.value_counts()
    entropy = scipy.stats.entropy(series_counts)  
    return entropy

featureMatrix['Entropy'] = mealNoMealDataFrameArray.apply(lambda row: get_entropy(row), axis=1) 
featureMatrix.head()

Unnamed: 0,exp_row_mean,exp_row_min,exp_row_max,exp_min_max_average,kurtosis,std,skewness,mean_35.0min,mean_70.0min,mean_105.0min,mean_140.0min,max_window_velocity,Entropy
0,120.950146,100.5,152.866667,26.183333,-1.289162,0.125173,42.233124,101.0,126.428571,164.0,201.428571,39.0,3.123939
1,133.969924,117.5,142.766667,12.633333,-1.083193,-0.098956,14.075372,123.714286,152.0,143.285714,149.857143,42.0,2.921658
2,282.762645,275.333333,290.0,7.333333,-0.776861,-0.325407,11.033908,284.857143,284.142857,273.571429,265.0,10.0,2.811796
3,124.90127,111.0,133.366667,11.183333,0.04457,-0.201047,10.720536,120.571429,127.285714,136.0,144.714286,16.0,3.042845
4,139.395841,136.090909,147.033333,5.471212,0.342019,1.053647,12.968086,136.857143,137.285714,144.857143,161.714286,28.0,2.794355


In [24]:
#Removed few features which eventually increases accuracy
featureMatrix=featureMatrix.drop('mean_140.0min',axis=1)
featureMatrix=featureMatrix.drop('exp_min_max_average',axis=1)
featureMatrix=featureMatrix.drop('mean_105.0min',axis=1)

In [25]:
#removing Nan row values if any
featureMatrix = featureMatrix.dropna()

y = labelColumn
X = featureMatrix

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [26]:
#RandomForestClassifier
RandomForestClassifier(X,y,X_train,X_test,y_train,y_test)

*************** RandomForestClassifier ***************
[[34  6]
 [ 8 38]]
              precision    recall  f1-score   support

         0.0       0.81      0.85      0.83        40
         1.0       0.86      0.83      0.84        46

    accuracy                           0.84        86
   macro avg       0.84      0.84      0.84        86
weighted avg       0.84      0.84      0.84        86

0.8372093023255814
KFold Scores [0.5813953488372093, 0.6941176470588235, 0.611764705882353, 0.6470588235294118, 0.7647058823529411]
Kfold Mean Score --> 0.6598084815321477


In [27]:
#DecisionTreeClassifier
DecisionTreeClassifier(X,y,X_train,X_test,y_train,y_test)

*************** DecisionTreeClassifier ***************
[[21 19]
 [16 30]]
              precision    recall  f1-score   support

         0.0       0.57      0.53      0.55        40
         1.0       0.61      0.65      0.63        46

    accuracy                           0.59        86
   macro avg       0.59      0.59      0.59        86
weighted avg       0.59      0.59      0.59        86

0.5930232558139535
KFold Scores [0.6046511627906976, 0.611764705882353, 0.611764705882353, 0.611764705882353, 0.7058823529411765]
Kfold Mean Score --> 0.6291655266757866


In [28]:
#MultiLayerPerceptron
MultiLayerPerceptron(X,y,X_train,X_test,y_train,y_test)

*************** MultiLayerPerceptron ***************
[[25 15]
 [17 29]]
              precision    recall  f1-score   support

         0.0       0.60      0.62      0.61        40
         1.0       0.66      0.63      0.64        46

    accuracy                           0.63        86
   macro avg       0.63      0.63      0.63        86
weighted avg       0.63      0.63      0.63        86

0.627906976744186
KFold Scores [0.5813953488372093, 0.6823529411764706, 0.5647058823529412, 0.611764705882353, 0.7411764705882353]
Kfold Mean Score --> 0.6362790697674419
