<a href="https://colab.research.google.com/github/jhuarancca/ASU_DataMining/blob/main/DataMininProyecto03_ver02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np  
import math

from scipy.stats import skew, kurtosis, entropy
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics.cluster import contingency_matrix

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import cluster, silhouette_score, v_measure_score, adjusted_rand_score, completeness_score
from sklearn.preprocessing import StandardScaler

def calculatePurity(groundTruth, labels):
    # compute contingency matrix (also called confusion matrix)
    cont_mat = contingency_matrix(groundTruth, labels)
    # return purity
    return np.sum(np.amax(cont_mat, axis=0)) / np.sum(cont_mat)


def calculateSSE(X, labels,n):
    sse = 0
    for i in range(n):
        cluster = X[labels == i]
        mean = cluster.mean(axis=0)
        sse += ((cluster - mean) ** 2).sum()
    return sse


def calculateEntropy(groundTruth, labels, n):
    ent = 0
    for i in range(n):
        bincount = np.bincount(groundTruth[labels == i])
        w = (labels == i).sum()
        probabilities = bincount[bincount != 0] / w
        ent += entropy(probabilities) * w / labels.shape[0]
    return ent


bins = {1:[0,20],2:[21,40],3:[41,60],4:[61,80],5:[81,100],6:[101,120]}

def getBin(mealAmountGrams):
    lint_bin = 0
    li_binNumber =1
    for bin_number, bin_range in bins.items():
        if bin_range[0]<=mealAmountGrams<=bin_range[1]:
            li_binNumber = bin_number
    return li_binNumber

dfIns = pd.read_csv ('InsulinData.csv', low_memory=False)
dfCGM = pd.read_csv ('CGMData.csv', low_memory=False)


dfIns['Datetime']=pd.to_datetime(dfIns['Date'] + ' ' + dfIns['Time'])
dfCGM['Datetime']=pd.to_datetime(dfCGM['Date'] + ' ' + dfCGM['Time'])
dfIns.rename(columns={"BWZ Carb Input (grams)": "Carb"},inplace=True)

dfMeals=dfIns
dfMeals=dfMeals.drop(dfMeals[dfMeals['Carb'].isna()].index)
dfMeals = dfMeals.drop(dfMeals[(dfMeals['Carb']==0.0)].index)
dfMeals=dfMeals[["Datetime", "Carb"]]
dfMeals=dfMeals.reset_index(drop=True)
dfMeals = dfMeals.sort_values(by=['Datetime'], ascending=True)
dfMeals['Delete']='N'


dfMealsTmp=dfMeals
dfMealsTmp = dfMealsTmp.sort_values(by=['Datetime'], ascending=True)
dfMealsTmp=dfMealsTmp.reset_index(drop=True)

for index, row in dfMealsTmp.iterrows():
   ld_DateTime1=dfMealsTmp.at[index, 'Datetime']
   
   if index<(len(dfMealsTmp.index)-1):
    #print(index,len(dfMeals.index))
    ld_DateTime2=dfMealsTmp.at[index+1,'Datetime']
    ld_DateTime3=ld_DateTime1  + timedelta(hours=2)
    if ld_DateTime2<=ld_DateTime3:
      dfMeals.at[index,'Delete']='Y'
      #print(index,row['Datetime'], row['Carb'],ld_DateTime2,ld_DateTime3)
dfMeals01=dfMeals[(dfMeals['Delete']=='N')]
dfMeals01=dfMeals01.reset_index(drop=True)

maxMeal = max(dfMeals01['Carb'])
minMeal = min(dfMeals01['Carb'])

print("Max meal value (grams):", maxMeal)
print("Min meal value (grams):", minMeal)

binsNumber = math.ceil((maxMeal-minMeal)/20)
print("In total you should have N = (",maxMeal,"-",minMeal,"/ 20) i.e.", binsNumber, "bins")

scaler = StandardScaler()
dfMeals01["CarbScaled"] = scaler.fit_transform(dfMeals01['Carb'].values.reshape(-1,1))

dfMeals01['GroundTruth'] = dfMeals01['Carb'].apply(lambda x: getBin(x)) 
#dfMeals01["GroundTruth"].value_counts().sort_values(ascending=True).plot(kind="barh")

X = dfMeals01["CarbScaled"].values.reshape(-1,1)

kmeans = KMeans(n_clusters=5, random_state=42, max_iter=100)
kmeans_model = kmeans.fit(X)
kmeans_silhouette = silhouette_score(X, kmeans.labels_).round(2)

# Create a column of K-means Prediction
dfMeals01['KmeanCluster'] = kmeans_model.predict(X)
# Calculate accuracy using entropy, purity_score, & v-measure score
kmeanEntropy = calculateEntropy(dfMeals01['GroundTruth'],dfMeals01['KmeanCluster'],binsNumber)
kmeanPurity = calculatePurity(dfMeals01['GroundTruth'],dfMeals01['KmeanCluster'])
kmeanSSE = calculateSSE(X, kmeans.labels_,binsNumber)

# Define a model
dbscan = DBSCAN(eps=0.3)
dbscan_model = dbscan.fit(X)
dbscan_silhouette = silhouette_score(X, dbscan_model.labels_).round(2)
dfMeals01['DBSCAN_Cluster'] = dbscan_model.fit_predict(X)

# Calculate accuracy using entropy, purity_score, & v-measure score
dbscanEntropy = calculateEntropy(dfMeals01['GroundTruth'],dfMeals01['DBSCAN_Cluster'],binsNumber)
dbscanPurity = calculatePurity(dfMeals01['GroundTruth'],dfMeals01['DBSCAN_Cluster'])
dbscanSSE = calculateSSE(X, dbscan.labels_,binsNumber)

#dsResult = {}
#dsResult['SSE for Kmeans'] =  "%.2f"%kmeanSSE
#dsResult['SSE for DBSCAN'] =  "%.2f"%dbscanSSE
#dsResult['Entropy for Kmeans'] =  "%.2f"%kmeanEntropy
#dsResult['Entropy for DBSCAN'] =  "%.2f"%dbscanEntropy
#dsResult['Purity for K means'] =  "%.2f"%kmeanPurity
#dsResult['Purity for DBSCAN'] =  "%.2f"%dbscanPurity

#dsResult = pd.DataFrame(dsResult, index=[0])
#dsResult.to_csv('Result.csv',index=False)
#dsResult
###
#Report your accuracy of clustering based on SSE, entropy and purity metrics.
cols = ['SSE Kmeans', 'SSE DBSCAN', 'Entropy Kmeans','Entropy DBSCAN', 'Purity Kmeans', 'Purity DBSCAN']
rows = [[kmeanSSE, dbscanSSE,kmeanEntropy, dbscanEntropy,kmeanPurity, dbscanPurity]]

df = pd.DataFrame(rows, columns=cols)
df.to_csv('Result.csv',index=False, header=False)



Max meal value (grams): 122.0
Min meal value (grams): 3.0
In total you should have N = ( 122.0 - 3.0 / 20) i.e. 6 bins


  ret, rcount, out=ret, casting='unsafe', subok=False)
  ret, rcount, out=ret, casting='unsafe', subok=False)
