# Blood Glucose assigment
## By Johan Challita, Tidaa

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from math import sqrt

In [2]:
#Initialize Data to a DataFrame
data = pd.DataFrame([])
for i in range(70):
    if(i < 9):
        file_nr = f"0{i+1}"
    else:
        file_nr = i+1
        
    diabetes_file_path = f'/Users/user/Dropbox/MachineLearning/lab/Diabetes-data/data-{file_nr}'
    diabetes_data = pd.read_csv(diabetes_file_path, sep='\t', names=['Date','Time','Code','Value'])
    data = data.append(pd.DataFrame(data=diabetes_data).drop_duplicates())
    
df = pd.DataFrame(data)
df

Unnamed: 0,Date,Time,Code,Value
0,04-21-1991,9:09,58,100
1,04-21-1991,9:09,33,9
2,04-21-1991,9:09,34,13
3,04-21-1991,17:08,62,119
4,04-21-1991,17:08,33,7
...,...,...,...,...
336,05-09-1989,08:00,33,1.0
337,05-09-1989,08:00,34,7.0
338,05-10-1989,08:00,34,7.0
339,05-11-1989,08:00,34,7.0


# Check if there are existing code that shouldn't exist

In [3]:
# Fix the date so it shows yyyy-mm-dd and has DateTime as dtypes
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [4]:
# Clean off 0Hi values from Value column
df = df[df.Value!='0Hi']

In [5]:
# Clean off 0Lo values from Value column
df = df[df.Value!='0Lo']

In [6]:
# Drop all nan values
df = df.dropna()

In [7]:
# Clean off 3A and 0'' values from Value column
df = df[df.Value!='3A']
df = df[df.Value!="0''"]

In [8]:
df['Value'] = df['Value'].astype(int)

In [9]:
df.dtypes

Date     datetime64[ns]
Time             object
Code              int64
Value             int64
dtype: object

In [10]:
# Clean off non-time 
df = df[df.Time != "9"]
df = df[df.Time != "006"]
df = df[df.Time != "016"]
df = df[df.Time != "004"]
df = df[df.Time != "018"]
df = df[df.Time != "018"]
df = df[df.Time != "020"]
df = df[df.Time != "56:35"]
df = df[df.Time != "188:00"]

In [11]:
# Change time to hours in order to calculate it in the model
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M').dt.hour

In [12]:
# Check if there are nulls
df.isnull().sum().sum()

0

In [13]:
# Check what type all objects are
df.dtypes

Date     datetime64[ns]
Time              int64
Code              int64
Value             int64
dtype: object

In [14]:
df

Unnamed: 0,Date,Time,Code,Value
0,1991-04-21,9,58,100
1,1991-04-21,9,33,9
2,1991-04-21,9,34,13
3,1991-04-21,17,62,119
4,1991-04-21,17,33,7
...,...,...,...,...
336,1989-05-09,8,33,1
337,1989-05-09,8,34,7
338,1989-05-10,8,34,7
339,1989-05-11,8,34,7


In [15]:
# Remove all data that are unreal or doesn't exist
df = df[df.Value > 0] # High risk for hyperglycemia
df = df[df.Code != 36]
df = df[df.Code != 56]

# Now we will test our data with LinearRegression

In [16]:
# Creating the LinearRegression model
y = df['Value']
feature_columns = ['Time', 'Code']
X = df[feature_columns]

train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size = 0.3)
reg = LinearRegression()

In [17]:
# Train the model
reg.fit(train_X, train_Y)

LinearRegression()

In [18]:
# Calculating the R^2 from LinearRegression. Good for measure how well the data fit the model
reg.score(test_X, test_Y)

0.5874674624716041

In [19]:
y_pred = reg.predict(test_X)

In [20]:
# Accurancy score for the model. Good for measure how accurate the model is in classification. 
# PS DON'T USE IT FOR REGRESSION. I used it just for the learning
test_y_int = test_Y.astype(int)
y_pred_int = y_pred.astype(int)
print(accuracy_score(test_y_int,y_pred_int))

0.021778150053847076


In [21]:
# Calculating the MSE (Mean square error). Important to detect errors (the larger, the larger errors)
mse = mean_squared_error(test_Y, y_pred)
print(mse)

3661.2213032236386


In [22]:
# Calculating the MAE (Mean absolute error). Good for measure the average of all absolute errors (The larger, the larger error)
mae = mean_absolute_error(test_Y, y_pred)
print(mae)

37.70776406716732


In [23]:
# Calculating the RMSE (Root mean square error). Good for measure how well the data is around the line. 
rmse = sqrt(mse)
print(rmse)

60.50802676689795


In [24]:
df

Unnamed: 0,Date,Time,Code,Value
0,1991-04-21,9,58,100
1,1991-04-21,9,33,9
2,1991-04-21,9,34,13
3,1991-04-21,17,62,119
4,1991-04-21,17,33,7
...,...,...,...,...
336,1989-05-09,8,33,1
337,1989-05-09,8,34,7
338,1989-05-10,8,34,7
339,1989-05-11,8,34,7


# Now we will try the decision tree

In [25]:
#Creating decision tree model
y = df['Value']
feature_columns = ['Time', 'Code']
X = df[feature_columns]
train_X, test_X, train_Y, test_Y = train_test_split(X,y, test_size = 0.3)
tree = DecisionTreeRegressor()

In [26]:
# Train decision tree model
tree.fit(train_X, train_Y)

DecisionTreeRegressor()

In [27]:
# Do predict in the DecisionTree model
y_pred = tree.predict(test_X)

In [28]:
# Accurancy score for the model. Good for measure how accurate the model is in classification. 
# PS DON'T USE IT FOR REGRESSION. I used it just for the learning
test_y_int = test_Y.astype(int)
y_pred_int = y_pred.astype(int)
print(accuracy_score(test_y_int,y_pred_int)*100)

6.401818834509991


In [29]:
# Calculating the mean accurancy (<=> R2)
tree.score(test_X, test_Y)

0.6544112871860874

In [30]:
# Calculating the R2 value
r2 = r2_score(test_Y, y_pred)
print(r2)

0.6544112871860874


In [31]:
# Calculating the MSE (Mean square error). Important to detect errors (the larger, the larger errors)
mse = mean_squared_error(test_Y, y_pred)
print(mse)

2984.8669991377224


In [32]:
# Calculating the MAE (Mean absolute error). Good for measure the average of all absolute errors (The larger, the larger error)
mae = mean_absolute_error(test_Y, y_pred)
print(mae)

32.38770339372918


In [33]:
# Calculating the RMSE (Root mean square error). Good for measure how well the data is around the line (The larger, the larger error). 
rmse = sqrt(mse)
print(rmse)

54.63393633207956


In [34]:
df

Unnamed: 0,Date,Time,Code,Value
0,1991-04-21,9,58,100
1,1991-04-21,9,33,9
2,1991-04-21,9,34,13
3,1991-04-21,17,62,119
4,1991-04-21,17,33,7
...,...,...,...,...
336,1989-05-09,8,33,1
337,1989-05-09,8,34,7
338,1989-05-10,8,34,7
339,1989-05-11,8,34,7
