In [201]:
import numpy as np
import sklearn
from sklearn import neighbors
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error


Let's start by importing our data for the region & time split strategies.

In [202]:
hospPredictions = np.genfromtxt('combined.csv', delimiter=',', skip_header=True, usecols = (123))
features = np.genfromtxt('combined.csv', delimiter=',', skip_header=True, usecols = (range(123)))[:, 2:]
regions = np.genfromtxt('combined.csv', delimiter=',', skip_header=True, usecols = (0), dtype = 'str')
dates = np.genfromtxt('combined.csv', delimiter=',', skip_header=True, usecols = (1), dtype = 'datetime64')

With that out of the way, we can begin by splitting data based on the time. The selected time for the split is'2020-08-10'. Any dates after this will be in the test set and the rest of the data will be used for training. We will then test it with KNN/Decision trees.

In [203]:
# get the indices for train/test data
trainIndicesDate = np.argwhere(dates > np.datetime64("2020-08-10"))[:,0]
testIndicesDate = np.argwhere(dates <= np.datetime64("2020-08-10"))[:,0]

# prep the dataset
xTrain = np.take(features, trainIndicesDate, axis = 0)
xTest = np.take(features, testIndicesDate, axis = 0)
yTrain = np.take(hospPredictions, trainIndicesDate, axis = 0)
yTest = np.take(hospPredictions, testIndicesDate, axis = 0)

# KNN - This was used to determine the number of neighbours
#bestK = -1
#bestError = sys.maxsize
#for i in range(1, 36):
    #newError = mean_squared_error(yTest, predictionKnn)
    #if (newError < bestError):
        #bestError = newError
        #bestK = i

#print("best K is", bestK)
#print("error is", bestError)

# tested to be the best the optimal numebr of neighbors
knn = neighbors.KNeighborsRegressor(n_neighbors=34)
knn.fit(xTrain, yTrain)
predictionKnn = knn.predict(xTest)

# decision trees
# note: random state set to improve reproducibility 
clf = tree.DecisionTreeRegressor(random_state=0)
clf = clf.fit(xTrain, yTrain)
predictionTree = clf.predict(xTest)

# get the mean squared error in order to verify the results
print("MSE (KNN):          %7.1f" % (mean_squared_error(yTest, predictionKnn)))
print("MSE (DecisionTree): %7.1f" % (mean_squared_error(yTest, predictionTree)))

MSE (KNN):           3980.4
MSE (DecisionTree):  4622.9


We can now move onto a split by region.

Note that because the required split is such that an entire region belongs to a training or testing set, it is not possible to achieve an 80-20 split in our dataset. Instead, 75-25 is achieved. 

This is because our dataset consists of 640 instances - 40 datapoints per region for a total of 16 regions.
As a result, an 80-20 split results in a train-test split of 512-128 which means that a region will occur in both testing and training which is a contradiction of requirements. 

In [204]:
regionScoresKnn = []
regionScoresTree = []
 
#gss = GroupShuffleSplit(n_splits=5, train_size=0.8)
gkf = GroupKFold(n_splits=5)

for train_idx, test_idx in gkf.split(features, hospPredictions, regions):
    trainIndices = np.array(train_idx)
    testIndices = np.array(test_idx)

    xTrain = np.take(features, trainIndices, axis = 0)
    xTest = np.take(features, testIndices, axis = 0)
    yTrain = np.take(hospPredictions, trainIndices, axis = 0)
    yTest = np.take(hospPredictions, testIndices, axis = 0)

    # to determine the optimal # of neighbours here, the same logic was applied as before but this was averaged for each fold and then manually tuned
    knn = neighbors.KNeighborsRegressor(n_neighbors=44)
    knn.fit(xTrain, yTrain)
    predictionKnn = knn.predict(xTest)

    clf = tree.DecisionTreeRegressor(random_state=0)
    clf = clf.fit(xTrain, yTrain)
    predictionTree = clf.predict(xTest)

    regionScoresKnn.append(mean_squared_error(yTest, predictionKnn))
    regionScoresTree.append(mean_squared_error(yTest, predictionTree))

print("Mean MSE (KNN):          %7.1f" % (np.mean(regionScoresKnn)))
print("Mean MSE (DecisionTree): %7.1f" % (np.mean(regionScoresTree)))

Mean MSE (KNN):           3870.8
Mean MSE (DecisionTree):  5835.0


For additional testing, we will model each region separately.

Once again, we will use both KNNs and decision trees.

In [214]:
combined = np.genfromtxt('combined.csv', delimiter=',', skip_header=True, usecols = (range(124)), dtype = 'str') # load the entire dataset
combined = np.delete(combined, 1, 1) # remove the dates 
regionSplit = np.split(combined[:, :], np.cumsum(np.unique(combined[:, 0], return_counts=True)[1])[:-1]) # create a list of arrays that contains the features/predictions for a specific region

for regionalData in regionSplit:
    currentRegion = regionalData[0,0] # extract the current region we're predicting
    x = regionalData[:, 1:-1].astype(np.float) # features for the specific region
    y = regionalData[:,-1].astype(np.float) # predictions for each instance of that region

    # here, we will do a simple KNN (k=2) with an 80-20 split
    # this case is a bit more complex to select an optimal number of neighbours so it was manually tuned
    xTrain, xTest, yTrain, yTest = train_test_split(x, y, train_size = 0.8, random_state = 0)
    knn = neighbors.KNeighborsRegressor(n_neighbors=2)
    knn.fit(xTrain, yTrain)
    predictionKnn = knn.predict(xTest)

    clf = tree.DecisionTreeRegressor(random_state=0)
    clf = clf.fit(xTrain, yTrain)
    predictionTree = clf.predict(xTest)

    print("Region: %s | MSE (KNN): %7.1f | MSE (DecisionTree): %7.1f" % (currentRegion, mean_squared_error(yTest, predictionKnn), mean_squared_error(yTest, predictionTree)))

Region: US-AK | MSE (KNN):   139.6 | MSE (DecisionTree):   212.2
Region: US-DC | MSE (KNN):     0.0 | MSE (DecisionTree):     0.0
Region: US-DE | MSE (KNN):     0.0 | MSE (DecisionTree):     0.0
Region: US-HI | MSE (KNN):   131.1 | MSE (DecisionTree):   670.8
Region: US-ID | MSE (KNN):  2650.3 | MSE (DecisionTree):  1765.6
Region: US-ME | MSE (KNN):   107.6 | MSE (DecisionTree):   569.9
Region: US-MT | MSE (KNN):   194.2 | MSE (DecisionTree):   526.4
Region: US-ND | MSE (KNN):    28.9 | MSE (DecisionTree):   814.5
Region: US-NE | MSE (KNN): 85666.6 | MSE (DecisionTree): 80070.1
Region: US-NH | MSE (KNN):  1033.4 | MSE (DecisionTree):  1478.5
Region: US-NM | MSE (KNN):  8197.6 | MSE (DecisionTree):  9087.5
Region: US-RI | MSE (KNN):  4570.2 | MSE (DecisionTree):  6680.6
Region: US-SD | MSE (KNN):    47.8 | MSE (DecisionTree):    84.8
Region: US-VT | MSE (KNN):   126.6 | MSE (DecisionTree):   182.2
Region: US-WV | MSE (KNN):  3362.1 | MSE (DecisionTree):  6724.1
Region: US-WY | MSE (KNN)