In [38]:
import numpy as np
import sklearn
from sklearn import neighbors
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error


Let's start by importing our data for the region & time split strategies.

In [40]:
hospPredictions = np.genfromtxt('combined.csv', delimiter=',', skip_header=True, usecols = (123))
features = np.genfromtxt('combined.csv', delimiter=',', skip_header=True, usecols = (range(123)))[:, 2:]
regions = np.genfromtxt('combined.csv', delimiter=',', skip_header=True, usecols = (0), dtype = 'str')
dates = np.genfromtxt('combined.csv', delimiter=',', skip_header=True, usecols = (1), dtype = 'datetime64')

With that out of the way, we can begin by splitting data based on the time. The selected time for the split is'2020-08-10'. Any dates after this will be in the test set and the rest of the data will be used for training. We will then test it with KNN/Decision trees.

In [41]:
# get the indices for train/test data
trainIndicesDate = np.argwhere(dates > np.datetime64("2020-08-10"))[:,0]
testIndicesDate = np.argwhere(dates <= np.datetime64("2020-08-10"))[:,0]

# prep the dataset
xTrain = np.take(features, trainIndicesDate, axis = 0)
xTest = np.take(features, testIndicesDate, axis = 0)
yTrain = np.take(hospPredictions, trainIndicesDate, axis = 0)
yTest = np.take(hospPredictions, testIndicesDate, axis = 0)

# knn, neighbours set to 5 by default
knn = neighbors.KNeighborsRegressor()
knn.fit(xTrain, yTrain)
predictionKnn = knn.predict(xTest)

# decision trees
clf = tree.DecisionTreeRegressor()
clf = clf.fit(xTrain, yTrain)
predictionTree = clf.predict(xTest)

# get the mean squared error in order to verify the results
print("MSE (KNN):          %7.1f" % (mean_squared_error(yTest, predictionKnn)))
print("MSE (DecisionTree): %7.1f" % (mean_squared_error(yTest, predictionTree)))

MSE (KNN):           4248.7
MSE (DecisionTree):  3831.8


We can now move onto a split by region.

TBD

In [63]:
regionScoresKnn = []
regionScoresTree = []
 
#gss = GroupShuffleSplit(n_splits=5, train_size=0.8)
gkf = GroupKFold(n_splits=5)

for train_idx, test_idx in gkf.split(features, hospPredictions, regions):
    trainIndices = np.array(train_idx)
    testIndices = np.array(test_idx)

    xTrain = np.take(features, trainIndices, axis = 0)
    xTest = np.take(features, testIndices, axis = 0)
    yTrain = np.take(hospPredictions, trainIndices, axis = 0)
    yTest = np.take(hospPredictions, testIndices, axis = 0)

    knn = neighbors.KNeighborsRegressor(n_neighbors=4)
    knn.fit(xTrain, yTrain)
    predictionKnn = knn.predict(xTest)

    clf = tree.DecisionTreeRegressor()
    clf = clf.fit(xTrain, yTrain)
    predictionTree = clf.predict(xTest)

    regionScoresKnn.append(mean_squared_error(yTest, predictionKnn))
    regionScoresTree.append(mean_squared_error(yTest, predictionTree))

print("Mean MSE (KNN):          %7.1f" % (np.mean(regionScoresKnn)))
print("Mean MSE (DecisionTree): %7.1f" % (np.mean(regionScoresTree)))

Mean MSE (KNN):           4308.5
Mean MSE (DecisionTree):  5643.6


For additional testing, we will model each region separately.

Once again, we will use both KNNs and decision trees.

In [21]:
combined = np.genfromtxt('combined.csv', delimiter=',', skip_header=True, usecols = (range(124)), dtype = 'str') # load the entire dataset
combined = np.delete(combined, 1, 1) # remove the dates 
regionSplit = np.split(combined[:, :], np.cumsum(np.unique(combined[:, 0], return_counts=True)[1])[:-1]) # create a list of arrays that contains the features/predictions for a specific region

for regionalData in regionSplit:
    currentRegion = regionalData[0,0] # extract the current region we're predicting
    x = regionalData[:, 1:-1].astype(np.float) # features for the specific region
    y = regionalData[:,-1].astype(np.float) # predictions for each instance of that region

    # here, we will do a simple KNN (k=4) with an 80-20 split
    xTrain, xTest, yTrain, yTest = train_test_split(x, y, train_size = 0.8)
    knn = neighbors.KNeighborsRegressor(n_neighbors=4)
    knn.fit(xTrain, yTrain)
    predictionKnn = knn.predict(xTest)

    clf = tree.DecisionTreeRegressor()
    clf = clf.fit(xTrain, yTrain)
    predictionTree = clf.predict(xTest)

    print("Region: %s | MSE (KNN): %7.1f | MSE (DecisionTree): %7.1f" % (currentRegion, mean_squared_error(yTest, predictionKnn), mean_squared_error(yTest, predictionTree)))


Region: US-AK | MSE (KNN):    29.5 | MSE (DecisionTree):    36.0
Region: US-DC | MSE (KNN):     0.0 | MSE (DecisionTree):     0.0
Region: US-DE | MSE (KNN):     0.0 | MSE (DecisionTree):     0.0
Region: US-HI | MSE (KNN):   463.9 | MSE (DecisionTree):  2825.5
Region: US-ID | MSE (KNN):  2076.6 | MSE (DecisionTree):  1686.8
Region: US-ME | MSE (KNN):   153.2 | MSE (DecisionTree):   333.5
Region: US-MT | MSE (KNN):  1512.1 | MSE (DecisionTree):  1188.9
Region: US-ND | MSE (KNN):    86.4 | MSE (DecisionTree):  1041.0
Region: US-NE | MSE (KNN):  3328.6 | MSE (DecisionTree):  1942.9
Region: US-NH | MSE (KNN):   983.6 | MSE (DecisionTree):  1075.2
Region: US-NM | MSE (KNN):  6647.1 | MSE (DecisionTree): 11650.8
Region: US-RI | MSE (KNN):  1419.0 | MSE (DecisionTree):   847.5
Region: US-SD | MSE (KNN):   182.6 | MSE (DecisionTree):   844.5
Region: US-VT | MSE (KNN):    18.4 | MSE (DecisionTree):    81.0
Region: US-WV | MSE (KNN):  5447.9 | MSE (DecisionTree):  6724.0
Region: US-WY | MSE (KNN)