In [1]:
import numpy as np
import pandas as pd
import random
import copy
import math
import matplotlib.pyplot as plt

In [2]:
# Read the merged CSV with header=None so the file's first line stays in the
# data as row 0 (later cells read mergedDatasetData[0] as the column headers).
mergedDatasetPath = "mergedDataset.csv"
mergedDataset = pd.read_csv(mergedDatasetPath, header=None, sep=',')
mergedDatasetData = mergedDataset.to_numpy()

In [3]:
# Independent copy of the raw table with row 0 removed.
copyMergedDatasetData = np.delete(copy.deepcopy(mergedDatasetData), 0, axis=0)

# regionIndex holds, for each distinct value of column 0 (in sorted order),
# the row at which that value first appears; the values themselves are unused.
garbage, regionIndex = np.unique(copyMergedDatasetData[:, 0], return_index=True)

regionSize = regionIndex.size

## KNN
### Region-based split

In [4]:
# ---------------------------------------------------------------------------
# Region-based folds
# ---------------------------------------------------------------------------
numCol = copyMergedDatasetData.shape[1]

# First row of the column-0 value sitting 80% of the way through the sorted
# distinct values; the rows from there to the end determine the test size.
trainSize = regionIndex[np.floor(regionSize*0.8).astype(int)]
testSize = (copyMergedDatasetData.shape[0]-trainSize)

# Average number of rows per region and how many such buckets the test size spans.
regionBucketSize = copyMergedDatasetData.shape[0]/regionSize
divider = int(testSize/regionBucketSize)

trainSetRegionSplits = np.zeros(6, dtype=object)
testSetRegionSplits = np.zeros(6, dtype=object)

# Loop-invariant values hoisted out of the fold loop.
rowsPerStep = int(testSize/divider)
allRows = copyMergedDatasetData.tolist()

for index in range(6):
    # Test fold: a window of `divider` steps that slides forward one step per fold.
    tempTestSets = copyMergedDatasetData[index*rowsPerStep:(index+divider)*rowsPerStep].tolist()

    # Train fold: every row whose (column 0, column 1) pair is not in the test fold.
    # The previous implementation removed rows from the list it was iterating,
    # which skips the element following each removal and is quadratic in the row
    # count. A key set gives the intended result in a single pass.
    # NOTE(review): set membership treats an identical NaN object as equal to
    # itself, unlike `==`; assumed irrelevant because columns 0 and 1 are
    # presumably always-present region/date strings - confirm.
    testKeys = {(row[0], row[1]) for row in tempTestSets}
    tempTrainSets = [row for row in allRows if (row[0], row[1]) not in testKeys]

    trainSetRegionSplits[index] = np.array(tempTrainSets)
    testSetRegionSplits[index] = np.array(tempTestSets)

# ---------------------------------------------------------------------------
# Time-based split
# ---------------------------------------------------------------------------
trainSetTimeSplits = []
testSetTimeSplits = []

for i, row in enumerate(copyMergedDatasetData):
    # Column 1 is indexed as a date string with the month at characters 5-6
    # and the day at characters 8-9.
    month = int(row[1][5:7])
    day = int(row[1][8:10])
    # NOTE(review): this sends a row to the test set only when BOTH month >= 8
    # and day > 10, so e.g. the 1st-10th of any later month lands in the
    # training set. If the intent is "everything after August 10th" the
    # condition needs revisiting - confirm against the data's date range.
    if month >= 8 and day > 10:
        testSetTimeSplits.append(row)
    else:
        trainSetTimeSplits.append(row)

trainSetTimeSplits = np.array(trainSetTimeSplits)
testSetTimeSplits = np.array(testSetTimeSplits)

# Sizes used by later cells to allocate buffers (region sizes come from fold 0).
trainSizeRegions = trainSetRegionSplits[0].shape[0]
testSizeRegions = testSetRegionSplits[0].shape[0]
trainSizeDates = trainSetTimeSplits.shape[0]
testSizeDates = testSetTimeSplits.shape[0]

In [5]:
def euclidean(x1, x2):
    """Square root of the summed squared differences over the last axis."""
    return np.sqrt(np.sum((x1 - x2)**2, axis=-1))


def manhattan(x1, x2):
    """Sum of absolute differences over the last axis."""
    return np.sum(np.abs(x1 - x2), axis=-1)


class KNN:
    """K-nearest-neighbour regressor over two features.

    ``fit`` stores the training points; ``predict`` averages the targets of
    the K closest stored points according to ``dist_fn``.
    """

    def __init__(self, K=1, dist_fn= euclidean):
        self.K = K
        self.dist_fn = dist_fn

    def fit(self, x, y):
        """Keep the first two columns of every row of ``x`` whose first two
        entries are both non-NaN, together with the matching entry of ``y``.

        Returns ``self`` so calls can be chained.
        """
        kept = [
            ([row[0], row[1]], y[position])
            for position, row in enumerate(x)
            if not (isNaN(row[0]) or isNaN(row[1]))
        ]
        self.x = np.asarray([features for features, _ in kept])
        self.y = np.asarray([target for _, target in kept])
        return self

    def predict(self, x_test):
        """Return ``(y_mean, knns)`` for the query points ``x_test``.

        ``knns[i]`` holds the indices of the K stored points closest to query
        ``i``; ``y_mean[i]`` is the mean of their stored targets.
        """
        n_queries = x_test.shape[0]
        # Broadcast every stored point against every query: one row per query.
        pairwise = self.dist_fn(self.x[None,:,:], x_test[:,None,:])

        knns = np.zeros((n_queries, self.K), dtype=int)
        for query in range(n_queries):
            knns[query,:] = np.argsort(pairwise[query])[:self.K]

        y_mean = np.zeros(n_queries)
        for query in range(n_queries):
            for neighbour in knns[query]:
                y_mean[query] += self.y[neighbour]
            y_mean[query] /= self.K

        return y_mean, knns

In [6]:
def isNaN(num):
    """Return the result of comparing ``num`` with itself for inequality.

    NaN is the only float value that is unequal to itself, so for a float
    this is true exactly when ``num`` is NaN.
    """
    differs_from_itself = num != num
    return differs_from_itself

In [7]:
# cross-validation for regions (two symptoms/features)
def completeRegionKNN(symptom1,symptom2, printStuff):
    """Evaluate 3-nearest-neighbour regression on two symptoms for each region fold.

    Parameters
    ----------
    symptom1, symptom2 : int
        Offsets into the 24 feature columns (dataset columns 2..25) used as
        the two KNN features.
    printStuff : bool
        When true, plot each fold's train/test points and print the fold's
        regions and cost.

    Returns
    -------
    numpy.ndarray
        Length-6 array with one mean squared error per fold of
        ``trainSetRegionSplits``; ``np.inf`` where the prediction failed.
    """

    def _fold_arrays(rows):
        # Turn one fold into (features, labels, column-0 values in first-seen order).
        rows = np.asarray(rows)
        if rows.ndim != 2:
            # An empty fold has no columns to slice.
            return np.zeros([0, 24]), np.zeros(0), []
        features = rows[:, 2:26].astype(float)
        labels = rows[:, 26].astype(float)
        # Convert BEFORE testing for NaN: a textual 'nan' cell is equal to
        # itself, so checking the raw cell lets missing labels through.
        labels = np.where(np.isnan(labels), 0.0, labels)
        return features, labels, list(dict.fromkeys(rows[:, 0]))

    costs = np.zeros(6)

    firstSymptomIndex = symptom1
    secondSymptomIndex = symptom2

    colHeaders = mergedDatasetData[0]

    for index in range(trainSetRegionSplits.size):
        # Arrays are rebuilt per fold so no rows from a previous fold linger
        # when fold sizes differ.
        x_train, y_train, trainingRegions = _fold_arrays(trainSetRegionSplits[index])
        x_test, y_test, testRegions = _fold_arrays(testSetRegionSplits[index])

        # plot training set and test set
        if(printStuff):
            plt.scatter(x_train[:,firstSymptomIndex], x_train[:,secondSymptomIndex], marker='o', label='train')
            plt.scatter(x_test[:,firstSymptomIndex], x_test[:,secondSymptomIndex], marker='s', label='test')
            plt.legend()
            plt.ylabel(colHeaders[2 + secondSymptomIndex])
            plt.xlabel(colHeaders[2 + firstSymptomIndex])
            plt.show()

        # Keep only test rows where both selected symptoms are present.
        usable = ~(np.isnan(x_test[:, firstSymptomIndex]) | np.isnan(x_test[:, secondSymptomIndex]))
        xTestFiltered = x_test[usable][:, [firstSymptomIndex, secondSymptomIndex]]
        yTestFiltered = y_test[usable]

        # predict hospitalizations and calculate cost
        model = KNN(K=3)

        try:
            if xTestFiltered.shape[0] == 0:
                raise ValueError("no test rows with both symptoms present")

            y_mean, knns = model.fit(x_train[:, [firstSymptomIndex, secondSymptomIndex]], y_train).predict(xTestFiltered)
            cost = np.mean(np.square(yTestFiltered - y_mean))
            costs[index]=cost
            if(printStuff):
                print('training regions:', trainingRegions)
                print('test regions:', testRegions)
                print('cost:', int(cost))
        except Exception:
            # Deliberate best effort: a fold that cannot be scored gets an
            # infinite cost. KeyboardInterrupt/SystemExit are no longer swallowed.
            if(printStuff):
                print("An exception occurred: cannot perform KNN with an empty training set or test set. Try again with a different combination of symptoms.")
            costs[index] = np.inf

    return costs
    
            
            

In [8]:
# Entry [i, j] holds the six per-fold costs for the symptom pair (i, j).
symptomsAmont = numCol - 3
costArrayRegionsKNN = np.zeros((symptomsAmont, symptomsAmont, 6))
for i in range(symptomsAmont):
    for j in range(symptomsAmont):
        if j == i:
            # A symptom is never paired with itself; the diagonal stays zero.
            continue
        costArrayRegionsKNN[i, j] = completeRegionKNN(i, j, False)

In [9]:
# Write one headerless, index-free CSV per fold.
for i in range(6):
    df = pd.DataFrame(costArrayRegionsKNN[..., i])
    df.to_csv(f"KNNRegionsCostFold{i+1}.csv", header=False, index=False)

### Time-based split

In [10]:
# cross-validation for time (two symptoms)
def completeDateKNN(symptom1,symptom2, printStuff):
    """Evaluate 3-nearest-neighbour regression on two symptoms for the time split.

    Parameters
    ----------
    symptom1, symptom2 : int
        Offsets into the 24 feature columns (dataset columns 2..25) used as
        the two KNN features.
    printStuff : bool
        When true, plot the train/test points and print the cost.

    Returns
    -------
    float
        Mean squared error of the predictions on the time-split test rows.

    Raises
    ------
    ValueError
        If no test row has both selected symptoms present.
    """
    firstSymptomIndex = symptom1
    secondSymptomIndex = symptom2

    colHeaders = mergedDatasetData[0]

    def _to_arrays(rows):
        # Features are columns 2..25; the label is column 26 with NaN replaced by 0.
        features = np.asarray(rows[:, 2:26], dtype=float)
        labels = np.asarray(rows[:, 26], dtype=float)
        return features, np.where(np.isnan(labels), 0.0, labels)

    x_train, y_train = _to_arrays(trainSetTimeSplits)
    x_test, y_test = _to_arrays(testSetTimeSplits)

    # plot training set and test set
    if(printStuff):
        plt.scatter(x_train[:,firstSymptomIndex], x_train[:,secondSymptomIndex], marker='o', label='train')
        plt.scatter(x_test[:,firstSymptomIndex], x_test[:,secondSymptomIndex], marker='s', label='test')
        plt.legend()
        plt.ylabel(colHeaders[2 + secondSymptomIndex])
        plt.xlabel(colHeaders[2 + firstSymptomIndex])
        plt.show()

    # Keep only test rows where both selected symptoms are present.
    usable = ~(np.isnan(x_test[:, firstSymptomIndex]) | np.isnan(x_test[:, secondSymptomIndex]))
    xTestFiltered = x_test[usable][:, [firstSymptomIndex, secondSymptomIndex]]
    yTestFiltered = y_test[usable]
    if xTestFiltered.shape[0] == 0:
        raise ValueError("no test rows with both symptoms present")

    # predict hospitalizations and calculate cost
    model = KNN(K=3)
    y_mean, knns = model.fit(x_train[:, [firstSymptomIndex, secondSymptomIndex]], y_train).predict(xTestFiltered)

    cost = np.mean(np.square(yTestFiltered - y_mean))

    # The report must come before the return; it used to sit after it and never ran.
    if(printStuff):
        print('cost:', int(cost))

    return cost

In [11]:
# Entry [i, j] holds the time-split cost for the symptom pair (i, j).
costArrayDateKNN = np.zeros((symptomsAmont, symptomsAmont))
for i in range(symptomsAmont):
    for j in range(symptomsAmont):
        if j == i:
            # A symptom is never paired with itself; the diagonal stays zero.
            continue
        costArrayDateKNN[i, j] = completeDateKNN(i, j, False)

In [12]:
# Persist the pairwise time-split costs without header or index.
dm = pd.DataFrame(costArrayDateKNN)
dm.to_csv('KNNDateCost.csv', header=False, index=False)

## Regression Trees
### Region-based split

In [13]:
class Node:
    """One node of the regression tree.

    Parameters
    ----------
    data_indices : array-like
        Indices of the training rows that reach this node.
    outcome : float or None
        Value predicted when this node is a leaf; stored as ``class_prob``.
    parent : Node or None
        Parent node. When given, ``depth`` is the parent's depth plus one and
        ``data`` / ``labels`` are shared with the parent.
    """

    def __init__(self, data_indices,outcome, parent):
        self.data_indices = data_indices
        self.left = None
        self.right = None
        self.split_feature = None
        self.split_value = None
        # Always defined, so reading it on a root that was never split no
        # longer raises AttributeError.
        self.class_prob = outcome
        if parent is not None:
            self.depth = parent.depth + 1
            self.data = parent.data
            self.labels = parent.labels
        else:
            # Root defaults; the code that builds the root assigns the real
            # data and labels afterwards.
            self.depth = 0
            self.data = None
            self.labels = None


In [14]:
def greedy_test(node, cost_fn):
    """Find the single-feature threshold split with the lowest weighted cost.

    Candidate thresholds for each feature are the midpoints between
    consecutive sorted values of that feature among the node's rows.

    Parameters
    ----------
    node : object
        Must expose ``data`` (2-D array), ``labels`` (1-D array) and
        ``data_indices`` (array of row indices belonging to the node).
    cost_fn : callable
        Maps an array of labels to a scalar cost.

    Returns
    -------
    tuple
        ``(best_cost, best_feature, best_value)``; ``(np.inf, None, None)``
        when no threshold separates the node's rows into two non-empty sides.
    """
    best_cost = np.inf
    best_feature, best_value = None, None

    node_data = node.data[node.data_indices]
    # Weight by the number of rows in THIS node. Using the size of the whole
    # training matrix mis-scaled the returned cost for every non-root node
    # (the chosen split was unaffected, since the factor is constant per node).
    num_instances, num_features = node_data.shape

    data_sorted = np.sort(node_data, axis=0)
    test_candidates = (data_sorted[1:] + data_sorted[:-1]) / 2.

    for f in range(num_features):
        data_f = node_data[:, f]
        # Repeated midpoints give identical splits; evaluate each threshold once.
        for test in np.unique(test_candidates[:, f]):
            goes_left = data_f <= test
            left_indices = node.data_indices[goes_left]
            right_indices = node.data_indices[~goes_left]
            if len(left_indices) == 0 or len(right_indices) == 0:
                continue

            left_cost = cost_fn(node.labels[left_indices])
            right_cost = cost_fn(node.labels[right_indices])
            num_left, num_right = left_indices.shape[0], right_indices.shape[0]
            cost = (num_left * left_cost + num_right * right_cost)/num_instances
            # Strict comparison: on ties the first (lowest) feature/threshold wins.
            if cost < best_cost:
                best_cost = cost
                best_feature = f
                best_value = test
    return best_cost, best_feature, best_value

In [15]:
def cost_misclassification(labels):
    """Mean squared deviation of ``labels`` from their average.

    Despite the name, this is the variance-style cost used for regression.

    Raises
    ------
    ValueError
        If ``labels`` is empty (previously a ZeroDivisionError).
    """
    labels = np.asarray(labels, dtype=float)
    if labels.size == 0:
        raise ValueError("cost_misclassification requires at least one label")
    # Vectorised: this runs once per candidate split, so a per-label Python
    # loop dominated the tree-fitting time.
    return np.mean(np.square(labels - np.average(labels)))

def cost_entropy(labels):
    """Shannon entropy (natural log) of the class frequencies in ``labels``.

    Labels are cast to integers element-wise before counting; the previous
    ``int(labels)`` call raised for any array with more than one element.
    """
    class_ids = np.asarray(labels).astype(int).ravel()
    class_probs = np.bincount(class_ids) / len(class_ids)
    class_probs = class_probs[class_probs > 0]              # skip empty classes: log(0) is undefined
    return -np.sum(class_probs * np.log(class_probs))

def cost_gini_index(labels):
    """Gini impurity of the class frequencies in ``labels``.

    Labels are cast to integers element-wise before counting, so
    float-typed class ids are accepted just as in ``cost_entropy``
    (``np.bincount`` rejects float input).
    """
    class_ids = np.asarray(labels).astype(int).ravel()
    class_probs = np.bincount(class_ids) / len(class_ids)
    return 1 - np.sum(np.square(class_probs))

In [16]:
class DecisionTree:
    """Container for the tree's hyper-parameters and root node.

    ``fit`` and ``predict`` are placeholders here; the notebook assigns the
    real implementations onto the class in later cells.
    """

    def __init__(self, num_classes=None, max_depth=3, cost_fn=cost_misclassification, min_leaf_instances=1):
        # num_classes is accepted but neither stored nor used.
        self.root = None
        self.cost_fn = cost_fn
        self.max_depth, self.min_leaf_instances = max_depth, min_leaf_instances

    def fit(self, data, labels):
        """Placeholder; does nothing and returns None."""

    def predict(self, data_test):
        """Placeholder; does nothing and returns None."""

In [17]:
def fit(self, data, labels):
    """Fit the tree on the rows of ``data`` that contain no NaN.

    Parameters
    ----------
    data : 2-D array-like
        One row per sample, one column per feature.
    labels : 1-D array-like
        Target value for each row of ``data``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional or no NaN-free row remains.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    if data.ndim != 2:
        raise ValueError("cannot fit a tree: data must be a non-empty 2-D array")

    # A value is NaN exactly when it differs from itself; keep rows with none.
    complete_rows = ~(data != data).any(axis=1)
    if not complete_rows.any():
        raise ValueError("cannot fit a tree: no training row is free of NaN")

    self.data = data[complete_rows]
    self.labels = labels[complete_rows]

    self.root = Node(np.arange(self.data.shape[0]),None, None)
    self.root.data = self.data
    self.root.labels = self.labels
    self.root.depth = 0
    # Give the root a prediction of its own (the mean label), so a tree that
    # never splits can still predict instead of failing on a missing attribute.
    self.root.class_prob = np.sum(self.labels)/self.labels.shape[0]
    self._fit_tree(self.root)
    return self

def _fit_tree(self, node):
    """Recursively split ``node`` until a stopping rule applies.

    Stops when the node is at ``max_depth``, holds at most
    ``min_leaf_instances`` rows, or no finite-cost split exists. Each child
    is created with the mean label of its rows as its prediction.
    """
    depth_exhausted = node.depth == self.max_depth
    too_few_rows = len(node.data_indices) <= self.min_leaf_instances
    if depth_exhausted or too_few_rows:
        return

    cost, split_feature, split_value = greedy_test(node, self.cost_fn)
    if np.isinf(cost):
        return

    node.split_feature, node.split_value = split_feature, split_value

    # Partition this node's rows on the chosen threshold.
    goes_left = node.data[node.data_indices,split_feature] <= split_value
    left_indices = node.data_indices[goes_left]
    right_indices = node.data_indices[np.logical_not(goes_left)]

    left_mean = np.sum(node.labels[left_indices])/left_indices.shape[0]
    right_mean = np.sum(node.labels[right_indices])/right_indices.shape[0]

    left = Node(left_indices, left_mean, node)
    right = Node(right_indices, right_mean, node)

    for child in (left, right):
        self._fit_tree(child)

    # Children are attached only after both subtrees are built.
    node.left = left
    node.right = right

# Attach the module-level implementations above as DecisionTree methods.
DecisionTree._fit_tree = _fit_tree
DecisionTree.fit = fit

In [18]:
def predict(self, data_test):
    """Return the leaf prediction for every row of ``data_test``.

    Each row is routed from the root: left when its value at the node's
    ``split_feature`` is <= ``split_value``, right otherwise, until a node
    without a left child is reached; that node's ``class_prob`` is the result.
    """
    def _leaf_value(sample):
        current = self.root
        while current.left:
            at_or_below = sample[current.split_feature] <= current.split_value
            current = current.left if at_or_below else current.right
        return current.class_prob

    class_probs = np.zeros(data_test.shape[0])
    for position, sample in enumerate(data_test):
        class_probs[position] = _leaf_value(sample)
    return class_probs

# Attach the module-level predict above as the DecisionTree method.
setattr(DecisionTree, "predict", predict)

In [19]:
def makeTree(x_train, y_train, xTestFiltered, depth, symptom1, symptom2):
    """Fit a depth-limited tree on two columns of ``x_train`` and predict.

    The tree is trained on columns ``symptom1`` and ``symptom2`` of
    ``x_train`` against ``y_train`` and the predictions for
    ``xTestFiltered`` are returned.
    """
    two_features = np.array([x_train[:,symptom1],x_train[:,symptom2]]).T
    regressor = DecisionTree(max_depth=depth, cost_fn=cost_misclassification)
    fitted = regressor.fit(two_features, y_train)
    return fitted.predict(xTestFiltered)

In [20]:
def completeRegionsTree(symptom1,symptom2, printStuff):
    """Evaluate a depth-4 regression tree on two symptoms for each region fold.

    Parameters
    ----------
    symptom1, symptom2 : int
        Offsets into the 24 feature columns (dataset columns 2..25) used as
        the two tree features.
    printStuff : bool
        When true, print the symptom names, the fold's regions and its cost.

    Returns
    -------
    numpy.ndarray
        Length-6 array with one mean squared error per fold, truncated to an
        integer; ``np.inf`` where the prediction failed.
    """

    def _fold_arrays(rows):
        # Turn one fold into (features, labels, column-0 values in first-seen order).
        rows = np.asarray(rows)
        if rows.ndim != 2:
            # An empty fold has no columns to slice.
            return np.zeros([0, 24]), np.zeros(0), []
        features = rows[:, 2:26].astype(float)
        labels = rows[:, 26].astype(float)
        # Convert BEFORE testing for NaN: a textual 'nan' cell is equal to
        # itself, so checking the raw cell lets missing labels through.
        labels = np.where(np.isnan(labels), 0.0, labels)
        return features, labels, list(dict.fromkeys(rows[:, 0]))

    costs = np.zeros(6)

    firstSymptomIndex = symptom1
    secondSymptomIndex = symptom2

    # Needed by the printStuff branches below; it was never defined in this
    # function, so printing raised NameError (also inside the except branch).
    colHeaders = mergedDatasetData[0]

    for index in range(trainSetRegionSplits.shape[0]):
        # Arrays are rebuilt per fold so no rows from a previous fold linger
        # when fold sizes differ.
        x_train, y_train, trainingRegions = _fold_arrays(trainSetRegionSplits[index])
        x_test, y_test, testRegions = _fold_arrays(testSetRegionSplits[index])

        # Keep only test rows where both selected symptoms are present.
        usable = ~(np.isnan(x_test[:, firstSymptomIndex]) | np.isnan(x_test[:, secondSymptomIndex]))
        xTestFiltered = x_test[usable][:, [firstSymptomIndex, secondSymptomIndex]]
        yTestFiltered = y_test[usable]

        try:
            if xTestFiltered.shape[0] == 0:
                raise ValueError("no test rows with both symptoms present")

            probs_test = makeTree(x_train, y_train, xTestFiltered, 4, symptom1, symptom2)

            cost = np.mean(np.square(yTestFiltered - probs_test))
            costs[index] = int(cost)
            if(printStuff):
                print('symptom 1:', colHeaders[2 + firstSymptomIndex])
                print('symptom 2:', colHeaders[2 + secondSymptomIndex])
                print('train regions:', trainingRegions)
                print('test regions:', testRegions)
                print("cost:",int(cost))
                print("\n")
        except Exception:
            # Deliberate best effort: a fold that cannot be scored gets an
            # infinite cost. KeyboardInterrupt/SystemExit are no longer swallowed.
            if(printStuff):
                print('symptom 1:', colHeaders[2 + firstSymptomIndex])
                print('symptom 2:', colHeaders[2 + secondSymptomIndex])
                print('train regions:', trainingRegions)
                print('test regions:', testRegions)
                print("An exception occurred: cannot perform regression trees with an empty training set or test set. Try again with a different combination of symptoms.")
                print("\n")
            costs[index] = np.inf

    return costs

In [21]:
##only run this if you want to wait 10 minutes

# Entry [i, j] holds the six per-fold tree costs for the symptom pair (i, j).
symptomsAmont = numCol - 3
costArrayRegionsTree = np.zeros((symptomsAmont, symptomsAmont, 6))
for i in range(symptomsAmont):
    for j in range(symptomsAmont):
        if j == i:
            # A symptom is never paired with itself; the diagonal stays zero.
            continue
        costArrayRegionsTree[i, j] = completeRegionsTree(i, j, False)

In [22]:

# Write one headerless, index-free CSV per fold.
for i in range(6):
    df = pd.DataFrame(costArrayRegionsTree[..., i])
    df.to_csv(f"regressionTreeRegionsCostFold{i+1}.csv", header=False, index=False)

### Time-based split

In [23]:
def completeDateTree(symptom1,symptom2, printStuff):
    """Evaluate a depth-4 regression tree on two symptoms for the time split.

    Parameters
    ----------
    symptom1, symptom2 : int
        Offsets into the 24 feature columns (dataset columns 2..25) used as
        the two tree features.
    printStuff : bool
        When true, print the cost or the failure message.

    Returns
    -------
    int or float
        Mean squared error on the time-split test rows, truncated to an
        integer, or ``np.inf`` when the prediction failed.
    """
    firstSymptomIndex = symptom1
    secondSymptomIndex = symptom2

    def _to_arrays(rows):
        # Features are columns 2..25; the label is column 26 with NaN replaced by 0.
        features = np.asarray(rows[:, 2:26], dtype=float)
        labels = np.asarray(rows[:, 26], dtype=float)
        return features, np.where(np.isnan(labels), 0.0, labels)

    x_train, y_train = _to_arrays(trainSetTimeSplits)
    x_test, y_test = _to_arrays(testSetTimeSplits)

    # Keep only test rows where both selected symptoms are present.
    usable = ~(np.isnan(x_test[:, firstSymptomIndex]) | np.isnan(x_test[:, secondSymptomIndex]))
    xTestFiltered = x_test[usable][:, [firstSymptomIndex, secondSymptomIndex]]
    yTestFiltered = y_test[usable]

    try:
        if xTestFiltered.shape[0] == 0:
            raise ValueError("no test rows with both symptoms present")

        probs_test = makeTree(x_train, y_train, xTestFiltered, 4, symptom1, symptom2)

        cost = np.mean(np.square(yTestFiltered - probs_test))

        if(printStuff):
            print("cost:",cost)
        return int(cost)
    except Exception:
        if(printStuff):
            print("An exception occurred: cannot perform regression trees with an empty training set or test set. Try again with a different combination of symptoms.")
        # Returned regardless of printStuff; it used to sit inside the `if`,
        # so a silent failure returned None instead of an infinite cost.
        return np.inf
    

In [24]:
##only run this if you want to wait 45 seconds
# Entry [i, j] holds the time-split tree cost for the symptom pair (i, j).
symptomsAmont = numCol - 3
costArrayDateTree = np.zeros((symptomsAmont, symptomsAmont))
for i in range(symptomsAmont):
    for j in range(symptomsAmont):
        if j == i:
            # A symptom is never paired with itself; the diagonal stays zero.
            continue
        costArrayDateTree[i, j] = completeDateTree(i, j, False)

In [25]:
# Persist the pairwise time-split tree costs without header or index.
dm = pd.DataFrame(costArrayDateTree)
dm.to_csv('regressionTreeDateCost.csv', header=False, index=False)