In [24]:
import pandas as pd
import torch
import numpy as np
from matplotlib import pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Apply the 'ggplot' style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')
# Default figure size as (width, height), in inches.
plt.rcParams["figure.figsize"] = (15, 8)

### Basic Evaluation Functions

In [2]:
def r2_score(testY, predictedY):
    """Return the coefficient of determination (R^2) of a set of predictions.

    R^2 = (SS_tot - SS_res) / SS_tot, where SS_tot is the squared deviation of
    the true values from their mean and SS_res is the squared prediction error.

    Both inputs are flattened before they are compared, so a column vector of
    shape (N, 1) and a flat vector of shape (N,) are matched element by element
    instead of being broadcast into an (N, N) matrix.

    Args:
        testY: Ground-truth values; a torch tensor or anything
            ``np.asarray`` accepts (NumPy array, pandas Series, list).
        predictedY: Predicted values with the same number of elements.

    Returns:
        The R^2 score as a Python float. When ``testY`` is constant SS_tot is
        zero and the result is ``nan`` or ``-inf``.
    """
    def _flatten(values):
        # Detach so scoring never extends an autograd graph, and compute in
        # float64 regardless of the dtype the caller trained with.
        if torch.is_tensor(values):
            flat = values.detach()
        else:
            flat = torch.as_tensor(np.asarray(values))
        return flat.reshape(-1).to(torch.float64)

    actual = _flatten(testY)
    predicted = _flatten(predictedY)

    ss_total = ((actual - actual.mean()) ** 2).sum()
    ss_residual = ((actual - predicted) ** 2).sum()
    return ((ss_total - ss_residual) / ss_total).item()

### Loading in Data

In [3]:
TARGET_COLUMN = "DTH_CUM_CP"  # the column every model in this notebook predicts


def to_feature_target_tensors(frame):
    """Split a DataFrame into ``(features, target)`` float64 tensors.

    Every column except ``TARGET_COLUMN`` is treated as a feature. The target
    is returned as a column of shape (rows, 1). Train and test frames go
    through this same path so both are converted with identical precision.
    """
    features = frame.drop(columns=[TARGET_COLUMN]).to_numpy(dtype=np.float64)
    target = frame[[TARGET_COLUMN]].to_numpy(dtype=np.float64)
    return (
        torch.tensor(features, dtype=torch.float64),
        torch.tensor(target, dtype=torch.float64),
    )


train = pd.read_csv("train.csv")  # reading in train data
test = pd.read_csv("test.csv")  # reading in test data
# ignore_index gives the combined frame unique row labels instead of repeating
# the labels of both source files.
data = pd.concat([train, test], ignore_index=True)

trainX, trainY = to_feature_target_tensors(train)
testX, testY = to_feature_target_tensors(test)

print("Length of Training Set: ", len(trainX))
print("Length of Test Set: ", len(testX))

Length of Training Set:  1044
Length of Test Set:  348


## Exploring Various Machine Learning Models

### Hardcoded Model

In [4]:
# Move the data to the GPU when one is available.
if torch.cuda.is_available():
    print("Now running on GPU")
    testX = testX.to("cuda")
    testY = testY.to("cuda")
    trainX = trainX.to("cuda")
    trainY = trainY.to("cuda")

# Hand-picked weights, one row per feature column.
# The coefficients must share testX's dtype AND device, otherwise the matrix
# product below fails as soon as the data has been moved to the GPU.
coef = torch.tensor([   # The assumption is that death rate of covid is higher for people of older age
        [0.0040], # the death rate in age group 1 is 0.4%
        [0.0040],
        [0.0040],
        [0.0040],
        [0.0040],
        [0.0040], # POS_50_59_CP
        [0.0300], # POS_60_69_CP
        [0.0300],
        [0.0300],
        [0.0300]
], dtype=testX.dtype, device=testX.device)

resultOne = testX[0] @ coef # predicted number of deaths in the first census tract
predictedY = testX @ coef
print("R2 Score:", round(r2_score(testY, predictedY), 3))
# scikit-learn converts its inputs to NumPy arrays, which must be in host
# memory, so copy the tensors back from the GPU first (a no-op on CPU).
print("MSE:", round(mean_squared_error(testY.cpu().numpy(), predictedY.cpu().numpy()), 3))

R2 Score: 0.411
MSE: 39.71


### Linear Regression Model

In [5]:
# torch.nn.Linear creates float32 parameters by default, so cast the data to match.
trainX = trainX.to(torch.float32)
trainY = trainY.to(torch.float32)
testX = testX.to(torch.float32)
testY = testY.to(torch.float32)

# Fixed seed so the weight initialisation and batch shuffling are reproducible.
torch.manual_seed(32)

ds = torch.utils.data.TensorDataset(trainX, trainY) # tensor dataset
dl = torch.utils.data.DataLoader(ds, batch_size=40, shuffle=True) # tensor dataloader
# One input per feature column, one output. The model has to live on the same
# device as the data, which may already have been moved to the GPU.
model = torch.nn.Linear(trainX.shape[1], 1).to(trainX.device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.000001)
loss_function = torch.nn.MSELoss()

start = time.time()
for epoch in range(200):
    for batchX, batchY in dl:
        optimizer.zero_grad() # PyTorch accumulates gradients, so clear the previous batch's first.
        predY = model(batchX) # Use the model to make predictions for this batch.
        loss = loss_function(predY, batchY) # MSELoss takes (input, target).
        loss.backward() # Backpropagate: gradient of the loss with respect to each parameter.
        optimizer.step() # Apply the gradients to the parameters.

end = time.time()
print("Training took", end-start, "seconds")

# Evaluation needs no gradients; skip building the autograd graph.
with torch.no_grad():
    pred = model(testX)
print("R2 Score:", round(r2_score(testY, pred), 3))
print("MSE:", round(mean_squared_error(testY.tolist(), pred.tolist()), 3))

Training took 1.8601210117340088 seconds
R2 Score: 0.525
MSE: 32.012


### Decision Tree

In [23]:
# NOTE(review): this cell re-splits the combined train+test frame 80/20, so the
# held-out rows differ from the `test` file the torch models were scored on.
# The scores are therefore not directly comparable; confirm this is intended.
X = data.drop(columns='DTH_CUM_CP')
Y = data['DTH_CUM_CP']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=32)

# random_state pins the tie-breaking between equally good splits, so the
# fitted tree (and the metrics below) are the same on every run.
model_tree = DecisionTreeRegressor(max_depth=5, random_state=32)
model_tree.fit(X_train, Y_train)
pred = model_tree.predict(X_test)
print("R2 Score:", round(model_tree.score(X_test, Y_test), 3))
print("MSE:", round(mean_squared_error(Y_test, pred), 3))

R2 Score: 0.444
MSE: 41.909


### Random Forest

In [51]:
# random_state seeds the bootstrap sampling and per-split feature selection;
# without it the forest, and the metrics printed below, change on every run.
model_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=32)
model_forest.fit(X_train, Y_train)
pred = model_forest.predict(X_test)
print("R2 Score:", round(model_forest.score(X_test, Y_test), 3))
print("MSE:", round(mean_squared_error(Y_test, pred), 3))

R2 Score: 0.583
MSE: 31.387


2