Import required libraries

In [36]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

Load the Excel sheets as pandas DataFrames

In [37]:
# Directory that holds ProcessedData.xlsx. It defaults to the original author's
# folder; set the FLU_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get(
    'FLU_DATA_DIR',
    r'C:\Users\hopem\OneDrive\Desktop\Spring 2020\Biomedical Health Informatics\Project\Data',
)
# The working directory is still switched so that any other relative paths in
# the notebook keep resolving against the data folder.
os.chdir(DATA_DIR)

# Passing a list of sheet names parses the workbook once and returns a dict
# keyed by sheet name, instead of opening the file once per sheet.
sheets = pd.read_excel('ProcessedData.xlsx', sheet_name = ['Norway', 'Australia'])
df_N = sheets['Norway']
df_A = sheets['Australia']

## Baseline Model

Test a model that uses only the current week's climate and Google data, without incorporating flu counts from past weeks.

### Norway Baseline

Remove rows with empty values and split into features and total flu values

In [None]:
# Discard the first eight rows of the Norway sheet (presumably the rows with
# empty values mentioned in the section note -- verify against the workbook).
df_N_base = df_N.drop(index = df_N.index[range(0, 8)])
n, m = len(df_N_base.index), len(df_N_base.columns)

# Target: positional column 4. Features: positional columns 1 and 7.
# NOTE(review): the column meanings are not visible here -- confirm that these
# positions are the flu total and the climate / Google predictors.
y_N_base = df_N_base.iloc[:, 4]
X_N_base = df_N_base.iloc[:, [1, 7]]

Split into test and train

In [None]:
# Shuffle the rows and keep 84 for training and 20 for testing.
# random_state pins the shuffle so the split, and every metric computed from
# it, is identical after Restart Kernel -> Run All.
X_train_N_base, X_test_N_base, y_train_N_base, y_test_N_base = train_test_split(
    X_N_base, y_N_base, train_size = 84, test_size = 20, random_state = 42
)

Compute the out-of-bag (OOB) score for a range of n_estimators values

In [None]:
# Sweep the forest size. For every candidate, record the out-of-bag score
# (scikit-learn's oob_score_ is an R^2, so higher is better) and the mean
# squared error on the held-out test rows.
nums = [10, 100, 500, 1000, 2000, 3000, 4000, 5000, 10000]
oob_list_N_base = []
mse_list_N_base = []
for n_trees in nums:
    # random_state makes the scores repeatable between runs; n_jobs=-1 builds
    # the trees on all cores and does not change the fitted forest.
    rf_N_base_opt = RandomForestRegressor(
        n_estimators = n_trees, oob_score = True, random_state = 42, n_jobs = -1
    )
    rf_N_base_opt.fit(X_train_N_base, y_train_N_base)
    oob_list_N_base.append(rf_N_base_opt.oob_score_)

    # Test-set MSE for this forest size
    predictions_N_base = rf_N_base_opt.predict(X_test_N_base)
    errors_N_base = (predictions_N_base - y_test_N_base)**2
    MSE_N_base = np.mean(errors_N_base)
    mse_list_N_base.append(MSE_N_base)

Find the optimal n_estimators value

In [None]:
# oob_score_ is a goodness-of-fit score (R^2), not an error, so the best
# candidate is the one with the largest value. The printed label says so
# explicitly to avoid reading the argmax as "largest error".
max_ind_N_base = np.argmax(oob_list_N_base)
print('Best n_estimator is:', nums[max_ind_N_base], 'with OOB R^2 score:', oob_list_N_base[max_ind_N_base], 'and MSE:', mse_list_N_base[max_ind_N_base])

Train random forest model with optimal n_estimator to determine R^2 and MSE

In [None]:
# Refit with the forest size chosen by the sweep rather than a hand-copied
# number, so this cell cannot go stale when the sweep result changes.
best_n_N_base = nums[max_ind_N_base]
rf_N_base = RandomForestRegressor(
    n_estimators = best_n_N_base, oob_score = True, random_state = 42, n_jobs = -1
)
rf_N_base.fit(X_train_N_base, y_train_N_base)

# Mean squared error on the held-out test rows
predictions_N_base = rf_N_base.predict(X_test_N_base)
errors_N_base = (predictions_N_base - y_test_N_base)**2
MSE_N_base = np.mean(errors_N_base)
print('Mean Square Error:', MSE_N_base)

# oob_score_ is an R^2 computed on out-of-bag training samples (higher is better)
print('OOB R^2 score is:', rf_N_base.oob_score_)

# find r squared
# NOTE: this is the squared Pearson correlation between actual and predicted
# values. It ignores bias and scale errors, so it can differ from the
# coefficient of determination returned by rf_N_base.score(...).
r_N_base = np.corrcoef(y_test_N_base,predictions_N_base)
corr_N_base = r_N_base[0,1]
r2_N_base = corr_N_base**2
print('R Squared is:', r2_N_base)

In [None]:
# train_test_split shuffles the rows, so the held-out samples arrive in random
# order. Put them back in original row order before drawing lines; otherwise
# neighbouring points on the curve are unrelated weeks.
# NOTE(review): assumes the sheet's row order is chronological -- confirm.
order_N_base = np.argsort(y_test_N_base.index.to_numpy())
weeks_N_base = range(len(order_N_base))

fig, ax = plt.subplots()
ax.set_title("Actual vs. Predicted Flu Count values for Norway Baseline")
ax.set_xlabel("Week")
ax.set_ylabel("Flu Counts Per Week")
ax.plot(weeks_N_base, y_test_N_base.to_numpy()[order_N_base], label = 'Actual')
ax.plot(weeks_N_base, np.asarray(predictions_N_base)[order_N_base], label = 'Predicted')
ax.legend()
plt.show()

### Australia Baseline

In [None]:
# Discard the first eight rows of the Australia sheet (presumably the rows
# with empty values, as in the Norway section -- verify against the workbook).
df_A_base = df_A.drop(index = df_A.index[range(0, 8)])
n, m = len(df_A_base.index), len(df_A_base.columns)

# Target: positional column 4. Features: positional columns 1 and 7.
# NOTE(review): the column meanings are not visible here -- confirm that these
# positions are the flu total and the climate / Google predictors.
y_A_base = df_A_base.iloc[:, 4]
X_A_base = df_A_base.iloc[:, [1, 7]]

Split into test and train

In [None]:
# Shuffle the rows and keep 84 for training and 20 for testing.
# random_state pins the shuffle so the split, and every metric computed from
# it, is identical after Restart Kernel -> Run All.
X_train_A_base, X_test_A_base, y_train_A_base, y_test_A_base = train_test_split(
    X_A_base, y_A_base, train_size = 84, test_size = 20, random_state = 42
)

Compute the out-of-bag (OOB) score for a range of n_estimators values

In [None]:
# Sweep the forest size. For every candidate, record the out-of-bag score
# (scikit-learn's oob_score_ is an R^2, so higher is better) and the mean
# squared error on the held-out test rows.
nums = [10, 100, 500, 1000, 2000, 3000, 4000, 5000, 10000]
oob_list_A_base = []
mse_list_A_base = []
for n_trees in nums:
    # random_state makes the scores repeatable between runs; n_jobs=-1 builds
    # the trees on all cores and does not change the fitted forest.
    rf_A_base_opt = RandomForestRegressor(
        n_estimators = n_trees, oob_score = True, random_state = 42, n_jobs = -1
    )
    rf_A_base_opt.fit(X_train_A_base, y_train_A_base)
    oob_list_A_base.append(rf_A_base_opt.oob_score_)

    # Test-set MSE for this forest size
    predictions_A_base = rf_A_base_opt.predict(X_test_A_base)
    errors_A_base = (predictions_A_base - y_test_A_base)**2
    MSE_A_base = np.mean(errors_A_base)
    mse_list_A_base.append(MSE_A_base)

Find the optimal n_estimators value

In [None]:
# oob_score_ is a goodness-of-fit score (R^2), not an error, so the best
# candidate is the one with the largest value. The printed label says so
# explicitly to avoid reading the argmax as "largest error".
max_ind_A_base = np.argmax(oob_list_A_base)
print('Best n_estimator is:', nums[max_ind_A_base], 'with OOB R^2 score:', oob_list_A_base[max_ind_A_base], 'and MSE:', mse_list_A_base[max_ind_A_base])

Train random forest model with optimal n_estimator to determine R^2 and MSE

In [None]:
# Refit with the forest size chosen by the sweep rather than a hand-copied
# number, so this cell cannot go stale when the sweep result changes.
best_n_A_base = nums[max_ind_A_base]
rf_A_base = RandomForestRegressor(
    n_estimators = best_n_A_base, oob_score = True, random_state = 42, n_jobs = -1
)
rf_A_base.fit(X_train_A_base, y_train_A_base)

# Mean squared error on the held-out test rows
predictions_A_base = rf_A_base.predict(X_test_A_base)
errors_A_base = (predictions_A_base - y_test_A_base)**2
MSE_A_base = np.mean(errors_A_base)
print('Mean Square Error:', MSE_A_base)

# oob_score_ is an R^2 computed on out-of-bag training samples (higher is better)
print('OOB R^2 score is:', rf_A_base.oob_score_)

# find r squared
# NOTE: this is the squared Pearson correlation between actual and predicted
# values. It ignores bias and scale errors, so it can differ from the
# coefficient of determination returned by rf_A_base.score(...).
r_A_base = np.corrcoef(y_test_A_base,predictions_A_base)
corr_A_base = r_A_base[0,1]
r2_A_base = corr_A_base**2
print('R Squared is:', r2_A_base)

In [None]:
# train_test_split shuffles the rows, so the held-out samples arrive in random
# order. Put them back in original row order before drawing lines; otherwise
# neighbouring points on the curve are unrelated weeks.
# NOTE(review): assumes the sheet's row order is chronological -- confirm.
order_A_base = np.argsort(y_test_A_base.index.to_numpy())
weeks_A_base = range(len(order_A_base))

fig, ax = plt.subplots()
ax.set_title("Actual vs. Predicted Flu Count values for Australia Baseline")
ax.set_xlabel("Week")
ax.set_ylabel("Flu Counts Per Week")
ax.plot(weeks_A_base, y_test_A_base.to_numpy()[order_A_base], label = 'Actual')
ax.plot(weeks_A_base, np.asarray(predictions_A_base)[order_A_base], label = 'Predicted')
ax.legend()
plt.show()