In [1]:
# 5-fold cross validation on linear regression model of COVID data
# Justin Heyer

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [14]:
# Load the source spreadsheet, key the rows by country, and convert the
# over-65 share from percentage strings (e.g. "2.68%") to decimal fractions.
covid_df = (
    pd.read_excel('covid_testing_data.xlsx')
    .set_index("COUNTRY")
    .assign(**{
        # Converting column of percentages to column of decimals
        'PERSONS OVER AGE 65 IN POP.': lambda frame: (
            frame['PERSONS OVER AGE 65 IN POP.'].str.rstrip('%').astype('float') / 100.0
        ),
    })
)
covid_df.head()

Unnamed: 0_level_0,DEATHS/100K POP.,PERSONS OVER AGE 65 IN POP.,"HOSPITAL BEDS/1,000 PEOPLE"
COUNTRY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,20.04,0.0268,0.4
Albania,124.71,0.1303,2.9
Algeria,15.69,0.0617,1.9
Andorra,200.61,0.19475,2.5
Angola,5.83,0.023,0.8


In [9]:
# Defining the model: a scikit-learn LinearRegression with all default settings.
# The bare name on the last line makes the estimator's repr the cell output.
linear_regression = LinearRegression()
linear_regression

LinearRegression()

In [10]:
# Predictors (X): share of population over 65 and hospital beds per 1,000 people.
# Target (y): deaths per 100k population.
X = covid_df.loc[:, ['PERSONS OVER AGE 65 IN POP.', 'HOSPITAL BEDS/1,000 PEOPLE']]
y = covid_df.loc[:, 'DEATHS/100K POP.']

In [11]:
# Defining cross-validation method to be used:
# 5 shuffled folds; random_state is fixed so the fold assignment is
# reproducible from run to run.
cv = KFold(n_splits=5, random_state=1, shuffle=True)
cv

KFold(n_splits=5, random_state=1, shuffle=True)

In [7]:
# Function to get R^2 scores
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        Fitted in place, so the caller's object is left in its fitted state.
    X_train, y_train : training features and target passed to ``model.fit``.
    X_test, y_test : held-out features and target passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score`` returns for the test split (R^2 for the
    LinearRegression used in this notebook).
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [13]:
# 5-fold cross validation
# Each fold slices the X / y defined earlier by row position, so the features
# and target scored here always match those definitions (no hard-coded column
# positions into covid_df).
scores = []
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    scores.append(get_score(linear_regression, X_train, X_test, y_train, y_test))

# NOTE: get_score returns model.score(...), i.e. R^2 for LinearRegression;
# the "accuracy" wording in the printed labels refers to those R^2 values.
rounded_scores = [f"{num:.3f}" for num in scores]
print("5-fold cross validation accuracy scores: " + str(rounded_scores))
print("Cross validation accuracy: " + str(round(np.mean(scores), 3)))

5-fold cross validation accuracy scores: ['0.551', '0.184', '0.210', '0.518', '0.102']
Cross validation accuracy: 0.313
