# UHS Linear Regression

Let's fit a linear model to the data from the University Health Survey. We will use a 1D model to predict restfulness (quality of sleep) from food healthiness.

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import pickle

# Choose which variables to include in each analysis
input_variables = ["Food Healthiness"]

# Load preprocessed data
filename = '../data/data_combined_final.csv'
df = pd.read_csv(filename)
df_food = df[["Food Healthiness", "Restfulness"]]
# Drop rows with NaNs
df_food = df_food.dropna()

# Next fit the model
# Choose input and output data
x = df_food.drop(["Restfulness"], axis = 1)
y = df_food.drop(["Food Healthiness"], axis = 1)

# Split the data into train and test sets
test_size = 0.1
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size)
# Fit the model with train data
model_rest = LinearRegression().fit(x_train, y_train)
# Make predictions with test data
pred_rest = model_rest.predict(x_test)

Use RMS to measure the accuracy of the model.

In [14]:
def avg_root_mean_square(true, pred):
    true = np.array(true)
    pred = np.array(pred)
    assert len(true) == len(pred)
    n = len(true)
    rms = 0
    for i in range(n):
        rms += np.sqrt((true[i] - pred[i])**2)
    return rms / n

rms_deviation_rest = avg_root_mean_square(y_test, pred_rest)
print("Restfulness model RMS deviation:", rms_deviation_rest[0])
# Coefficienct of determination for the model
print("Restfulness model R^2:", model_rest.score(x_train, y_train))

Restfulness model RMS deviation: 0.5097614985251798
Restfulness model R^2: 1.6019262150113178e-05


Save with pickle.

In [15]:
with open('../data/models/uhs_model_rest.pkl', 'wb') as f:
    pickle.dump(model_rest, f)