In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
import math

import warnings
# NOTE(review): blanket filter -- from here on every warning category is hidden,
# including deprecation and numerical warnings raised by the libraries used below.
# Consider narrowing it (category=/module=) once the notebook is stable.
warnings.filterwarnings('ignore')

In [None]:
# setting up default plotting parameters
%matplotlib inline

plt.rcParams['figure.figsize'] = [20.0, 7.0]
plt.rcParams.update({'font.size': 22,})

sns.set_palette('viridis')
sns.set_style('white')
sns.set_context('talk', font_scale=0.8)

In [None]:
# Load the annual dataset and keep only the rows that have no missing values
annualData = pd.read_csv("resources/annualData.csv").dropna()

# Show the cleaned DataFrame
annualData

In [None]:
# Columns used as model inputs (x values)
feature_columns = [
    "wiVMT",
    "wiGAS",
    "wiPOP",
    "wiDENS",
    "wiBicycle",
    "wiCarpool",
    "wiDrovealone",
    "wiPublictransportation",
    "wiTaximotorcycleorother",
    "wiWalked",
    "wiWorkedathome",
    "wiLaws",
    "evSHARE",
]
features = annualData[feature_columns]
X = features
X.head()

In [None]:
# Target (y values), shaped as a single column: (n_samples, 1)
y = np.reshape(annualData["wiGHG"].values, (-1, 1))
print(X.shape, y.shape)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.25, random_state=27)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# function to get cross validation scores
def get_cv_scores(model, X=None, y=None, cv=5, scoring='r2'):
    """Print the mean and standard deviation of a model's cross-validation scores.

    Parameters
    ----------
    model : estimator
        Object passed straight to ``cross_val_score``.
    X, y : optional
        Data to cross-validate on. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which matches the original behaviour.
    cv : int, default 5
        Forwarded to ``cross_val_score`` as ``cv``.
    scoring : str, default 'r2'
        Forwarded to ``cross_val_score`` as ``scoring``.

    Returns
    -------
    None
        Results are printed only, so a trailing call in a cell shows no extra output.
    """
    # Fall back to the global training split so existing get_cv_scores(model) calls still work
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)

    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))
    print('\n')

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least squares model on the training split
lr = LinearRegression()
lr.fit(X_train, y_train)

# Print cross-validation mean / std for this model
get_cv_scores(lr)

In [None]:
# Score the fitted model on the data it saw and on the held-out split
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)

print('Train Score: ', train_score)
print('Test Score: ', test_score)

In [None]:
# coef_ holds the fitted weights, one per input feature. Because y was reshaped to
# a single column (n_samples, 1) before fitting, scikit-learn stores them as a 2-D
# array of shape (1, n_features) rather than a flat vector.
lr.coef_

In [None]:
# match column names to coefficients
# lr.coef_ is 2-D here (y was fit as a single column), so flatten it and pair each
# feature name with its own weight instead of printing the whole row every time.
for col, coef in zip(X_train.columns, np.ravel(lr.coef_)):
    print(f'{col}:  {coef}')

In [None]:
# intercept_ is the fitted bias term. Since y was fit as an (n_samples, 1) column,
# it comes back as a length-1 array here, not a bare float.
lr.intercept_

In [None]:
# mean squared error on the held-out test split
y_ = lr.predict(X_test)
# scikit-learn metrics are declared as (y_true, y_pred); pass them by keyword so the
# roles stay correct if this is ever swapped for a non-symmetric metric.
lr_mse = mean_squared_error(y_true=y_test, y_pred=y_)
lr_mse

In [None]:
# root mean squared error: the square root of the test MSE, expressed in the
# same units as the target
lr_rmse = math.sqrt(lr_mse)
lr_rmse

In [None]:
# Smallest and largest target value, one per line, to put the error above in context
print(y.min(), y.max(), sep='\n')