In [None]:
# Dependencies and setup (will need more)
import numpy as np
import os
import pandas as pd
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation warnings;
# consider narrowing it to specific categories.
warnings.simplefilter("ignore")

In [None]:
# Load the annual data set and discard every row that contains a missing value
annualData = pd.read_csv("resources/annualData.csv").dropna()

# Show the cleaned DataFrame as the cell output
annualData

In [None]:
# List every column name of the DataFrame, one per line
print(*annualData.columns, sep="\n")

# Multiple Linear Regression (South Dakota)

In [None]:
# Columns used as x values. Their order matters: the coefficient cell further
# down reads the fitted coefficients by position in this same order.
feature_columns = [
    "sdVMT",
    "sdGAS",
    "sdPOP",
    "sdDENS",
    "sdBicycle",
    "sdCarpool",
    "sdDrovealone",
    "sdPublictransportation",
    "sdTaximotorcycleorother",
    "sdWalked",
    "sdWorkedathome",
    "sdLaws",
    "evSHARE",
]

# Select the feature columns; X is the name the modelling cells use
features = annualData[feature_columns]
X = features
X.head()

### Create a Train-Test Split
Use sdGHG for the y values.

In [None]:
# Target values: the sdGHG column reshaped into a single-column 2-D array
ghg_values = annualData["sdGHG"].values
y = ghg_values.reshape(-1, 1)

# Sanity check: both shapes should report the same number of rows
print(X.shape, y.shape)

In [None]:
# Inspect the full target array (one sdGHG value per row)
print(y)

In [None]:
# Dependencies and setup
from sklearn.model_selection import train_test_split

# Split features and target into training and testing parts;
# the fixed random_state keeps the split the same on every run
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Preview the training features
X_train.head()

### Pre-Processing
Scale the data using the MinMaxScaler, fitting the scaler on the training split only. (No feature selection is performed in this section; all features are kept.)

In [None]:
# Dependencies and setup
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only
X_scale = MinMaxScaler()
X_scale.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scale.transform(split) for split in (X_train, X_test)
)

### Train the Linear Regression Model

In [None]:
# Dependencies and setup
from sklearn.linear_model import LinearRegression
# Linear regression estimator with default settings; it is fitted in the next cell
model = LinearRegression()

In [None]:
# Train the model on the scaled training split
model.fit(X_train_scaled, y_train)

# Score the fitted model on each split
training_score = model.score(X_train_scaled, y_train)
testing_score = model.score(X_test_scaled, y_test)

# Report both scores
for split_name, split_score in (("Training", training_score),
                                ("Testing", testing_score)):
    print(f"{split_name} Score: {split_score}")

In [None]:
# Print the fitted coefficient array of the linear model
print(model.coef_)

In [None]:
# Keep a handle on the fitted coefficient array
coefs = model.coef_

# Unpack the first row into one variable per feature, by position.
# The order follows the feature column list used to build X.
(vmt, fuel, pop, dens, bike, pool, solo,
 bus, taxi, walk, wfh, laws, ev) = coefs[0]

In [None]:
# Display results: one sentence per feature, all built from the same template.
# NOTE(review): the coefficients come from a model fitted on min-max-scaled
# features, so reading "coefficient * 100" as a percentage change in GHG
# depends on the units of sdGHG; confirm the intended interpretation.
trend_rows = [
    ("Vehicle Miles Traveled", "VMT", vmt),
    ("Fuel Consumption", "fuel consumption", fuel),
    ("Population", "population", pop),
    ("Population Density", "population density", dens),
    ("Bicycles", "bicycle usage", bike),
    ("Car Pools", "car pool ridership", pool),
    ("Drive Alone", "solo drivers", solo),
    ("Public Transportation", "mass transit ridership", bus),
    ("Rideshare", "rideshare", taxi),
    ("Walking", "walkers", walk),
    ("Work From Home", "people working from home", wfh),
    ("Public Initiatives", "lawmaking", laws),
    ("EV Market Share", "EV market share", ev),
]

for trend_label, trend_subject, trend_coef in trend_rows:
    print(f"{trend_label}: if {trend_subject} continues its trend, "
          f"then annual GHG emissions changes by {trend_coef * 100:.2f}%.")

### Hyperparameter Tuning
Use GridSearchCV to tune the model's parameters.

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Parameter combinations to try for the linear regression estimator.
# The former "normalize" entry is intentionally absent: that option was
# deprecated in scikit-learn 1.0 and removed in 1.2, so listing it makes
# grid.fit() fail with an invalid-parameter error on current releases.
# The features are already min-max scaled before they reach the grid search.
param_grid = {"fit_intercept": [True, False],
              "copy_X": [True, False]}

# verbose = 3 prints the score of every fold while the search runs
grid = GridSearchCV(model, param_grid, verbose = 3)

In [None]:
# Run the grid search on the scaled training split: every combination in
# param_grid is fitted and scored
grid.fit(X_train_scaled, y_train)

In [None]:
# Report the best parameter combination found, then the score it achieved
print(grid.best_params_, grid.best_score_, sep="\n")

In [None]:
# Predict the test split with the tuned grid-search model
predictions = grid.predict(X_test_scaled)

# Show only the leading predictions as a quick check
first_ten = predictions[:10]
print(f"First 10 Predictions: {first_ten}")

In [None]:
# R squared of the tuned model on the test split.
# Score through the grid search (the same object that produced `predictions`)
# rather than the untuned `model`, so this figure is comparable with the
# MSE / MAE / RMSE cells that follow instead of repeating the earlier
# testing score.
grid.score(X_test_scaled, y_test)

In [None]:
# Dependencies and setup
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Pair the observed test targets with the tuned model's predictions;
# the MAE and RMSE cells reuse these two names
y_true, y_pred = y_test, predictions

# MSE
mean_squared_error(y_true, y_pred)

In [None]:
# MAE: mean absolute error between the test targets and the predictions
mean_absolute_error(y_true, y_pred)

In [None]:
# RMSE: square root of the mean squared error
mse_value = mean_squared_error(y_true, y_pred)
np.sqrt(mse_value)