In [None]:
# Dependencies and setup
import numpy as np
import os
import pandas as pd
import warnings
import seaborn as sn
import matplotlib.pyplot as plt
warnings.simplefilter("ignore")

In [None]:
# Load the annual all-state dataset and discard rows containing nulls in one step
annualData = pd.read_csv("resources/annualAllStateData.csv").dropna()

# Show the cleaned table
annualData

In [None]:
# Keep only rows whose State is not the 'United States' totals entry
annualData = annualData[annualData['State'].ne('United States')]

# Feature Selection

In [None]:
# Predictors: every column except the target and the State / Year identifiers
X = features = annualData.drop(['Transportation (MtCO2e)', 'State', 'Year'], axis=1)
X.head()

In [None]:
# Develop correlation matrix (computed once; the earlier bare X.corr() call
# discarded its result and only repeated the work)
corrMatrix = X.corr()

# Show result of correlation matrix
plt.figure(figsize=(25,25))

sn.heatmap(corrMatrix, annot=True)

# Save into the lowercase "resources/" directory that the CSV reads and writes
# in this notebook use; the "Resources/" spelling fails on case-sensitive
# filesystems.
plt.savefig("resources/mnCorr.png")

plt.show()

In [None]:
# Narrow the predictors given the correlation above: gasoline use and vehicle
# counts are excluded in addition to the target and identifier columns
X = features = annualData.drop(
    ['Transportation (MtCO2e)', 'State', 'Highway use of gasoline (thousand gallons)', 'Vehicles', 'Year'],
    axis=1,
)
X.head()

In [None]:
# Re-develop correlation matrix with new features (computed once; the earlier
# bare X.corr() call discarded its result and only repeated the work)
corrMatrix = X.corr()

# Show result of correlation matrix
plt.figure(figsize=(25,25))

sn.heatmap(corrMatrix, annot=True)

# Save into the lowercase "resources/" directory that the CSV reads and writes
# in this notebook use; the "Resources/" spelling fails on case-sensitive
# filesystems.
# NOTE(review): mnCorr.png is also the filename written by the first heatmap
# cell, so this save overwrites that image — confirm that is intended.
plt.savefig("resources/mnCorr.png")

plt.show()

# Multiple Linear Regression Model

### Create a Train-Test Split
Use Transportation (MtCO2e) for the y values.

In [None]:
# Target: the transportation emissions column reshaped to a single-column 2-D array
y = annualData['Transportation (MtCO2e)'].to_numpy().reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
# Print the full target array
print(y)

In [None]:
# Splitting utility
from sklearn.model_selection import train_test_split

# Partition predictors and target into training and testing sets; the fixed
# random_state makes the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
X_train.head()

### Pre-Processing
Scale the features with StandardScaler, fitting the scaler on the training split only and applying the same transformation to the test split.

In [None]:
# Scaler class
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training predictors only, then apply the same
# transformation to both the training and testing predictors
X_scale = StandardScaler()
X_scale.fit(X_train)
X_train_scaled, X_test_scaled = (X_scale.transform(part) for part in (X_train, X_test))

In [None]:
# Inspect the scaled training predictors
X_train_scaled

### Train the Multiple Linear Regression Model

In [None]:
# Estimator class
from sklearn.linear_model import LinearRegression

# Create the (not yet fitted) linear regression estimator
model = LinearRegression()

In [None]:
# Train on the scaled training split
model.fit(X_train_scaled, y_train)

# Score the fitted model on each split and report both values
training_score, testing_score = (
    model.score(scaled_inputs, targets)
    for scaled_inputs, targets in ((X_train_scaled, y_train), (X_test_scaled, y_test))
)
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

### Coefficients (Independent Values)

In [None]:
# Print the fitted coefficient array
print(model.coef_)

In [None]:
# First coefficient shown at full precision (the printed array above is rounded)
model.coef_[0, 0]

In [None]:
# Print the fitted intercept
print(model.intercept_)

In [None]:
# Render the fitted equation: intercept first, then the six coefficients in
# column order, each formatted with the '.5' precision spec
print(
    "The linear equation is: Y = {:.5} + {:.5}*GDP + {:.5}*Population + {:.5}*Transit + {:.5}*VMT + {:.5}*SQMI + {:.5}*Temp".format(
        model.intercept_[0],
        *model.coef_[0][:6],
    )
)

In [None]:
# Define function to predict GHG values
def make_prediction(GDP, population, transit, VMT, SQMI, temperature, coefs, intercept):
    """Evaluate the six-term linear equation for one observation.

    Each of the six inputs is multiplied by the coefficient at the same
    position in ``coefs[0]`` (GDP, population, transit, VMT, SQMI,
    temperature, in that order), the products are added left to right, and
    ``intercept[0]`` is added last.

    Parameters:
        GDP, population, transit, VMT, SQMI, temperature: feature values.
        coefs: nested sequence; only ``coefs[0][0]`` .. ``coefs[0][5]`` are read.
        intercept: sequence; only ``intercept[0]`` is read.

    Returns:
        The weighted sum of the inputs plus the intercept.
    """
    weights = coefs[0]
    inputs = (GDP, population, transit, VMT, SQMI, temperature)

    # Accumulate in positional order so the additions happen in the same
    # sequence as a single left-to-right expression would
    GHG = weights[0] * inputs[0]
    for position in range(1, 6):
        GHG = GHG + weights[position] * inputs[position]

    return GHG + intercept[0]

In [None]:
# Run the hand-written equation on one scaled test observation
row = 0
(
    GDP_param,
    population_param,
    transit_param,
    VMT_param,
    SQMI_param,
    temperature_param,
) = X_test_scaled[row][:6]

make_prediction(
    GDP_param,
    population_param,
    transit_param,
    VMT_param,
    SQMI_param,
    temperature_param,
    model.coef_,
    model.intercept_,
)

### Model Accuracy Tests

In [None]:
# R squared on the scaled test split (sample_weight left at its default of None)
model.score(X_test_scaled, y_test)

In [None]:
# Predict for every scaled test observation and preview the first ten results
predictions = model.predict(X_test_scaled)
predictions[0:10]

In [None]:
# First ten actual test targets, for comparison with the predictions
y_test[0:10]

In [None]:
# Error metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Mean Square Error between the test targets and the model's predictions
y_true, y_pred = y_test, predictions
mean_squared_error(y_true, y_pred)

In [None]:
# Mean Absolute Error between the test targets and the model's predictions
mean_absolute_error(y_true, y_pred)

In [None]:
# Root Mean Square Error: square root of the Mean Square Error
np.sqrt(mean_squared_error(y_true, y_pred))

# Forecasting Selected Features

In [None]:
# Reload the full dataset from disk (no rows dropped this time)
annualData = pd.read_csv("resources/annualAllStateData.csv")

# Show the table
annualData

### GDP

In [None]:
# Reshape GDP to wide form: one row per Year, one column per State
GDP = pd.pivot(
    annualData,
    index="Year",
    columns="State",
    values="State GDP (Million US$ (chained 1997/2005))",
)

# Preview the result
GDP.head()

In [None]:
# Expose the Year index as a regular column as well
GDP = GDP.assign(Year=GDP.index)

# Preview the result
GDP.head()

In [None]:
# Independent copy holding only the Year and United States columns
gdpData = GDP.loc[:, ["Year", "United States"]].copy()

# Preview the result
gdpData.head()

In [None]:
# Replace the Year index with a default integer index, discarding the old index
gdpData = gdpData.reset_index(drop=True)

# Preview the result
gdpData.head()

In [None]:
# Remove the label left on the columns axis by the pivot
gdpData.columns = gdpData.columns.rename(None)

# Preview the result
gdpData.head()

In [None]:
# Give the United States column its feature name
gdpData = gdpData.rename({"United States": "GDP"}, axis="columns")

# Show the table
gdpData

### Population

In [None]:
# Reshape population to wide form: one row per Year, one column per State
pop = pd.pivot(
    annualData,
    index="Year",
    columns="State",
    values="Population (People)",
)

# Preview the result
pop.head()

In [None]:
# Expose the Year index as a regular column as well
pop = pop.assign(Year=pop.index)

# Preview the result
pop.head()

In [None]:
# Independent copy holding only the Year and United States columns
popData = pop.loc[:, ["Year", "United States"]].copy()

# Preview the result
popData.head()

In [None]:
# Replace the Year index with a default integer index, discarding the old index
popData = popData.reset_index(drop=True)

# Preview the result
popData.head()

In [None]:
# Remove the label left on the columns axis by the pivot
popData.columns = popData.columns.rename(None)

# Preview the result
popData.head()

In [None]:
# Give the United States column its feature name
popData = popData.rename({"United States": "Population"}, axis="columns")

# Show the table
popData

### Mass Transit Ridership

In [None]:
# Reshape transit ridership to wide form: one row per Year, one column per State
bus = pd.pivot(
    annualData,
    index="Year",
    columns="State",
    values="Transit Ridership",
)

# Preview the result
bus.head()

In [None]:
# Expose the Year index as a regular column as well
bus = bus.assign(Year=bus.index)

# Preview the result
bus.head()

In [None]:
# Independent copy holding only the Year and United States columns
busData = bus.loc[:, ["Year", "United States"]].copy()

# Preview the result
busData.head()

In [None]:
# Replace the Year index with a default integer index, discarding the old index
busData = busData.reset_index(drop=True)

# Preview the result
busData.head()

In [None]:
# Remove the label left on the columns axis by the pivot
busData.columns = busData.columns.rename(None)

# Preview the result
busData.head()

In [None]:
# Give the United States column its feature name
busData = busData.rename({"United States": "transitRiders"}, axis="columns")

# Show the table
busData

### Vehicle Miles Traveled

In [None]:
# Reshape vehicle-miles traveled to wide form: one row per Year, one column per State
vmt = pd.pivot(
    annualData,
    index="Year",
    columns="State",
    values="Highway vehicle-miles traveled (millions)",
)

# Preview the result
vmt.head()

In [None]:
# Expose the Year index as a regular column as well
vmt = vmt.assign(Year=vmt.index)

# Preview the result
vmt.head()

In [None]:
# Independent copy holding only the Year and United States columns
vmtData = vmt.loc[:, ["Year", "United States"]].copy()

# Preview the result
vmtData.head()

In [None]:
# Replace the Year index with a default integer index, discarding the old index
vmtData = vmtData.reset_index(drop=True)

# Preview the result
vmtData.head()

In [None]:
# Remove the label left on the columns axis by the pivot
vmtData.columns = vmtData.columns.rename(None)

# Preview the result
vmtData.head()

In [None]:
# Give the United States column its feature name
vmtData = vmtData.rename({"United States": "VMT"}, axis="columns")

# Show the table
vmtData

### Landmass Area

In [None]:
# Reshape the "sq mi" column to wide form: one row per Year, one column per State
sqmi = pd.pivot(
    annualData,
    index="Year",
    columns="State",
    values="sq mi",
)

# Preview the result
sqmi.head()

In [None]:
# Expose the Year index as a regular column as well
sqmi = sqmi.assign(Year=sqmi.index)

# Preview the result
sqmi.head()

In [None]:
# Independent copy holding only the Year and United States columns
sqmiData = sqmi.loc[:, ["Year", "United States"]].copy()

# Preview the result
sqmiData.head()

In [None]:
# Replace the Year index with a default integer index, discarding the old index
sqmiData = sqmiData.reset_index(drop=True)

# Preview the result
sqmiData.head()

In [None]:
# Remove the label left on the columns axis by the pivot
sqmiData.columns = sqmiData.columns.rename(None)

# Preview the result
sqmiData.head()

In [None]:
# Give the United States column its feature name
sqmiData = sqmiData.rename({"United States": "SQMI"}, axis="columns")

# Show the table
sqmiData

### Temperatures

In [None]:
# Reshape the "tempHigh" column to wide form: one row per Year, one column per State
temps = pd.pivot(
    annualData,
    index="Year",
    columns="State",
    values="tempHigh",
)

# Preview the result
temps.head()

In [None]:
# Expose the Year index as a regular column as well
temps = temps.assign(Year=temps.index)

# Preview the result
temps.head()

In [None]:
# Independent copy holding only the Year and United States columns
tempsData = temps.loc[:, ["Year", "United States"]].copy()

# Preview the result
tempsData.head()

In [None]:
# Replace the Year index with a default integer index, discarding the old index
tempsData = tempsData.reset_index(drop=True)

# Preview the result
tempsData.head()

In [None]:
# Remove the label left on the columns axis by the pivot
tempsData.columns = tempsData.columns.rename(None)

# Preview the result
tempsData.head()

In [None]:
# Give the United States column its feature name
tempsData = tempsData.rename({"United States": "Temps"}, axis="columns")

# Show the table
tempsData

### GHG Emissions

In [None]:
# Reshape transportation emissions to wide form: one row per Year, one column per State
ghg = pd.pivot(
    annualData,
    index="Year",
    columns="State",
    values="Transportation (MtCO2e)",
)

# Preview the result
ghg.head()

In [None]:
# Expose the Year index as a regular column as well
ghg = ghg.assign(Year=ghg.index)

# Preview the result
ghg.head()

In [None]:
# Independent copy holding only the Year and United States columns
ghgData = ghg.loc[:, ["Year", "United States"]].copy()

# Preview the result
ghgData.head()

In [None]:
# Replace the Year index with a default integer index, discarding the old index
ghgData = ghgData.reset_index(drop=True)

# Preview the result
ghgData.head()

In [None]:
# Remove the label left on the columns axis by the pivot
ghgData.columns = ghgData.columns.rename(None)

# Preview the result
ghgData.head()

In [None]:
# Give the United States column its series name
ghgData = ghgData.rename({"United States": "Emissions"}, axis="columns")

# Show the table
ghgData

### Merge all DataFrames

In [None]:
# Start the combined table: GDP left-joined with population on Year
trendData = gdpData.merge(popData, how="left", on="Year")

# Preview the result
trendData.head()

In [None]:
# Left-join transit ridership onto the combined table by Year
trendData = trendData.merge(busData, how="left", on="Year")

# Preview the result
trendData.head()

In [None]:
# Left-join vehicle-miles traveled onto the combined table by Year
trendData = trendData.merge(vmtData, how="left", on="Year")

# Preview the result
trendData.head()

In [None]:
# Left-join the SQMI column onto the combined table by Year
trendData = trendData.merge(sqmiData, how="left", on="Year")

# Preview the result
trendData.head()

In [None]:
# Left-join the Temps column onto the combined table by Year
trendData = trendData.merge(tempsData, how="left", on="Year")

# Preview the result
trendData.head()

In [None]:
# Left-join the Emissions column onto the combined table by Year
trendData = trendData.merge(ghgData, how="left", on="Year")

# Show the table
trendData

In [None]:
# Relabel Emissions as actualGHG
trendData = trendData.rename({"Emissions": "actualGHG"}, axis="columns")

# Show the table
trendData

In [None]:
# Write the combined table to disk without the index (the header row is written by default)
trendData.to_csv("resources/trendData.csv", index=False)

# Scaling Forecasted Selected Features (X Values)

In [None]:
# Load the table that includes forecasted values
scaledData = pd.read_csv("resources/trendDataForecasts.csv")

# Show the table
scaledData

In [None]:
# Predictors for forecasting: everything except the actualGHG column
X = scaledData.drop("actualGHG", axis=1)

# Preview the result
X.head()

In [None]:
# Move Year out of the columns and into the index
X = X.set_index("Year")

# Preview the result
X.head()

In [None]:
# Dependencies and setup
from sklearn.preprocessing import StandardScaler

# Scale actual and forecasted values with the scaler already fitted on the
# training split (X_scale); no new scaler is fitted here.
# The underlying array is passed instead of the DataFrame: the scaler was fitted
# on a frame carrying the raw dataset headers, while this frame is expected to
# use short labels (GDP, Population, ...). Recent scikit-learn releases raise a
# ValueError when feature names differ between fit and transform.
# NOTE(review): columns must be in the same order as the training features —
# confirm against resources/trendDataForecasts.csv.
X_scaled = X_scale.transform(X.to_numpy())

In [None]:
# Inspect the scaled predictor array
X_scaled

In [None]:
# Wrap the scaled array in a labeled DataFrame
scaledValues = pd.DataFrame(
    X_scaled,
    columns=["GDP", "Population", "Transit", "VMT", "SQMI", "Temps"],
)

# Preview the result
scaledValues.head()

In [None]:
# Create column of predicted GHG from the fitted regression model.
# The intercept and coefficients were previously hand-copied literals, which
# silently go stale whenever the data or the train/test split changes; asking
# `model` directly keeps this column consistent with the fit above.
# The six scaled columns are selected by name, in the order the equation
# applies them, and passed as a plain array because the model was fitted on one.
featureColumns = ["GDP", "Population", "Transit", "VMT", "SQMI", "Temps"]
modelGHG = pd.Series(
    model.predict(scaledValues[featureColumns].to_numpy()).ravel(),  # (n, 1) -> (n,)
    index=scaledValues.index,
)

# Add new columns
scaledValues["actualGHG"] = scaledData["actualGHG"]
scaledValues["Year"] = scaledData["Year"]
scaledValues["modelGHG"] = modelGHG

# Display DataFrame
scaledValues.head()

In [None]:
# Put Year first and the two GHG columns last
scaledValues = scaledValues.loc[
    :,
    ["Year", "GDP", "Population", "Transit", "VMT", "SQMI", "Temps", "actualGHG", "modelGHG"],
]

# Show the table
scaledValues

In [None]:
# Independent copy holding only Year and the model's predicted GHG
scaledFinal = scaledValues.loc[:, ["Year", "modelGHG"]].copy()

# Preview the result
scaledFinal.head()

In [None]:
# Independent copy of the unscaled columns from the combined trend table
originalDF = trendData.loc[
    :,
    ["Year", "GDP", "Population", "transitRiders", "VMT", "SQMI", "Temps", "actualGHG"],
].copy()

# Preview the result
originalDF.head()

In [None]:
# Left-join the unscaled columns onto the predictions by Year
scaledFinal = scaledFinal.merge(originalDF, how="left", on="Year")

# Preview the result
scaledFinal.head()

In [None]:
# Put Year first and the two GHG columns last
scaledFinal = scaledFinal.loc[
    :,
    ["Year", "GDP", "Population", "transitRiders", "VMT", "SQMI", "Temps", "actualGHG", "modelGHG"],
]

# Show the table
scaledFinal

In [None]:
# Relabel transitRiders as Transit
scaledFinal = scaledFinal.rename({"transitRiders": "Transit"}, axis="columns")

# Show the table
scaledFinal

In [None]:
# Write the final table to disk without the index (the header row is written by default)
scaledFinal.to_csv("resources/scaledFinal.csv", index=False)