In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [None]:
import warnings
# Install a global "ignore" filter: every warning raised after this point is
# suppressed for the rest of the session, including deprecation warnings
# emitted by the libraries imported below.
warnings.simplefilter('ignore')

# %matplotlib inline
# (magic left disabled; NOTE(review): re-enable it if figures do not render inline)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the merged dataset (first CSV column becomes the index) and clean it
# in a single chain:
#   1. discard columns in which every value is null
#   2. discard any remaining row that still contains a null
df = (
    pd.read_csv("mergedData.csv", index_col=0)
    .dropna(axis='columns', how='all')
    .dropna()
)

# Preview the cleaned frame
df.head()

In [None]:
# Set features. This will also be used as our X values.
# NOTE: columns are chosen by position, so this cell depends on the column
# order of the loaded file staying the same.

# economic data - 3 years rolling average GDP, Unemployment, Consumer Confidence, Median Income
selected_features1 = df.iloc[:, 20:24]

# political data - current year presidential approval/disapproval rating
selected_features2 = df.iloc[:, 49:51]

# Combine economic and political features with one positional selection.
# Merging the two slices on the index would pair every left row with every
# right row that shares an index label, so a non-unique index would multiply
# rows and leave the feature matrix with more rows than the target column.
# Selecting straight from df keeps exactly one output row per input row.
# A list indexer also raises IndexError if an expected column position is
# missing, instead of silently returning fewer features.
selected_features = df.iloc[:, [*range(20, 24), *range(49, 51)]]

selected_features.head()

In [None]:
from sklearn.model_selection import train_test_split

# Convert the feature frame and the target column to NumPy arrays.
# Selecting the target with a one-element column list yields a 2-D
# (n_rows, 1) array directly, which is the column-vector shape the
# scaler in the next step is fitted on.
X = selected_features.to_numpy()
y = df[["Box_Office_Adjusted"]].to_numpy()

# Hold out a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Standardize features and target
from sklearn.preprocessing import StandardScaler

# Features: learn the scaling parameters from the training split only,
# then apply the same transformation to both splits.
X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Target: same procedure, again fitted on the training split only.
y_scaler = StandardScaler()
y_scaler.fit(y_train)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the standardized training data
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

# Predict once per split and reuse the results for both plot axes
train_predictions = model.predict(X_train_scaled)
test_predictions = model.predict(X_test_scaled)

# Residual plot of training and test data:
# x = predicted value, y = residual (predicted - actual), in scaled units
plt.scatter(train_predictions, train_predictions - y_train_scaled, c="blue", label="Training Data")
plt.scatter(test_predictions, test_predictions - y_test_scaled, c="orange", label="Testing Data")
plt.legend()

# The x-axis shows predictions, so the zero-residual reference line has to
# span the range of the predictions (not the range of the actual targets).
plt.hlines(
    y=0,
    xmin=min(train_predictions.min(), test_predictions.min()),
    xmax=max(train_predictions.max(), test_predictions.max()),
)
plt.title("Residual Plot")
plt.xlabel("Predicted value (scaled)")
plt.ylabel("Residual: predicted - actual (scaled)")
plt.show()

In [None]:
# Compute the R^2 score of the fitted model on each split, then report both
train_r2 = model.score(X_train_scaled, y_train_scaled)
test_r2 = model.score(X_test_scaled, y_test_scaled)

print(f"Training Data Score: {train_r2}")
print(f"Testing Data Score: {test_r2}")

In [None]:
# Save the model together with the scalers it depends on
import joblib

# The model was fitted on standardized features and a standardized target,
# so it can only be applied to new raw data (and its predictions converted
# back to original units) if the two fitted scalers are available as well.
# They are written to separate files so that the model file keeps its
# original name and content.
joblib.dump(X_scaler, 'box_office_X_scaler_final_project.sav')
joblib.dump(y_scaler, 'box_office_y_scaler_final_project.sav')

# Model file (kept last so the cell output still shows the model file path)
filename = 'box_office_model_final_project.sav'
joblib.dump(model, filename)