In [1]:
#standard libraries
import pandas as pd
import numpy as np

#need this to get data from my googlesheet
from io import StringIO
import matplotlib.pyplot as plt
import requests

#the sklearn library is large, so we don't import all of it
#notice how each import names the specific class or function we need
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Construct the export URL (Google Sheets CSV export of a single tab)
url = 'https://docs.google.com/spreadsheets/d/1p0nIAMW_-cVO4_5AQJ0HQuKd8JAYD-bUnauusrm95Bk/export?format=csv&gid=1808035861'

# Fetch the CSV data.
# A timeout keeps the cell from hanging forever on a dead connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.content

# Read the CSV data into a DataFrame
df = pd.read_csv(StringIO(data.decode('utf-8')))


# It's a little easier for a step later on if the column names contain no spaces
df = df.rename(columns={
    'film': 'Film',
    'Opening Weekend': 'Opening_Weekend_Gross',
    'Audience Score': 'Audience_Score',
    'Critic Score': 'Critic_Score',
    'Budget': 'Budget',
    'year': 'Year',
    'category': 'Category',
    'Second Weekend': 'Second_Weekend_Gross',
    'order': 'Order'
})

# Keep only the columns used below, in a fixed order.
# .copy() gives an independent frame so the column assignments that follow
# do not trigger pandas' SettingWithCopyWarning.
df = df[['Film', 'Opening_Weekend_Gross', 'Audience_Score', 'Critic_Score', 'Budget', 'Year', 'Category', 'Second_Weekend_Gross', 'Order']].copy()

# This didn't matter in Excel, but the audience and critic scores are scaled up
# by 100 so that one unit is one point
# (presumably the sheet stores them as fractions -- confirm against the source data)
df['Audience_Score'] = df['Audience_Score'] * 100
df['Critic_Score'] = df['Critic_Score'] * 100

# Display dataframe (must be the last expression in the cell to be rendered)
df.head()


In [2]:
# Separate the model inputs (X) from the quantity being predicted (y).
# 'Film' is left out of the inputs; 'Second_Weekend_Gross' is the target.
target_variable = df.loc[:, 'Second_Weekend_Gross']
features = df.loc[:, [
    'Opening_Weekend_Gross',
    'Audience_Score',
    'Critic_Score',
    'Budget',
    'Year',
    'Category',
    'Order',
]]


In [3]:
# Define numeric and categorical columns
numeric_features = ['Opening_Weekend_Gross', 'Audience_Score', 'Critic_Score', 'Budget', 'Year', 'Order']
categorical_features = ['Category']

# A little bit of processing is needed here. It's not really cleaning:
# it's just getting the data into the shape sklearn wants.
# sklearn provides a convenient way to one-hot encode the categorical column.
#
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current
# spelling first and fall back for older installations.
try:
    category_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    category_encoder = OneHotEncoder(drop='first', sparse=False)

preprocessor = ColumnTransformer(
    transformers=[
        # numeric columns go to the regressor unchanged
        ('num', 'passthrough', numeric_features),
        # drop='first' omits one indicator column per categorical feature
        ('cat', category_encoder, categorical_features)
    ])

# Create the pipeline, which is just a handy tool for the two things done in sklearn here
# (preprocess, then regress). With more steps, this becomes even handier.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split the data into training and testing sets (80% / 20%); the fixed
# random_state makes the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(features, target_variable, test_size=0.2, random_state=42)

In [43]:
# Peek at the training split: its (rows, columns) shape first, then the leading rows
print(X_train.shape)
X_train.head()

(24, 7)


Unnamed: 0,Opening_Weekend_Gross,Audience_Score,Critic_Score,Budget,Year,Category,Order
28,187.0,77.0,74.0,200.0,2022,Dr Strange,28
24,71.0,73.0,47.0,200.0,2021,Unique,26
12,179.0,89.0,90.0,250.0,2016,Captain America,13
0,55.0,69.0,67.0,137.5,2008,Unique,2
4,65.0,76.0,77.0,150.0,2011,Thor,4


In [41]:
# Let's see the test set: its (rows, columns) shape, then the first few rows
print(X_test.shape)
X_test.head()

(6, 7)


Unnamed: 0,Opening_Weekend_Gross,Audience_Score,Critic_Score,Budget,Year,Category,Order
27,181.0,94.0,84.0,250.0,2022,Black Panther,30
15,117.0,87.0,92.0,175.0,2017,Spider-Man,16
23,80.3,80.0,79.0,200.0,2021,Unique,24
17,75.8,80.0,87.0,130.0,2018,Ant-Man,20
8,95.0,92.0,90.0,170.0,2014,Captain America,9


In [47]:
# Peek at the held-out target values: shape first, then the leading entries
print(y_test.shape)
y_test.head()

(6,)


27    66.0
15    44.0
23    25.8
17    29.0
8     41.0
Name: Second_Weekend_Gross, dtype: float64

In [21]:
# Train the whole pipeline (encoding + regression) on the training split
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions: MSE, its square root (RMSE), and R-squared
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Each tuple is printed as one space-separated line
report_lines = (
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error (RMSE):", rmse, " Million"),
    ("R-squared Score:", r2),
)
for parts in report_lines:
    print(*parts)

Mean Squared Error: 163.38788903130396
Root Mean Squared Error (RMSE): 12.782327214998995  Million
R-squared Score: 0.029040803790273295


In [5]:
# Pull the two fitted steps out of the pipeline once
fitted_preprocessor = model.named_steps['preprocessor']
fitted_regressor = model.named_steps['regressor']

# Column names as the regressor sees them: the passthrough numeric columns
# first, followed by the one-hot columns produced by the 'cat' transformer
cat_feature_names = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
feature_names = numeric_features + cat_feature_names.tolist()

# Fitted linear-regression weights, one per feature name
coefficients = fitted_regressor.coef_

# Pair each feature with its weight, ordered by absolute magnitude (largest first)
coef_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .sort_values('Coefficient', key=abs, ascending=False)
)

print("Feature Coefficients:")
print(coef_df)

print("Intercept:", fitted_regressor.intercept_)


Feature Coefficients:
                     Feature  Coefficient
7     Category_Black Panther    36.609109
6          Category_Avengers    30.301742
11         Category_Iron Man    13.715283
10        Category_Guardians    13.427327
14           Category_Unique    10.335066
13             Category_Thor     9.834342
8   Category_Captain America     5.857483
9        Category_Dr Strange     5.656397
12       Category_Spider-Man     3.508890
4                       Year     1.927579
5                      Order    -0.887092
2               Critic_Score     0.434385
0      Opening_Weekend_Gross     0.320764
1             Audience_Score    -0.101365
3                     Budget    -0.040503
Intercept: -3889.883211954178


In [6]:
# Function to make predictions
def predict_second_weekend(opening_weekend, audience_score, critic_score, budget, year, order, category):
    """Predict the second-weekend value for a single film.

    Builds a one-row DataFrame whose columns match the training feature
    names and passes it to the module-level ``model``.

    Parameters map to columns as follows:
        opening_weekend -> 'Opening_Weekend_Gross'
        audience_score  -> 'Audience_Score'
        critic_score    -> 'Critic_Score'
        budget          -> 'Budget'
        year            -> 'Year'
        order           -> 'Order'
        category        -> 'Category'

    Returns the first (and only) element of ``model.predict`` for that row.
    """
    single_row = {
        'Opening_Weekend_Gross': opening_weekend,
        'Audience_Score': audience_score,
        'Critic_Score': critic_score,
        'Budget': budget,
        'Year': year,
        'Order': order,
        'Category': category,
    }
    new_frame = pd.DataFrame([single_row])
    predictions = model.predict(new_frame)
    return predictions[0]


In [17]:
# Worked example: describe one hypothetical film, then ask the model about it
example_film = dict(
    opening_weekend=211,
    audience_score=95,
    critic_score=61,
    budget=200,
    year=2024,
    order=40,
    category='Unique',
)
prediction = predict_second_weekend(**example_film)
print("Prediction for Second Weekend:", prediction)


Prediction for Second Weekend: 62.83696361177317
