In [1]:
#standard libraries
import pandas as pd
import numpy as np

#need this to get data from my googlesheet
from io import StringIO
import matplotlib.pyplot as plt
import requests

#sklearn library is large
#notice how the import is specifying class and funciton
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Construct the export URL
url = 'https://docs.google.com/spreadsheets/d/1p0nIAMW_-cVO4_5AQJ0HQuKd8JAYD-bUnauusrm95Bk/export?format=csv&gid=1808035861'

# Fetch the CSV data
response = requests.get(url)
data = response.content

# Read the CSV data into a DataFrame
df = pd.read_csv(StringIO(data.decode('utf-8')))


#After we read in the csv
#its a little easier for a step later on if I remove all the spaces and such from the column names
df = df.rename(columns={
    'film': 'Film',
    'Opening Weekend': 'Opening_Weekend_Gross',
    'Audience Score': 'Audience_Score',
    'Critic Score': 'Critic_Score',
    'Budget': 'Budget',
    'year': 'Year',
    'category': 'Category',
    'Second Weekend': 'Second_Weekend_Gross', 
    'order': 'Order'
})

rdj_dict = {'Iron Man': 'RDJ',
'Iron Man 2': 'RDJ', 'The Avengers': 'RDJ', 'Iron Man 3': 'RDJ', 
'Avengers: Age of Ultron': 'RDJ', 'Captain America: Civil War': 'RDJ', 
            'Spider-Man: Homecoming': 'RDJ', 'Avengers: Infinity War': 'RDJ', 
            'Avengers: End Game': 'RDJ', 'Incredible Hulk': 'No RDJ', 'Captain America': 'No RDJ', 
            'Thor': 'No RDJ', 'Thor: Dark World': 'No RDJ', 
            'Captain America: Winter Soldier': 'No RDJ',
            'Guardians of the Galaxy': 'No RDJ', 
            'Ant-Man': 'No RDJ', 'Dr Strange': 'No RDJ', 
            'Guardians of the Galaxy 2': 'No RDJ', 'Thor: Ragnarok': 'No RDJ', 
            'Ant-Man & The Wasp': 'No RDJ', 'Black Panther': 'No RDJ', 'Captain Marvel': 'No RDJ', 
            'Spider-Man: Far from Home': 'No RDJ', 'Black Widow': 'No RDJ', 'Eternals': 'No RDJ',
            'Shang-Chi': 'No RDJ', 'Spider-Man: No Way Home': 'No RDJ', 'Black Panther 2': 'No RDJ', 
            'Dr Strange: Multiverse of Madness': 'No RDJ', 'Thor: Love & Thunder': 'No RDJ' }

# Map the RDJ column using the dictionary
df['RDJ'] = df['Film'].map(rdj_dict)
                          
#fill nulls with 0, take them out of the dataset, 
#df.fillna(0, inplace=True)

# This didn't matter in excel, but I now want the audience scores and critic scores to be 'scaled up' such that the a unit is 1 point
df['Audience_Score'] = df['Audience_Score'] * 100
df['Critic_Score'] = df['Critic_Score'] * 100

# Display dataframe
df = df[['Film', 'Opening_Weekend_Gross', 'Audience_Score', 
         'Critic_Score', 'Budget', 'Year', 'Category', 'Second_Weekend_Gross', 'Order', 'RDJ']]
df.head()

Unnamed: 0,Film,Opening_Weekend_Gross,Audience_Score,Critic_Score,Budget,Year,Category,Second_Weekend_Gross,Order,RDJ
0,Incredible Hulk,55.0,69.0,67.0,137.5,2008,Unique,22.1,2,No RDJ
1,Iron Man,102.0,91.0,94.0,186.0,2008,Iron Man,51.2,1,RDJ
2,Iron Man 2,128.0,71.0,71.0,170.0,2010,Iron Man,52.0,3,RDJ
3,Captain America,65.0,75.0,79.0,140.0,2011,Captain America,25.0,5,No RDJ
4,Thor,65.0,76.0,77.0,150.0,2011,Thor,34.0,4,No RDJ


In [2]:
# Prepare the features (X) and target variable (y)
features = df[['Opening_Weekend_Gross', 'Audience_Score', 'Critic_Score', 'Budget', 'Year', 'Category', 'Order', 'RDJ']]
target_variable = df['Second_Weekend_Gross']

In [3]:
# Define numeric and categorical columns
numeric_features = ['Opening_Weekend_Gross', 'Audience_Score', 'Critic_Score', 'Budget', 'Year', 'Order']
categorical_features = ['Category', 'RDJ']

# Create the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse=False), categorical_features)
    ])

from sklearn.ensemble import RandomForestRegressor

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target_variable, test_size=0.3, random_state=42)


In [4]:
# Fit the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

#RMSE or root mean squared
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse, " Million")
print("R-squared Score:", r2)

Mean Squared Error: 123.2636084444444
Root Mean Squared Error (RMSE): 11.102414532183726  Million
R-squared Score: 0.5492485355157412


In [5]:
# Get feature names
cat_feature_names = model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
feature_names = numeric_features + cat_feature_names.tolist()

# Get coefficients
feature_importances = model.named_steps['regressor'].feature_importances_

# Create a dataframe of features and their coefficients
feat_df = pd.DataFrame({'Feature': feature_names, 'feature_importances': feature_importances})
feat_df = feat_df.sort_values('feature_importances', key=abs, ascending=False)

print("Feature Importances:")
print(feat_df)

Feature Importances:
                     Feature  feature_importances
0      Opening_Weekend_Gross             0.789757
3                     Budget             0.096449
2               Critic_Score             0.049127
6          Category_Avengers             0.029734
5                      Order             0.008240
1             Audience_Score             0.008205
4                       Year             0.007345
12       Category_Spider-Man             0.003851
15                   RDJ_RDJ             0.002300
7     Category_Black Panther             0.001575
11         Category_Iron Man             0.001243
13             Category_Thor             0.000831
10        Category_Guardians             0.000494
9        Category_Dr Strange             0.000288
8   Category_Captain America             0.000266
14           Category_Unique             0.000194
16                   RDJ_nan             0.000101


In [6]:
# Function to make predictions
def predict_second_weekend(opening_weekend, audience_score, critic_score, budget, year, order, category, rdj):
    new_data = pd.DataFrame({
        'Opening_Weekend_Gross': [opening_weekend],
        'Audience_Score': [audience_score],
        'Critic_Score': [critic_score],
        'Budget': [budget],
        'Year': [year],
        'Order': [order], 
        'Category': [category],
        'RDJ': [rdj]
    })
    return model.predict(new_data)[0]


In [7]:
# Deadpool prediction with RDJ
example_prediction = predict_second_weekend(opening_weekend = 300, audience_score = 90 , critic_score = 85, budget= 250, 
                                            year = 2024, order = 40, category = 'Unique', rdj = 'RDJ')
print("Prediction for Second Weekend:", example_prediction)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse, " Million")
print("R-squared Score:", r2)

Prediction for Second Weekend: 100.38499999999998
Mean Squared Error: 123.2636084444444
Root Mean Squared Error (RMSE): 11.102414532183726  Million
R-squared Score: 0.5492485355157412


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score

# Make sure your true values and predicted values are already defined
y_true = y_test  # These are the actual values from the test set
y_pred = model.predict(X_test)  # These are the predicted values from your model

# Calculate the R-squared value
r_squared = r2_score(y_true, y_pred)

# Create scatter plot of actual vs predicted
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_true, y=y_pred)

# Add a diagonal line for perfect predictions (actual == predicted)
sns.lineplot(x=y_true, y=y_true, color='red', linestyle='--')

# Annotate the plot with the R-squared value
plt.text(min(y_true), max(y_pred), f'R² = {r_squared:.2f}', fontsize=12, color='blue')

# Add labels and title
plt.title('Actual vs Predicted Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')

# Show the plot
plt.show()