In [74]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the movie dataset; the relative path is resolved against the kernel's working directory.
movies = pd.read_csv('movies_data.csv')

In [75]:
def convert_to_list(x):
    """Turn a comma-separated string into a list of trimmed labels.

    Parameters
    ----------
    x : object
        Usually a string such as ``"Action, Crime"``. A list or tuple is
        returned as a list, so re-running the cell that applies this function
        does not wipe rows that were already converted.

    Returns
    -------
    list of str
        The non-empty, whitespace-stripped items; ``['not listed']`` when ``x``
        is missing, is not a string/list, or contains no usable items.
    """
    if isinstance(x, (list, tuple)):
        # Already converted (e.g. the cell was executed twice): keep the labels.
        return list(x) if len(x) else ['not listed']
    if isinstance(x, str):
        # Skip blank pieces produced by stray commas ("A,,B" or "A, B,").
        items = [item.strip() for item in x.split(',') if item.strip()]
        if items:
            return items
    return ['not listed']

# Turn the comma-separated genre and production-company strings into lists of labels
for list_col in ('genres', 'production_companies'):
    movies[list_col] = movies[list_col].apply(convert_to_list)
movies.head()

Unnamed: 0,id,title,release_date,revenue,budget,production_companies,genres,popularity,vote_average,vote_count,overview,reviews,sentiment
0,5,Four Rooms,1995-12-09,4257354,4000000,"[Miramax, A Band Apart]",[Comedy],13.275,5.8,2618,It's Ted the Bellhop's first night on the job....,No reviews available,0.0
1,6,Judgment Night,1993-10-15,12136938,21000000,"[Largo Entertainment, JVC, Universal Pictures]","[Action, Crime, Thriller]",9.284,6.5,331,"Four young friends, while taking a shortcut en...",No reviews available,0.0
2,11,Star Wars,1977-05-25,775398007,11000000,"[Lucasfilm Ltd., 20th Century Fox]","[Adventure, Action, Science Fiction]",81.543,8.2,20457,Princess Leia is captured and held hostage by ...,"(As I'm writing this review, Darth Vader's the...",0.9998
3,12,Finding Nemo,2003-05-30,940335536,94000000,[Pixar],"[Animation, Family]",85.328,7.819,19102,"Nemo, an adventurous young clownfish, is unexp...",One of the best animated films I have ever see...,0.9976
4,13,Forrest Gump,1994-06-23,677387716,55000000,"[Paramount Pictures, The Steve Tisch Company, ...","[Comedy, Drama, Romance]",106.538,8.472,27251,A man with a low IQ has accomplished great thi...,Best movie ever. This is the best movie ever! ...,0.9963


In [76]:
# Parse the release date once, then derive the calendar features from the parsed values
release_dt = pd.to_datetime(movies['release_date'])
movies = movies.assign(
    release_date=release_dt,
    release_year=release_dt.dt.year,
    release_month=release_dt.dt.month,
    release_week=release_dt.dt.isocalendar().week,  # ISO week number of the year
)

movies.head()

Unnamed: 0,id,title,release_date,revenue,budget,production_companies,genres,popularity,vote_average,vote_count,overview,reviews,sentiment,release_year,release_month,release_week
0,5,Four Rooms,1995-12-09,4257354,4000000,"[Miramax, A Band Apart]",[Comedy],13.275,5.8,2618,It's Ted the Bellhop's first night on the job....,No reviews available,0.0,1995,12,49
1,6,Judgment Night,1993-10-15,12136938,21000000,"[Largo Entertainment, JVC, Universal Pictures]","[Action, Crime, Thriller]",9.284,6.5,331,"Four young friends, while taking a shortcut en...",No reviews available,0.0,1993,10,41
2,11,Star Wars,1977-05-25,775398007,11000000,"[Lucasfilm Ltd., 20th Century Fox]","[Adventure, Action, Science Fiction]",81.543,8.2,20457,Princess Leia is captured and held hostage by ...,"(As I'm writing this review, Darth Vader's the...",0.9998,1977,5,21
3,12,Finding Nemo,2003-05-30,940335536,94000000,[Pixar],"[Animation, Family]",85.328,7.819,19102,"Nemo, an adventurous young clownfish, is unexp...",One of the best animated films I have ever see...,0.9976,2003,5,22
4,13,Forrest Gump,1994-06-23,677387716,55000000,"[Paramount Pictures, The Steve Tisch Company, ...","[Comedy, Drama, Romance]",106.538,8.472,27251,A man with a low IQ has accomplished great thi...,Best movie ever. This is the best movie ever! ...,0.9963,1994,6,25


In [77]:
# Calculate average revenue by production company. `explode` gives one row per
# (movie, company) pair, so a movie's revenue counts towards each of its companies.
# NOTE(review): the averages use the `revenue` of every row, including rows that are
# later held out as the test split, so the derived tiers leak target information into
# the evaluation. Consider computing the ranking from the training rows only.
avg_revenue_by_company = (
    movies.explode('production_companies')
    .groupby('production_companies')['revenue']
    .mean()
)

# Sort companies by their average revenue (best first)
ranked_companies = avg_revenue_by_company.sort_values(ascending=False)

# Tier boundaries, expressed as positions in the ranking
TOP_TIER_SIZE = 100   # ranks 0-99 are "Top"
MID_TIER_END = 1000   # ranks 100-999 are "Mid-level"

top_companies = ranked_companies.head(TOP_TIER_SIZE)
mid_companies = ranked_companies.iloc[TOP_TIER_SIZE:MID_TIER_END]
# Everything after the mid tier. Slicing from MID_TIER_END avoids hard-coding the number
# of companies and leaves no gap between the mid and low tiers.
low_companies = ranked_companies.iloc[MID_TIER_END:]

# Map a single production company to its revenue tier
def categorize_company(company):
    """Return the revenue tier label for one production company.

    The name is looked up in the module-level ``top_companies`` ranking first and
    in ``mid_companies`` second; a company found in neither is ``'Low-level'``.
    """
    if company in top_companies.index:
        return 'Top'
    if company in mid_companies.index:
        return 'Mid-level'
    return 'Low-level'

# Function to categorize each movie based on the highest-ranking production company
def categorize_production_level(companies):
    """Return the best revenue tier among a movie's production companies.

    Parameters
    ----------
    companies : iterable of str
        Company names credited on one movie.

    Returns
    -------
    str
        ``'Top'`` if any company is top-tier, otherwise ``'Mid-level'`` if any
        company is mid-tier, otherwise ``'Low-level'``. An empty iterable also
        yields ``'Low-level'`` instead of raising ``ValueError``.
    """
    categories = {categorize_company(company) for company in companies}
    # Check the tiers from best to worst; the first one present wins.
    for level in ('Top', 'Mid-level'):
        if level in categories:
            return level
    return 'Low-level'

# Label every movie with the best tier among its production companies, then discard
# the raw company list and release date columns
production_levels = movies['production_companies'].apply(categorize_production_level)
movies = (
    movies
    .assign(production_level=production_levels)
    .drop(['production_companies', 'release_date'], axis=1)
)
movies.head()

Unnamed: 0,id,title,revenue,budget,genres,popularity,vote_average,vote_count,overview,reviews,sentiment,release_year,release_month,release_week,production_level
0,5,Four Rooms,4257354,4000000,[Comedy],13.275,5.8,2618,It's Ted the Bellhop's first night on the job....,No reviews available,0.0,1995,12,49,Mid-level
1,6,Judgment Night,12136938,21000000,"[Action, Crime, Thriller]",9.284,6.5,331,"Four young friends, while taking a shortcut en...",No reviews available,0.0,1993,10,41,Mid-level
2,11,Star Wars,775398007,11000000,"[Adventure, Action, Science Fiction]",81.543,8.2,20457,Princess Leia is captured and held hostage by ...,"(As I'm writing this review, Darth Vader's the...",0.9998,1977,5,21,Top
3,12,Finding Nemo,940335536,94000000,"[Animation, Family]",85.328,7.819,19102,"Nemo, an adventurous young clownfish, is unexp...",One of the best animated films I have ever see...,0.9976,2003,5,22,Top
4,13,Forrest Gump,677387716,55000000,"[Comedy, Drama, Romance]",106.538,8.472,27251,A man with a low IQ has accomplished great thi...,Best movie ever. This is the best movie ever! ...,0.9963,1994,6,25,Top


### Fitting a Random Forest Regression Pipeline

In [78]:
# Step 1: Fit the MultiLabelBinarizer on the genres data (on the entire dataset), which
# yields one 0/1 indicator column per distinct genre label
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(movies['genres'])

# Create a DataFrame from the encoded genres, aligned to the rows of `movies`
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=movies.index)

# Concatenate the genre DataFrame with the original dataset. Genre columns left over
# from an earlier run of this cell are dropped first, so re-running the cell does not
# produce duplicate column names.
movies = pd.concat(
    [movies.drop(columns=list(mlb.classes_), errors='ignore'), genres_df],
    axis=1,
)

In [81]:
movie_titles = movies['title']
X = movies.drop(columns=['revenue', 'title', 'id', 'overview', 'reviews', 'genres'])  # Features
y = movies['revenue']  # Target

# Define the categorical and numerical columns
categorical_cols = ['production_level']  # Only 'production_level' as a categorical variable
# The genre indicator columns produced by the MultiLabelBinarizer are treated as numeric
numerical_cols = ['budget', 'popularity', 'vote_average', 'vote_count', 'sentiment','release_year', 'release_month', 'release_week'] + list(mlb.classes_)

# Define the preprocessing steps. Columns of X that appear in neither list are dropped
# by ColumnTransformer (its default is remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Scale numeric features
        # handle_unknown='ignore': a category that is absent from the training split is
        # encoded as all zeros at predict time instead of raising an error
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ])

# Define the model pipeline: preprocessing followed by a random forest regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline to the training data
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error (in squared revenue units, hence the very large magnitude)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error (Random Forest): {mse}')

Mean Squared Error (Random Forest): 5009883888307241.0
