## Movie Data Analysis

## Goal of Analysis

Use machine learning algorithms to accurately predict how popular a movie will be, given the attributes in the TMDB 5000 Movies Dataset.

In [180]:
# Imports
import ast
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR

nltk.download('stopwords')

In [181]:
# Read the TMDB 5000 Movies CSV into a DataFrame (path is relative to the working directory)
movies_csv = 'tmdb_5000_movies.csv'
df = pd.read_csv(movies_csv)

## Data Exploration

In [182]:
# Print each column's name, non-null count and dtype to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [183]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,29045040.0,57165.484281,21.492301,82260640.0,106.875859,6.092172,690.217989
std,40722390.0,88694.614033,31.81665,162857100.0,22.611935,1.194612,1234.585891
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,790000.0,9014.5,4.66807,0.0,94.0,5.6,54.0
50%,15000000.0,14629.0,12.921594,19170000.0,103.0,6.2,235.0
75%,40000000.0,58610.5,28.313505,92917190.0,118.0,6.8,737.0
max,380000000.0,459488.0,875.581305,2787965000.0,338.0,10.0,13752.0


## Data Cleaning

In [184]:
# Work on an independent deep copy so the raw frame `df` stays untouched by the cleaning steps
df_clean = df.copy(deep=True)

In [185]:
# Remove the columns that are not used in the analysis
columns_to_drop = [
    'homepage',
    'id',
    'original_title',
    'status',
]
df_clean.drop(columns_to_drop, axis='columns', inplace=True)

In [186]:
# genres; Extract genre names
# Each cell holds a serialized list of dicts. It is parsed with ast.literal_eval
# instead of eval so that text read from the CSV can never be executed as code.
df_clean['genres'] = df_clean['genres'].apply(
    lambda raw: [genre['name'] for genre in ast.literal_eval(raw)]
)

In [187]:
# keywords; Extract keywords
# Each cell holds a serialized list of dicts. It is parsed with ast.literal_eval
# instead of eval so that text read from the CSV can never be executed as code.
df_clean['keywords'] = df_clean['keywords'].apply(
    lambda raw: [keyword['name'] for keyword in ast.literal_eval(raw)]
)

In [188]:
# production_companies; Extract production company names
# Each cell holds a serialized list of dicts. It is parsed with ast.literal_eval
# instead of eval so that text read from the CSV can never be executed as code.
df_clean['production_companies'] = df_clean['production_companies'].apply(
    lambda raw: [company['name'] for company in ast.literal_eval(raw)]
)

In [189]:
# production_countries; Extract production countries
# Each cell holds a serialized list of dicts. It is parsed with ast.literal_eval
# instead of eval so that text read from the CSV can never be executed as code.
df_clean['production_countries'] = df_clean['production_countries'].apply(
    lambda raw: [country['name'] for country in ast.literal_eval(raw)]
)

In [190]:
# release_date; parse the date strings into pandas datetime values
release_dates = df_clean['release_date']
df_clean['release_date'] = pd.to_datetime(release_dates)

In [191]:
# spoken_languages; Extract spoken languages
# Each cell holds a serialized list of dicts. It is parsed with ast.literal_eval
# instead of eval so that text read from the CSV can never be executed as code.
df_clean['spoken_languages'] = df_clean['spoken_languages'].apply(
    lambda raw: [language['name'] for language in ast.literal_eval(raw)]
)

In [192]:
# title, overview and tagline; Perform text preprocessing on text data for NLP analysis
stop_words = set(stopwords.words('english')) # words that are insignificant
punctuation_table = str.maketrans('', '', string.punctuation) # maps every punctuation character to "delete"

def preprocess_text(text):
    """Normalize one free-text value for NLP analysis.

    The value is lowercased, stripped of ASCII punctuation and of English
    stop words, and its remaining words are joined with single spaces.

    Parameters
    ----------
    text : object
        The raw cell value. Missing values (None / NaN) are accepted.

    Returns
    -------
    str
        The cleaned text, or '' when the input is missing.
    """
    # A missing value must become an empty document; str(NaN) would otherwise
    # inject the spurious token "nan" into the text.
    if pd.isna(text):
        return ''
    text = str(text).lower() # convert to string and lowercase
    text = text.translate(punctuation_table) # remove punctuation
    return ' '.join(word for word in text.split() if word not in stop_words) # remove stop words

df_clean['title'] = df_clean['title'].apply(preprocess_text)
df_clean['overview'] = df_clean['overview'].apply(preprocess_text)
df_clean['tagline'] = df_clean['tagline'].apply(preprocess_text)


In [193]:
# Apply log(1 + x) to the positively-skewed attributes; log1p keeps zero values at zero
skewed_columns = ['budget', 'popularity', 'revenue', 'vote_count']
for skewed_column in skewed_columns:
    df_clean[skewed_column] = np.log1p(df_clean[skewed_column])

In [194]:
# Account for multicollinearity

# revenue and budget are highly correlated, so they are replaced by a single
# engineered feature. The ratio is computed from the raw values in `df`,
# not from the log-transformed copies in `df_clean`.
both_nonzero = (df['budget'] != 0) & (df['revenue'] != 0)
raw_ratio = df['revenue'] / df['budget']
df_clean['revenue_budget_ratio'] = np.where(both_nonzero, raw_ratio, 0) # 0 when either value is zero

# vote_count is highly correlated with popularity, and revenue/budget are now
# represented by the ratio, so all three columns are removed.
df_clean.drop(columns=['vote_count', 'revenue', 'budget'], inplace=True)


In [195]:
# Encode categorical variables

def encode_multilabel(labels, prefix):
    """One-hot encode a Series whose values are lists of labels.

    Parameters
    ----------
    labels : pd.Series
        Each element is a list of label strings (e.g. the genres of one movie).
    prefix : str
        Prepended to every generated column name as ``f'{prefix}_{label}'``.

    Returns
    -------
    pd.DataFrame
        One 0/1 indicator column per distinct label, sharing the index of `labels`.
    """
    binarizer = MultiLabelBinarizer() # a fresh binarizer per field keeps the fitted classes separate
    encoded = binarizer.fit_transform(labels)
    # Prefix the columns: without it, identical labels in different fields, or a
    # label equal to an existing column name (e.g. a keyword called 'title'),
    # would yield duplicate column names after the concat below.
    columns = [f'{prefix}_{label}' for label in binarizer.classes_]
    # Reuse the source index so the concat below aligns row-for-row.
    return pd.DataFrame(encoded, columns=columns, index=labels.index)

genres_df = encode_multilabel(df_clean['genres'], 'genre')
keywords_df = encode_multilabel(df_clean['keywords'], 'keyword')
production_companies_df = encode_multilabel(df_clean['production_companies'], 'company')
production_countries_df = encode_multilabel(df_clean['production_countries'], 'country')
spoken_languages_df = encode_multilabel(df_clean['spoken_languages'], 'language')

# Drop old categorical fields from the dataframe
categorical_fields = ['genres', 'keywords', 'production_companies', 'production_countries', 'spoken_languages']
df_clean.drop(columns=categorical_fields, inplace=True)

# Combine the non-categorical columns with the encoded categorical columns
data = pd.concat(
    [df_clean, genres_df, keywords_df, production_companies_df, production_countries_df, spoken_languages_df],
    axis=1,
)

In [196]:
# Standardize the continuous variables
scaler = StandardScaler()
continuous = ['revenue_budget_ratio', 'popularity', 'runtime', 'vote_average']

# The same scaler object is refit for every variable, so after the loop it only
# holds the statistics of the last entry in `continuous`.
for var in continuous:
    data[var] = data[var].astype('float64')
    single_column = data[[var]] # 2-D input, as the scaler expects
    data[var] = scaler.fit_transform(single_column)

## Models

In [197]:
# Split data into training and testing sets
X = data.drop(columns=['popularity'])  # features
y = data['popularity']  # target variable
# A fixed seed makes the 80/20 split, and every metric computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Linear Regression Model

# X still contains free-text, language-code and datetime columns, which
# LinearRegression cannot convert to floats, so the model is fit on the numeric
# features only. Missing values are filled with 0, which is the mean of the
# standardized continuous columns.
X_train_numeric = X_train.select_dtypes(include='number').fillna(0)
X_test_numeric = X_test.select_dtypes(include='number').fillna(0)

lr_model = LinearRegression()
lr_model.fit(X_train_numeric, y_train)
y_lr_pred = lr_model.predict(X_test_numeric)

In [None]:
# NLP Text Analysis

tfidf_vectorizer = TfidfVectorizer()
text_variables = ['title', 'overview', 'tagline']

# TfidfVectorizer expects an iterable of strings. Iterating a DataFrame yields
# its column names, so the text fields are first joined into one document per movie.
text_corpus = data[text_variables].astype(str).apply(' '.join, axis=1)

# Select the documents with the same row labels as the existing split so the
# text features line up with y_train / y_test.
train_text = text_corpus.loc[X_train.index]
test_text = text_corpus.loc[X_test.index]

# Fit the vocabulary on the training documents only, then reuse it for the test set.
X_text = tfidf_vectorizer.fit_transform(train_text) # vectorize training text
X_test_text = tfidf_vectorizer.transform(test_text)

text_model = LinearSVR(random_state=42) # seeded for reproducible results
text_model.fit(X_text, y_train)  # fit LinearSVR model on text data

y_text_pred = text_model.predict(X_test_text)

In [None]:
# Combine predictions and evaluate

# Blend the two models' test-set predictions: 70% linear regression, 30% text model.
weighted_avg_pred = 0.7 * y_lr_pred + 0.3 * y_text_pred # use weighted average

# Compute the RMSE as sqrt(MSE): the `squared=False` argument is no longer
# accepted by mean_squared_error in recent scikit-learn releases.
# The target was log1p-transformed and standardized earlier, so the error is on that scale.
combined_rmse = np.sqrt(mean_squared_error(y_test, weighted_avg_pred)) # evaluate the combined predictions
print(f'Combined Root Mean Squared Error: {combined_rmse}')