## Movie Data Analysis

## Goal of Analysis: Use machine learning algorithms to predict a movie's popularity as accurately as possible from the attributes in the TMDB 5000 Movies Dataset.

In [None]:
# Imports
import ast
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR

In [None]:
# Read the TMDB 5000 Movies CSV (looked up relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='tmdb_5000_movies.csv')

## Data Cleaning

In [None]:
# Work on an independent (deep) copy so the raw DataFrame `df` stays untouched during cleaning
df_clean = df.copy(deep=True)

In [None]:
# Remove the columns that are not used in the analysis
columns_to_drop = ['homepage', 'original_title', 'status']
df_clean.drop(labels=columns_to_drop, axis='columns', inplace=True)

In [None]:
# Use the id column as the row index so each movie can be identified by its id
# (the column is moved into the index, not kept as a regular column)
df_clean.set_index(keys='id', drop=True, inplace=True)

In [None]:
# genres; Extract genre names
# Each cell holds a serialized list of records with a 'name' key. ast.literal_eval
# parses literals only, so cell contents can never execute code the way eval() would.
df_clean['genres'] = df_clean['genres'].apply(
    lambda raw: [genre['name'] for genre in ast.literal_eval(raw)]
)

In [None]:
# keywords; Extract keywords
# Each cell holds a serialized list of records with a 'name' key. ast.literal_eval
# parses literals only, so cell contents can never execute code the way eval() would.
df_clean['keywords'] = df_clean['keywords'].apply(
    lambda raw: [keyword['name'] for keyword in ast.literal_eval(raw)]
)

In [None]:
# production_companies; Extract production company names
# Each cell holds a serialized list of records with a 'name' key. ast.literal_eval
# parses literals only, so cell contents can never execute code the way eval() would.
df_clean['production_companies'] = df_clean['production_companies'].apply(
    lambda raw: [company['name'] for company in ast.literal_eval(raw)]
)

In [None]:
# production_countries; Extract production countries
# Each cell holds a serialized list of records with a 'name' key. ast.literal_eval
# parses literals only, so cell contents can never execute code the way eval() would.
df_clean['production_countries'] = df_clean['production_countries'].apply(
    lambda raw: [country['name'] for country in ast.literal_eval(raw)]
)

In [None]:
# release_date; Parse the column into pandas datetime values
df_clean['release_date'] = df_clean['release_date'].pipe(pd.to_datetime)

In [None]:
# spoken_languages; Extract spoken languages
# Each cell holds a serialized list of records with a 'name' key. ast.literal_eval
# parses literals only, so cell contents can never execute code the way eval() would.
df_clean['spoken_languages'] = df_clean['spoken_languages'].apply(
    lambda raw: [language['name'] for language in ast.literal_eval(raw)]
)

In [None]:
# title, overview and tagline; Perform text preprocessing on text data for NLP analysis
# English stop words (insignificant words) are kept in a set for membership tests
stop_words = set()
stop_words.update(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character in one pass
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize a piece of free text for NLP analysis.

    The text is lowercased, stripped of ASCII punctuation, and filtered
    against the module-level ``stop_words`` set; the remaining words are
    joined with single spaces.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell, or ``None``) is treated
            as "no text".

    Returns:
        The cleaned text, or an empty string when there is nothing to clean.
    """
    # Missing cells arrive as NaN/None, which have no .lower(); map them to ''
    # instead of raising so whole columns can be processed with .apply().
    if not isinstance(text, str):
        return ''

    text = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean each free-text column element by element with preprocess_text
df_clean['title'] = df_clean['title'].map(preprocess_text)
df_clean['overview'] = df_clean['overview'].map(preprocess_text)
df_clean['tagline'] = df_clean['tagline'].map(preprocess_text)


In [None]:
# Compress the positively-skewed attributes with log(1 + x), one column at a time
for skewed_column in ('budget', 'popularity', 'revenue', 'vote_count'):
    df_clean[skewed_column] = np.log1p(df_clean[skewed_column])

In [None]:
# Account for multicollinearity

# vote_count is treated as highly correlated with popularity, so it is removed
df_clean.drop(columns='vote_count', inplace=True)

# revenue and budget are treated as highly correlated, so they are replaced by a
# single engineered ratio. The ratio is computed from the raw `df` because the
# df_clean copies were log-transformed above. `df` and `df_clean` carry different
# indexes, so the result is converted to a plain array and assigned by position.
df_clean['revenue_budget_ratio'] = (
    (df['revenue'] / df['budget'])
    .where((df['budget'] != 0) & (df['revenue'] != 0), 0)  # 0 unless both are nonzero
    .to_numpy()
)

# The two source columns are no longer needed once the ratio exists
df_clean.drop(columns=['revenue', 'budget'], inplace=True)


In [None]:
# Encode categorical variables

le = preprocessing.LabelEncoder()  # initialize LabelEncoder (re-fitted for every column below)

categorical_fields = ['genres', 'keywords', 'production_companies', 'production_countries', 'spoken_languages']
encoded_columns = [field + '_cat' for field in categorical_fields]

# The cleaned columns hold Python lists, which are unhashable and therefore
# rejected by LabelEncoder. Each list is converted to its string form first, so
# every distinct combination of names receives its own integer code.
# NOTE(review): the codes impose an arbitrary ordering on the combinations --
# confirm this is acceptable for the downstream models.
for field, encoded in zip(categorical_fields, encoded_columns):
    df_clean[encoded] = le.fit_transform(df_clean[field].astype(str))

# Drop old categorical fields from the dataframe. drop(..., inplace=True)
# returns None, so its result must not be assigned to anything.
df_clean.drop(columns=categorical_fields, inplace=True)

# Reindex the dataframe with encoded categorical columns first, followed by the
# remaining columns (Index.difference returns them in sorted order)
data = df_clean.reindex(encoded_columns + df_clean.columns.difference(encoded_columns).tolist(), axis=1)

In [None]:
# Standardize the continuous variables, one column at a time
scaler = StandardScaler()
continuous = ['revenue_budget_ratio', 'popularity', 'runtime', 'vote_average']

for column in continuous:
    # StandardScaler expects 2-D input, hence the reshape to a single-feature matrix
    as_float = data[column].astype('float64')
    data[column] = scaler.fit_transform(as_float.to_numpy().reshape(-1, 1))

## Models

In [None]:
# Split data into training and testing sets

# The regression model needs purely numeric input, so only numeric columns are
# used as features; the other columns (e.g. the free-text ones) stay in `data`.
features = data.drop(columns=['popularity']).select_dtypes(include='number')

# Estimators reject NaN, so any gaps are filled with the column mean.
# NOTE(review): presumably only a handful of values are missing -- confirm.
X = features.fillna(features.mean())  # features
y = data['popularity']  # target variable

# A fixed random_state makes the split (and every result built on it) reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Linear Regression Model

# fit() returns the estimator itself, so construction and training are chained
lr_model = LinearRegression().fit(X_train, y_train)
y_pred = lr_model.predict(X_test)  # predictions for the held-out rows

In [None]:
# NLP Text Analysis

text_variables = ['title', 'overview', 'tagline']

# TfidfVectorizer expects one string per document. Iterating a DataFrame yields
# its column labels, so the text columns are first merged into a single document
# per row (missing values become empty strings).
documents = data[text_variables].fillna('').astype(str).agg(' '.join, axis=1)

# Re-use the rows selected by train_test_split so the text features line up with
# y_train / y_test. Assumes the index labels are unique -- TODO confirm.
train_documents = documents.loc[X_train.index]
test_documents = documents.loc[X_test.index]

tfidf_vectorizer = TfidfVectorizer()
X_text = tfidf_vectorizer.fit_transform(train_documents)  # learn vocabulary/IDF from training text only

text_model = LinearSVR()
text_model.fit(X_text, y_train)  # fit LinearSVR model on text data

X_test_text = tfidf_vectorizer.transform(test_documents)  # vectorize held-out text with the fitted vocabulary
y_text_pred = text_model.predict(X_test_text)