# Imports

In [1]:
import pandas as pd
from collections import defaultdict
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Loading data and some stats

In [2]:
movies_data = pd.read_csv("movies_metadata.csv", usecols=['id', 'overview', 'original_language', 'revenue', 'budget', 'genres'])

box_office_data = pd.read_csv("box_office_collections.csv").drop(columns=['Movie Name', 'imdbId'])

  box_office_data = pd.read_csv("box_office_collections.csv").drop(columns=['Movie Name', 'imdbId'])


In [3]:
# Number of rows with missing id
print(movies_data['id'].isna().sum())
print(box_office_data['id'].isna().sum())
print("NaN revenues: ", movies_data['revenue'].isna().sum())
print("0 revenues: ", movies_data['revenue'].eq(0).sum())

print("Revenue range: [", movies_data['revenue'].min(), ", ", movies_data['revenue'].max(), "]")

# Create bins for revenue, first bin with NaN, second with 0s and the rest with equal width
step = movies_data['revenue'].max() / 10
bins = [-0.1, 0.1] + [step * i for i in range(1, 11)]
revenue_bins = pd.cut(movies_data['revenue'], bins)
print("Revenue bins: ", revenue_bins.value_counts())

0
0
NaN revenues:  0
0 revenues:  25061
Revenue range: [ 0.0 ,  8425441842.34856 ]
Revenue bins:  revenue
(-0.1, 0.1]                         25061
(0.1, 842544184.235]                20127
(842544184.235, 1685088368.47]        147
(1685088368.47, 2527632552.705]        14
(2527632552.705, 3370176736.939]        6
(3370176736.939, 4212720921.174]        3
(4212720921.174, 5055265105.409]        1
(5897809289.644, 6740353473.879]        1
(7582897658.114, 8425441842.349]        1
(5055265105.409, 5897809289.644]        0
(6740353473.879, 7582897658.114]        0
Name: count, dtype: int64


In [4]:
# Budget bins
print("NaN budgets: ", movies_data['budget'].isna().sum())
numerical_budgets = movies_data['budget']

print("Numerical budgets: ", numerical_budgets.shape[0])
print("Budget range: [", numerical_budgets.min(), ", ", numerical_budgets.max(), "]")

step = numerical_budgets.max() / 10
bins = [-0.1, 0.1] + [step * i for i in range(1, 11)]
budget_bins = pd.cut(numerical_budgets, bins)
print("Budget bins: ", budget_bins.value_counts())

NaN budgets:  0
Numerical budgets:  45361
Budget range: [ 0 ,  380000000 ]
Budget bins:  budget
(-0.1, 0.1]                   36476
(0.1, 38000000.0]              7374
(38000000.0, 76000000.0]        928
(76000000.0, 114000000.0]       298
(114000000.0, 152000000.0]      163
(152000000.0, 190000000.0]       64
(190000000.0, 228000000.0]       35
(228000000.0, 266000000.0]       19
(266000000.0, 304000000.0]        3
(342000000.0, 380000000.0]        1
(304000000.0, 342000000.0]        0
Name: count, dtype: int64


# Data cleaning

## IDs

In [5]:
print("Shape before ", movies_data.shape)
# movies_data = movies_data[movies_data['id'].str.isnumeric()]
# movies_data['id'] = movies_data['id'].astype(int)
print("Shape after dropping rows with invalid ids: ", movies_data.shape)

Shape before  (45361, 6)
Shape after dropping rows with invalid ids:  (45361, 6)


## Budget

In [6]:
# Add columns budget_unknown
movies_data['budget'] = pd.to_numeric(movies_data['budget'], errors='coerce', downcast='float')
movies_data['budget_unknown'] = movies_data['budget'].apply(lambda x: 1 if x == 0.0 else 0)
movies_data['budget_100M'] = movies_data['budget'] / 1e8
print("Number of non-zero budgets: ", movies_data['budget'].ne(0).sum())
print("Number of unknown budgets: ", movies_data['budget_unknown'].sum())

Number of non-zero budgets:  8885
Number of unknown budgets:  36476


In [7]:
# Stats for budget_100M
print("Range of budget_100M: [", movies_data['budget_100M'].min(), ", ", movies_data['budget_100M'].max(), "]")
print("Mean: ", movies_data['budget_100M'].mean())
print("Median: ", movies_data['budget_100M'].median())
print("Std: ", movies_data['budget_100M'].std())

Range of budget_100M: [ 0.0 ,  3.8 ]
Mean:  0.042339819796080334
Median:  0.0
Std:  0.17442577736732917


## Revenue

In [8]:
print("Shape before ", movies_data.shape)
movies_data.dropna(subset=['revenue'], inplace=True)
movies_data['revenue_100M'] = movies_data['revenue'] / 1e8
print("Shape after dropping rows with NaN revenues and appending revenue_100M column: ", movies_data.shape)

Shape before  (45361, 8)
Shape after dropping rows with NaN revenues and appending revenue_100M column:  (45361, 9)


In [9]:
# Stats for revenue_100M
print("Range of revenue_100M: [", movies_data['revenue_100M'].min(), ", ", movies_data['revenue_100M'].max(), "]")
print("Mean: ", movies_data['revenue_100M'].mean())
print("Median: ", movies_data['revenue_100M'].median())
print("Std: ", movies_data['revenue_100M'].std())

Range of revenue_100M: [ 0.0 ,  84.2544184234856 ]
Mean:  0.2285085935051553
Median:  0.0
Std:  1.2216037817366847


## Overview

In [10]:
movies_data['overview'] = movies_data['overview'].fillna('')

## Country-wise revenues

In [11]:
print("Shape before ", box_office_data.shape)
box_office_data[box_office_data.columns[1:]] = box_office_data[box_office_data.columns[1:]].replace('[\$,]', '', regex=True).astype(float)
# Append revenue_ to all column names except id
new_cols = [(col, 'revenue_'+col) for col in box_office_data.columns[1:]]
box_office_data.rename(columns=dict(new_cols), inplace=True)

print("Shape after ", box_office_data.shape)

Shape before  (10076, 130)
Shape after  (10076, 130)


## Movies metadata after initial cleanup

In [12]:
print("Shape: ", movies_data.shape)
print("dtypes:", movies_data.dtypes)


Shape:  (45361, 9)
dtypes: budget               float64
genres                object
id                     int64
original_language     object
overview              object
revenue              float64
budget_unknown         int64
budget_100M          float64
revenue_100M         float64
dtype: object


In [13]:
movies_data.head()

Unnamed: 0,budget,genres,id,original_language,overview,revenue,budget_unknown,budget_100M,revenue_100M
0,12000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",299782,en,"Orson Welles' unfinished masterpiece, restored...",0.0,0,0.12,0.0
1,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",38700,en,The continuing adventures of Miami detectives ...,497074500.0,1,0.0,4.970745
2,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",332283,en,The love affair between poet Percy Shelley and...,2443502.0,1,0.0,0.024435
3,0.0,"[{'id': 18, 'name': 'Drama'}]",412059,en,"In forgotten towns along the American border, ...",0.0,1,0.0,0.0
4,18000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,en,"Twenty years after the events of Iron Sky, the...",467232.7,0,0.18,0.004672


## Country-wise revenue after initial cleanup

In [14]:
print("Shape: ", box_office_data.shape)
print("dtypes:", box_office_data.dtypes)

Shape:  (10076, 130)
dtypes: id                        int64
revenue_Argentina       float64
revenue_Aruba           float64
revenue_Australia       float64
revenue_Austria         float64
                         ...   
revenue_E/W Africa      float64
revenue_Laos            float64
revenue_Bosnia          float64
revenue_Soviet Union    float64
revenue_Malta           float64
Length: 130, dtype: object


In [15]:
box_office_data.head()

Unnamed: 0,id,revenue_Argentina,revenue_Aruba,revenue_Australia,revenue_Austria,revenue_Bahrain,revenue_Belgium,revenue_Bolivia,revenue_Brazil,revenue_Bulgaria,...,revenue_Guatemala,revenue_Netherlands Antilles,revenue_North Macedonia,revenue_South Africa/Nigeria,revenue_Switzerland (French/Italian),revenue_E/W Africa,revenue_Laos,revenue_Bosnia,revenue_Soviet Union,revenue_Malta
0,133185,,,1203589.0,482703.0,,2444798.0,4083.0,,,...,,,,,,,,,,
1,133195,360654.0,,1750077.0,479263.0,335561.0,596815.0,68107.0,1051166.0,104255.0,...,,,,,,,,,,
2,133199,,,,,,,,,,...,,,,,,,,,,
3,133219,654232.0,,,,,,,,,...,,,,,,,,,,
4,133225,,,,,,,,1667.0,,...,,,,,,,,,,


# Join with country-wise revenue data

In [16]:
merged_data = pd.merge(movies_data, box_office_data, how='left', on='id')
print("Shape after merge: ", merged_data.shape)
merged_data.head()

Shape after merge:  (45361, 138)


Unnamed: 0,budget,genres,id,original_language,overview,revenue,budget_unknown,budget_100M,revenue_100M,revenue_Argentina,...,revenue_Guatemala,revenue_Netherlands Antilles,revenue_North Macedonia,revenue_South Africa/Nigeria,revenue_Switzerland (French/Italian),revenue_E/W Africa,revenue_Laos,revenue_Bosnia,revenue_Soviet Union,revenue_Malta
0,12000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",299782,en,"Orson Welles' unfinished masterpiece, restored...",0.0,0,0.12,0.0,,...,,,,,,,,,,
1,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",38700,en,The continuing adventures of Miami detectives ...,497074500.0,1,0.0,4.970745,,...,,,,,,,,,,
2,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",332283,en,The love affair between poet Percy Shelley and...,2443502.0,1,0.0,0.024435,,...,,,,,,,,,,
3,0.0,"[{'id': 18, 'name': 'Drama'}]",412059,en,"In forgotten towns along the American border, ...",0.0,1,0.0,0.0,,...,,,,,,,,,,
4,18000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,en,"Twenty years after the events of Iron Sky, the...",467232.7,0,0.18,0.004672,,...,,,,,,,,,,


# Stats for country-wise revenue columns with mostly missing values

In [17]:
# For each column, count the number of rows with NaN values. Also calculate the number of rows with NaN values in any and all columns.

print("Rows with any Na values: ", merged_data.isna().any(axis=1).sum())
print("Rows with all Na values: ", merged_data.isna().all(axis=1).sum())

x = merged_data.isna().sum()

# Print cols with only a few non-null values
print("Fewer than 10 non-null values ", len(x[x > merged_data.shape[0] - 10]))
print("Fewer than 50 non-null values ", len(x[x > merged_data.shape[0] - 50]))
print("Fewer than 100 non-null values ", len(x[x > merged_data.shape[0] - 100]))
print("Fewer than 500 non-null values ", len(x[x > merged_data.shape[0] - 500]))
print("Fewer than 1000 non-null values ", len(x[x > merged_data.shape[0] - 1000]))

non_na = merged_data.notna().sum()
print("Columns with more than 500 non-null values ", len(non_na[non_na > 500]))
print(non_na[non_na > 500])

print("Count of Na values in each column:")
pd.set_option("display.max_rows", 200)
print(x)
pd.reset_option("display.max_rows")

Rows with any Na values:  45361
Rows with all Na values:  0
Fewer than 10 non-null values  37
Fewer than 50 non-null values  54
Fewer than 100 non-null values  67
Fewer than 500 non-null values  112
Fewer than 1000 non-null values  129
Columns with more than 500 non-null values  26
budget                    45361
genres                    45361
id                        45361
original_language         45350
overview                  45361
revenue                   45361
budget_unknown            45361
budget_100M               45361
revenue_100M              45361
revenue_Argentina           540
revenue_Australia           820
revenue_Austria             549
revenue_Belgium             505
revenue_Domestic            693
revenue_France              778
revenue_Germany             695
revenue_Italy               676
revenue_Mexico              659
revenue_Netherlands         516
revenue_New Zealand         734
revenue_Portugal            550
revenue_Russia/CIS          570
revenue_South

### Remove revenue columns with less than 500 non-null values

In [18]:
# Pick countries with more than 500 country-wise revenue rows
countries = x[x > merged_data.shape[0] - 500].index.tolist()
merged_data.drop(columns=countries, inplace=True)
print("Shape after dropping columns with fewer than 1000 non-null values ", merged_data.shape)
merged_data.head()

Shape after dropping columns with fewer than 1000 non-null values  (45361, 26)


Unnamed: 0,budget,genres,id,original_language,overview,revenue,budget_unknown,budget_100M,revenue_100M,revenue_Argentina,...,revenue_Italy,revenue_Mexico,revenue_Netherlands,revenue_New Zealand,revenue_Portugal,revenue_Russia/CIS,revenue_South Korea,revenue_Spain,revenue_Taiwan,revenue_United Kingdom
0,12000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",299782,en,"Orson Welles' unfinished masterpiece, restored...",0.0,0,0.12,0.0,,...,,,,,,,,,,
1,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",38700,en,The continuing adventures of Miami detectives ...,497074500.0,1,0.0,4.970745,,...,,,,,,,,,,
2,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",332283,en,The love affair between poet Percy Shelley and...,2443502.0,1,0.0,0.024435,,...,,,,,,,,,,
3,0.0,"[{'id': 18, 'name': 'Drama'}]",412059,en,"In forgotten towns along the American border, ...",0.0,1,0.0,0.0,,...,,,,,,,,,,
4,18000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,en,"Twenty years after the events of Iron Sky, the...",467232.7,0,0.18,0.004672,,...,,,,,,,,,,


## Multi-hot encode Production Countries (skipped for now)

In [19]:
# print("All production_countries values: ", merged_data['production_countries'].unique())

# # Read the array inside each production_countries cell as a list, and convert it into a list of country_ids, where country_id is the index in dictionary built from all unique countries encountered in the list in each cell of production_countries column
# def get_country_isos(production_country):
#     country_isos = []
#     for country in ast.literal_eval(production_country):
#         country_isos.append(country['iso_3166_1'])
#     return country_isos

# merged_data['production_countries_isos'] = merged_data['production_countries'].apply(get_country_isos)

# # multi-hot encode the production_countries column
# mlb = MultiLabelBinarizer()
# mlb.fit(merged_data['production_countries_isos'])
# print("Total number of classes: ", len(mlb.classes_))
# print("Classes: ", mlb.classes_)

# multi_hot_encoded_countries = mlb.transform(merged_data['production_countries_isos'])
# # Create a dataframe with the multi-hot encoded columns, where column names are 'production_country_' + mlb.classes_
# multi_hot_encoded_countries_df = pd.DataFrame(multi_hot_encoded_countries, columns=['production_country_' + country for country in mlb.classes_])

# merged_data = pd.concat([merged_data, multi_hot_encoded_countries_df], axis=1)
# merged_data.drop(columns=['production_countries', 'production_countries_isos'], inplace=True)
# merged_data.head()

## One-hot encode languages

In [20]:
print("All original_language values: ", merged_data['original_language'].unique())

# Convert original_language to one-hot encoding, including NaN values
merged_data = pd.get_dummies(merged_data, columns=['original_language'], dummy_na=True)
merged_data.head()

All original_language values:  ['en' 'hi' 'ja' 'fi' 'ru' 'sv' 'pt' 'ca' 'es' 'fr' 'zh' 'el' 'he' 'de'
 'te' 'it' 'tr' 'ta' 'hu' 'bg' 'da' 'nl' 'pl' 'ko' 'fa' 'sr' 'uk' 'is'
 'tl' 'ar' 'lv' 'ml' 'no' 'id' 'ro' 'hr' 'iu' 'lt' 'cn' 'ms' 'et' 'mr'
 'cs' nan 'ab' 'sl' 'ne' 'pa' 'th' 'kn' 'sq' 'ur' 'vi' 'mk' 'eu' 'ka' 'ky'
 'kk' 'xx' 'mt' 'ku' 'af' 'gl' 'lo' 'nb' 'bn' 'sm' 'bs' 'am' 'ps' 'rw'
 'tg' 'jv' 'bm' 'si' 'mn' 'bo' 'zu' 'lb' 'fy' 'sk' 'uz' 'sh' 'wo' 'cy'
 'la' 'hy' 'ay' 'eo' 'qu']


Unnamed: 0,budget,genres,id,overview,revenue,budget_unknown,budget_100M,revenue_100M,revenue_Argentina,revenue_Australia,...,original_language_tr,original_language_uk,original_language_ur,original_language_uz,original_language_vi,original_language_wo,original_language_xx,original_language_zh,original_language_zu,original_language_nan
0,12000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",299782,"Orson Welles' unfinished masterpiece, restored...",0.0,0,0.12,0.0,,,...,False,False,False,False,False,False,False,False,False,False
1,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",38700,The continuing adventures of Miami detectives ...,497074500.0,1,0.0,4.970745,,,...,False,False,False,False,False,False,False,False,False,False
2,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",332283,The love affair between poet Percy Shelley and...,2443502.0,1,0.0,0.024435,,,...,False,False,False,False,False,False,False,False,False,False
3,0.0,"[{'id': 18, 'name': 'Drama'}]",412059,"In forgotten towns along the American border, ...",0.0,1,0.0,0.0,,,...,False,False,False,False,False,False,False,False,False,False
4,18000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,"Twenty years after the events of Iron Sky, the...",467232.7,0,0.18,0.004672,,,...,False,False,False,False,False,False,False,False,False,False


## Multi-hot encode genres

In [21]:
merged_data['genres'].head()

0    [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
1    [{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...
2    [{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...
3                        [{'id': 18, 'name': 'Drama'}]
4    [{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...
Name: genres, dtype: object

In [22]:
# Read the array inside each genres cell as a list
def get_genre_list(genres):
    genre_list = []
    for genre in ast.literal_eval(genres):
        genre_list.append(genre['name'])
    return genre_list

merged_data['genres_list'] = merged_data['genres'].apply(get_genre_list)
merged_data.head()

Unnamed: 0,budget,genres,id,overview,revenue,budget_unknown,budget_100M,revenue_100M,revenue_Argentina,revenue_Australia,...,original_language_uk,original_language_ur,original_language_uz,original_language_vi,original_language_wo,original_language_xx,original_language_zh,original_language_zu,original_language_nan,genres_list
0,12000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",299782,"Orson Welles' unfinished masterpiece, restored...",0.0,0,0.12,0.0,,,...,False,False,False,False,False,False,False,False,False,"[Comedy, Drama]"
1,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",38700,The continuing adventures of Miami detectives ...,497074500.0,1,0.0,4.970745,,,...,False,False,False,False,False,False,False,False,False,"[Thriller, Action, Crime]"
2,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",332283,The love affair between poet Percy Shelley and...,2443502.0,1,0.0,0.024435,,,...,False,False,False,False,False,False,False,False,False,"[Drama, Romance]"
3,0.0,"[{'id': 18, 'name': 'Drama'}]",412059,"In forgotten towns along the American border, ...",0.0,1,0.0,0.0,,,...,False,False,False,False,False,False,False,False,False,[Drama]
4,18000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",302349,"Twenty years after the events of Iron Sky, the...",467232.7,0,0.18,0.004672,,,...,False,False,False,False,False,False,False,False,False,"[Action, Comedy, Fantasy, Science Fiction]"


In [23]:
# multi-hot encode the genres column
genres_mlb = MultiLabelBinarizer()
genres_mlb.fit(merged_data['genres_list'])
print("Total number of genres: ", len(genres_mlb.classes_))
print(genres_mlb.classes_)

multi_hot_encoded_genres = genres_mlb.transform(merged_data['genres_list'])

# Create a dataframe with the multi-hot encoded columns, where column names are 'genre_' + mlb.classes_
multi_hot_encoded_genres_df = pd.DataFrame(multi_hot_encoded_genres, columns=['genre_' + genre for genre in genres_mlb.classes_])

# Append the multi-hot encoded columns to the dataframe and drop the original genres column
merged_data = pd.concat([merged_data, multi_hot_encoded_genres_df], axis=1)
merged_data.drop(columns=['genres', 'genres_list'], inplace=True)

print("Shape after multi-hot encoding genres: ", merged_data.shape)
merged_data.head()

Total number of genres:  20
['Action' 'Adventure' 'Animation' 'Comedy' 'Crime' 'Documentary' 'Drama'
 'Family' 'Fantasy' 'Foreign' 'History' 'Horror' 'Music' 'Mystery'
 'Romance' 'Science Fiction' 'TV Movie' 'Thriller' 'War' 'Western']
Shape after multi-hot encoding genres:  (45361, 134)


Unnamed: 0,budget,id,overview,revenue,budget_unknown,budget_100M,revenue_100M,revenue_Argentina,revenue_Australia,revenue_Austria,...,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western
0,12000000.0,299782,"Orson Welles' unfinished masterpiece, restored...",0.0,0,0.12,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
1,0.0,38700,The continuing adventures of Miami detectives ...,497074500.0,1,0.0,4.970745,,,,...,0,0,0,0,0,0,0,1,0,0
2,0.0,332283,The love affair between poet Percy Shelley and...,2443502.0,1,0.0,0.024435,,,,...,0,0,0,0,1,0,0,0,0,0
3,0.0,412059,"In forgotten towns along the American border, ...",0.0,1,0.0,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
4,18000000.0,302349,"Twenty years after the events of Iron Sky, the...",467232.7,0,0.18,0.004672,,,,...,0,0,0,0,0,1,0,0,0,0


In [24]:
# Rows with any NaN values in columns other than country wise revenue columns starting with revenue
non_revenue_cols = [col for col in merged_data.columns if not col.startswith('revenue_')]
print("Rows with any Na values except in revenue cols: ", merged_data[non_revenue_cols].isna().any(axis=1).sum())

Rows with any Na values except in revenue cols:  0


# Training model for 'revenue' column prediction

In [25]:
# Final data
data = merged_data
data.head()

Unnamed: 0,budget,id,overview,revenue,budget_unknown,budget_100M,revenue_100M,revenue_Argentina,revenue_Australia,revenue_Austria,...,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western
0,12000000.0,299782,"Orson Welles' unfinished masterpiece, restored...",0.0,0,0.12,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
1,0.0,38700,The continuing adventures of Miami detectives ...,497074500.0,1,0.0,4.970745,,,,...,0,0,0,0,0,0,0,1,0,0
2,0.0,332283,The love affair between poet Percy Shelley and...,2443502.0,1,0.0,0.024435,,,,...,0,0,0,0,1,0,0,0,0,0
3,0.0,412059,"In forgotten towns along the American border, ...",0.0,1,0.0,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
4,18000000.0,302349,"Twenty years after the events of Iron Sky, the...",467232.7,0,0.18,0.004672,,,,...,0,0,0,0,0,1,0,0,0,0


In [26]:
data = data[data['revenue_100M'] > 0]
print("Shape after dropping 0 revenue rows ", data.shape)

Shape after dropping 0 revenue rows  (20300, 134)


# Join with cast and crew data

In [27]:
credits_data = pd.read_csv("credits.csv")
print("Shape of credits data: ", credits_data.shape)

Shape of credits data:  (45476, 3)


In [28]:
credits_data.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [29]:
# Join credits_data with data
data = pd.merge(data, credits_data, how='left', on='id')

In [30]:
data.head()

Unnamed: 0,budget,id,overview,revenue,budget_unknown,budget_100M,revenue_100M,revenue_Argentina,revenue_Australia,revenue_Austria,...,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western,cast,crew
0,0.0,38700,The continuing adventures of Miami detectives ...,497074500.0,1,0.0,4.970745,,,,...,0,0,0,0,0,1,0,0,"[{'cast_id': 0, 'character': 'Detective Mike L...","[{'credit_id': '585fcddd92514115cd01d8c8', 'de..."
1,0.0,332283,The love affair between poet Percy Shelley and...,2443502.0,1,0.0,0.024435,,,,...,0,0,1,0,0,0,0,0,"[{'cast_id': 2, 'character': 'Mary Shelley', '...","[{'credit_id': '588f287ac3a36860d1005f7a', 'de..."
2,18000000.0,302349,"Twenty years after the events of Iron Sky, the...",467232.7,0,0.18,0.004672,,,,...,0,0,0,1,0,0,0,0,"[{'cast_id': 1, 'character': 'Wolfgang Kortzfl...","[{'credit_id': '5461f666c3a3686f4c001f27', 'de..."
3,0.0,245842,King Louis XIV's quest for immortality leads h...,2664181.0,1,0.0,0.026642,,,,...,0,0,0,0,0,0,0,0,"[{'cast_id': 0, 'character': 'King Louis XIV',...","[{'credit_id': '5431de49c3a36825d300007e', 'de..."
4,0.0,341689,"A couple of British 1970s teen-aged boys, Enn ...",460536.4,1,0.0,0.004605,,,,...,1,0,1,1,0,0,0,0,"[{'cast_id': 4, 'character': 'Zan', 'credit_id...","[{'credit_id': '57f11a4e9251416d9c003884', 'de..."


In [31]:
# Get top 1000 cast members' id and name for the dataset
top_cast = defaultdict(int)
def update_cast_count(cast_list):
    for cast in ast.literal_eval(cast_list):
        top_cast[(cast['id'], cast['name'])] += 1

data['cast'].apply(update_cast_count)

# Sort the cast by count and pick top 1000
top_cast = sorted(top_cast.items(), key=lambda x: x[1], reverse=True)[:1000]
top_1000_cast_ids = [cast[0][0] for cast in top_cast]
top_1000_cast_names = [cast[0][1] for cast in top_cast]
print("Top 50 cast members: ", top_1000_cast_names[:50])


Top 50 cast members:  ['Samuel L. Jackson', 'Robert De Niro', 'Bruce Willis', 'Steve Buscemi', 'John Goodman', 'Willem Dafoe', 'Liam Neeson', 'Morgan Freeman', 'Christopher Walken', 'Nicolas Cage', 'John Cusack', 'Gérard Depardieu', 'Stanley Tucci', 'John Turturro', 'Susan Sarandon', 'Michael Caine', 'Harvey Keitel', 'Donald Sutherland', 'Robin Williams', 'John Hurt', 'Woody Harrelson', 'James Franco', 'Robert Duvall', 'John Leguizamo', 'Richard Jenkins', 'Matt Damon', 'Forest Whitaker', 'Paul Giamatti', 'Keith David', 'Julianne Moore', 'J.K. Simmons', 'Johnny Depp', 'Jim Broadbent', 'Danny Glover', 'Whoopi Goldberg', 'Alec Baldwin', 'Ben Kingsley', 'Dan Aykroyd', 'Nick Nolte', 'Anthony Hopkins', 'John Malkovich', 'Sylvester Stallone', 'Danny DeVito', 'William H. Macy', 'Antonio Banderas', 'Stellan Skarsgård', 'Ed Harris', 'Dennis Quaid', 'Jeff Bridges', 'Harry Dean Stanton']


In [32]:
# Get top 500 crew members' id and name for the dataset
top_crew = defaultdict(int)
def update_crew_count(crew_list):
    for crew in ast.literal_eval(crew_list):
        top_crew[(crew['id'], crew['name'])] += 1

data['crew'].apply(update_crew_count)

# Sort the crew by count and pick top 500
top_crew = sorted(top_crew.items(), key=lambda x: x[1], reverse=True)[:500]
top_100_crew_ids = [crew[0][0] for crew in top_crew]
top_100_crew_names = [crew[0][1] for crew in top_crew]
print("Top crew members: ", top_100_crew_names)

Top crew members:  ['Avy Kaufman', 'Mary Vernieu', 'Kerry Barden', 'Deborah Aquila', 'Billy Hopkins', 'Tricia Wood', 'Luc Besson', 'Suzanne Smith', 'Steven Spielberg', 'James Newton Howard', 'Harvey Weinstein', 'Robert Rodriguez', 'Bob Weinstein', 'Hans Zimmer', 'James Horner', 'Woody Allen', 'Steven Soderbergh', 'Jerry Goldsmith', 'Mark Isham', 'Danny Elfman', 'Alan Silvestri', 'Nancy Nayor', 'Scott Rudin', 'Arnon Milchan', 'Francine Maisler', 'Tim Bevan', 'John Williams', 'Nina Gold', 'John Debney', 'Brian Grazer', 'Clint Eastwood', 'John Papsidera', 'Eric Fellner', 'Bonnie Timmermann', 'Mike Fenton', 'Jina Jay', 'J.J. Makaro', 'Francis Ford Coppola', "Dan O'Connell", 'Graeme Revell', 'Denise Chamian', 'Carter Burwell', 'Christopher Young', 'Lynn Stalmaster', 'Joel Coen', 'Alexandre Desplat', 'Marco Beltrami', 'Thomas Newman', 'Christophe Beck', 'Akira Kurosawa', 'Joel Silver', 'John Hughes', 'Janet Hirshenson', 'Jane Jenkins', 'Martin Scorsese', 'Ethan Coen', 'David Newman', 'Bruce 

In [33]:
# Get ids for cast members present in top 1000 cast
def get_top_cast_ids(cast):
    top_cast_ids = []
    for cast_member in ast.literal_eval(cast):
        if cast_member['id'] in top_1000_cast_ids:
            top_cast_ids.append(cast_member['id'])
    return top_cast_ids

def get_important_crew_ids(crew):
    important_crew_ids = []
    for crew_member in ast.literal_eval(crew):
        if crew_member['id'] in top_100_crew_ids:
            important_crew_ids.append(crew_member['id'])
    return important_crew_ids

data['top_cast_ids'] = data['cast'].apply(get_top_cast_ids)
data['important_crew_ids'] = data['crew'].apply(get_important_crew_ids)
data.drop(columns=['cast', 'crew'], inplace=True)
data.head()

Unnamed: 0,budget,id,overview,revenue,budget_unknown,budget_100M,revenue_100M,revenue_Argentina,revenue_Australia,revenue_Austria,...,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western,top_cast_ids,important_crew_ids
0,0.0,38700,The continuing adventures of Miami detectives ...,497074500.0,1,0.0,4.970745,,,,...,0,0,0,0,0,1,0,0,"[2888, 78029]",[770]
1,0.0,332283,The love affair between poet Percy Shelley and...,2443502.0,1,0.0,0.024435,,,,...,0,0,1,0,0,0,0,0,"[18050, 8435]",[]
2,18000000.0,302349,"Twenty years after the events of Iron Sky, the...",467232.7,0,0.18,0.004672,,,,...,0,0,0,1,0,0,0,0,[1646],[]
3,0.0,245842,King Louis XIV's quest for immortality leads h...,2664181.0,1,0.0,0.026642,,,,...,0,0,0,0,0,0,0,0,"[517, 227, 3052]","[5144, 1617]"
4,0.0,341689,"A couple of British 1970s teen-aged boys, Enn ...",460536.4,1,0.0,0.004605,,,,...,1,0,1,1,0,0,0,0,"[18050, 2227]","[5669, 9027]"


In [34]:
# Multi-hot encode the cast and crew ids
cast_mlb = MultiLabelBinarizer()
crew_mlb = MultiLabelBinarizer()

cast_mlb.fit(data['top_cast_ids'])
print("Total number of cast and crew: ", len(cast_mlb.classes_))
multi_hot_encoded_cast_ids = cast_mlb.transform(data['top_cast_ids'])

crew_mlb.fit(data['important_crew_ids'])
print("Total number of cast and crew: ", len(crew_mlb.classes_))
multi_hot_encoded_crew_ids = crew_mlb.transform(data['important_crew_ids'])

multi_hot_encoded_cast_df = pd.DataFrame(multi_hot_encoded_cast_ids, columns=['cast_' + str(cast_id) for cast_id in cast_mlb.classes_])
multi_hot_encoded_crew_df = pd.DataFrame(multi_hot_encoded_crew_ids, columns=['crew_' + str(crew_id) for crew_id in crew_mlb.classes_])

# Append the multi-hot encoded columns to the dataframe and drop the original cast and crew column
merged_data = pd.concat([data, multi_hot_encoded_cast_df, multi_hot_encoded_crew_df], axis=1)
merged_data.drop(columns=['top_cast_ids', 'important_crew_ids'], inplace=True)

Total number of cast and crew:  1000
Total number of cast and crew:  500


In [35]:
data = merged_data
data.head()

Unnamed: 0,budget,id,overview,revenue,budget_unknown,budget_100M,revenue_100M,revenue_Argentina,revenue_Australia,revenue_Austria,...,crew_1404217,crew_1404244,crew_1424894,crew_1429549,crew_1447543,crew_1456696,crew_1548698,crew_1552521,crew_1733142,crew_1813644
0,0.0,38700,The continuing adventures of Miami detectives ...,497074500.0,1,0.0,4.970745,,,,...,0,0,0,0,0,0,0,0,0,0
1,0.0,332283,The love affair between poet Percy Shelley and...,2443502.0,1,0.0,0.024435,,,,...,0,0,0,0,0,0,0,0,0,0
2,18000000.0,302349,"Twenty years after the events of Iron Sky, the...",467232.7,0,0.18,0.004672,,,,...,0,0,0,0,0,0,0,0,0,0
3,0.0,245842,King Louis XIV's quest for immortality leads h...,2664181.0,1,0.0,0.026642,,,,...,0,0,0,0,0,0,0,0,0,0
4,0.0,341689,"A couple of British 1970s teen-aged boys, Enn ...",460536.4,1,0.0,0.004605,,,,...,0,0,0,0,0,0,0,0,0,0


In [36]:
data.drop(columns=['budget', 'revenue'], inplace=True)

In [37]:
# Split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [38]:
train_size = train_data.shape[0]
test_size = test_data.shape[0]
print("Train size: ", train_size)
print("Test size: ", test_size)

Train size:  16277
Test size:  4070


In [39]:
# Save train and test data
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

In [40]:
# Save top 1000 cast and top 500 crew name to id mappings
top_cast_df = pd.DataFrame({'id': top_1000_cast_ids, 'name': top_1000_cast_names})
top_cast_df.to_csv("top_cast.csv", index=False)
top_crew_df = pd.DataFrame({'id': top_100_crew_ids, 'name': top_100_crew_names})
top_crew_df.to_csv("top_crew.csv", index=False)

In [41]:
print(genres_mlb.classes_)

['Action' 'Adventure' 'Animation' 'Comedy' 'Crime' 'Documentary' 'Drama'
 'Family' 'Fantasy' 'Foreign' 'History' 'Horror' 'Music' 'Mystery'
 'Romance' 'Science Fiction' 'TV Movie' 'Thriller' 'War' 'Western']
