Importing libraries

In [1]:
import ast
from collections import Counter

import numpy as np
import pandas as pd

Importing the train and test datasets

In [2]:
# Directory that holds train.csv and test.csv. Keeping it in one constant means
# the notebook can be pointed at another location by editing a single line.
DATA_DIR = "C:/Users/hgfer/OneDrive/Documentos/Notebooks/TMDB Box Office Prediction"

# Read both CSV files into DataFrames.
train = pd.read_csv(DATA_DIR + "/train.csv")
test = pd.read_csv(DATA_DIR + "/test.csv")

Let's check the information of the datasets

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after each summary.
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
id                       3000 non-null int64
belongs_to_collection    604 non-null object
budget                   3000 non-null int64
genres                   2993 non-null object
homepage                 946 non-null object
imdb_id                  3000 non-null object
original_language        3000 non-null object
original_title           3000 non-null object
overview                 2992 non-null object
popularity               3000 non-null float64
poster_path              2999 non-null object
production_companies     2844 non-null object
production_countries     2945 non-null object
release_date             3000 non-null object
runtime                  2998 non-null float64
spoken_languages         2980 non-null object
status                   3000 non-null object
tagline                  2403 non-null object
title                    3000 non-null object
Keywords             

# 1. Data Preparation

Let's put all data together

In [46]:
# Target vector: the revenue column of the train dataset.
y = train['revenue']

# Train features with the revenue column removed.
train_drop = train.drop('revenue', axis=1)

# Number of rows in the train dataset.
n_train = len(train)

# Stack the train features on top of the test dataset, renumbering the rows
# with a fresh 0..N-1 index.
all_data = pd.concat([train_drop, test], sort=False, ignore_index=True)

# Summary of the combined dataset.
all_data.info()

# Number of most common values kept when inspecting a feature.
top_value = 10

# Dropping the identifier/title features is currently disabled:
#all_data = all_data.drop(columns = ['id','imdb_id','original_title','title'], axis = 1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7398 entries, 0 to 7397
Data columns (total 22 columns):
id                       7398 non-null int64
belongs_to_collection    1481 non-null object
budget                   7398 non-null int64
genres                   7375 non-null object
homepage                 2366 non-null object
imdb_id                  7398 non-null object
original_language        7398 non-null object
original_title           7398 non-null object
overview                 7376 non-null object
popularity               7398 non-null float64
poster_path              7396 non-null object
production_companies     6984 non-null object
production_countries     7241 non-null object
release_date             7397 non-null object
runtime                  7392 non-null float64
spoken_languages         7336 non-null object
status                   7396 non-null object
tagline                  5938 non-null object
title                    7395 non-null object
Keywords           

Now let's check each feature

## 1.1 Belongs to Collection

Let's flag the movies that belong to a collection with 1 and the ones that don't with 0.

In [8]:
# Flag whether the movie has a collection: 1 when the cell holds a value, 0 when
# it is missing. Series.notna() tests for missing values directly instead of
# comparing the stringified cell against the text 'nan'.
all_data['has_collection'] = all_data['belongs_to_collection'].notna().astype('int64')

# Drop the raw column now that the flag exists. `columns=` already names the
# axis, so the redundant `axis=1` argument is not passed.
all_data = all_data.drop(columns=['belongs_to_collection'])

## 1.2 Genres

In [9]:
# Parse every non-missing cell as a Python literal and collect the 'name' of
# each entry. ast.literal_eval accepts literals only, so text coming from the CSV
# cannot execute code the way eval() would allow.
# A missing cell becomes an empty list; the previous placeholder 0 was itself
# counted as a "genre" and would have failed in 'genre_' + 0 had it ranked.
list_genres = all_data['genres'].apply(
    lambda raw: [entry['name'] for entry in ast.literal_eval(raw)] if pd.notna(raw) else []
)

# The `top_value` most common genre names, most frequent first.
genre_counts = Counter(name for names in list_genres for name in names)
top_genre = [name for name, _ in genre_counts.most_common(top_value)]

# One 0/1 column per top genre, decided by exact membership in the movie's own
# list of names rather than by a substring search of the raw cell text.
for genre in top_genre:
    all_data['genre_' + genre] = list_genres.apply(lambda names: int(genre in names))

# Drop the raw genres column.
all_data = all_data.drop(columns=['genres'])

## 1.3 Homepage

In [10]:
# Flag whether the movie has a homepage: 1 when the cell holds a value, 0 when
# it is missing. Series.notna() tests for missing values directly instead of
# comparing the stringified cell against the text 'nan'.
all_data['has_homepage'] = all_data['homepage'].notna().astype('int64')

# Drop the raw column now that the flag exists. `columns=` already names the
# axis, so the redundant `axis=1` argument is not passed.
all_data = all_data.drop(columns=['homepage'])

## 1.4 Original Language

In [11]:
# One-hot encode original_language into indicator columns;
# drop_first=True leaves out the column of the first category.
all_data = pd.get_dummies(
    all_data,
    columns=['original_language'],
    prefix='original_language',
    drop_first=True,
)

## 1.5 Overview

In [12]:
# Flag whether the movie has an overview: 1 when the cell holds a value, 0 when
# it is missing. Series.notna() tests for missing values directly instead of
# comparing the stringified cell against the text 'nan'.
all_data['has_overview'] = all_data['overview'].notna().astype('int64')

# Drop the raw column now that the flag exists. `columns=` already names the
# axis, so the redundant `axis=1` argument is not passed.
all_data = all_data.drop(columns=['overview'])

## 1.6 Poster Path

In [13]:
# Flag whether the movie has a poster path: 1 when the cell holds a value, 0
# when it is missing. Series.notna() tests for missing values directly instead
# of comparing the stringified cell against the text 'nan'.
all_data['has_poster_path'] = all_data['poster_path'].notna().astype('int64')

# Drop the raw column now that the flag exists. `columns=` already names the
# axis, so the redundant `axis=1` argument is not passed.
all_data = all_data.drop(columns=['poster_path'])

## 1.7 Production Companies

In [14]:
# Parse every non-missing cell as a Python literal and collect the 'name' of
# each entry. ast.literal_eval accepts literals only, so text coming from the CSV
# cannot execute code the way eval() would allow.
# A movie without production companies is represented by the single
# placeholder name 'nan', so "no company" can still rank as its own category.
list_companies = all_data['production_companies'].apply(
    lambda raw: [entry['name'] for entry in ast.literal_eval(raw)] if pd.notna(raw) else ['nan']
)

# The `top_value` most common company names, most frequent first.
# Counting over the lists directly avoids building one pandas Series per row.
company_counts = Counter(name for names in list_companies for name in names)
top_companies = [name for name, _ in company_counts.most_common(top_value)]

# One 0/1 column per top company, decided by exact membership in the movie's
# own list of names. The previous substring search of the raw cell text also
# matched any longer name that merely contained the searched text.
for company in top_companies:
    all_data['production_company_' + company] = list_companies.apply(lambda names: int(company in names))

# Drop the raw production companies column.
all_data = all_data.drop(columns=['production_companies'])

## 1.8 Production Countries

In [15]:
# Parse every non-missing cell as a Python literal and collect the 'name' of
# each entry. ast.literal_eval accepts literals only, so text coming from the CSV
# cannot execute code the way eval() would allow.
# A movie without production countries is represented by the single
# placeholder name 'nan', so "no country" can still rank as its own category.
list_countries = all_data['production_countries'].apply(
    lambda raw: [entry['name'] for entry in ast.literal_eval(raw)] if pd.notna(raw) else ['nan']
)

# The `top_value` most common country names, most frequent first.
# Counting over the lists directly avoids building one pandas Series per row.
country_counts = Counter(name for names in list_countries for name in names)
top_countries = [name for name, _ in country_counts.most_common(top_value)]

# One 0/1 column per top country, decided by exact membership in the movie's
# own list of names instead of a substring search of the raw cell text.
for country in top_countries:
    all_data['production_country_' + country] = list_countries.apply(lambda names: int(country in names))

# Drop the raw production countries column.
all_data = all_data.drop(columns=['production_countries'])

## 1.9 Release Date

In [None]:
# Convert the release_date column to pandas datetime values.
# NOTE(review): if the raw dates carry two-digit years, confirm the century
# pandas infers for them is the intended one.
all_data['release_date'] = pd.to_datetime(all_data['release_date'])

## 1.10 Runtime

In [None]:
# Replace missing runtimes with the mean runtime of the combined dataset.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# the selection all_data['runtime'] may act on a temporary copy (it does under
# pandas Copy-on-Write), which would leave all_data itself unchanged.
all_data['runtime'] = all_data['runtime'].fillna(all_data['runtime'].mean())

## 1.11 Spoken Languages

In [16]:
# Parse every non-missing cell as a Python literal and collect the 'iso_639_1'
# code of each entry. ast.literal_eval accepts literals only, so text coming from
# the CSV cannot execute code the way eval() would allow.
# A movie without spoken languages is represented by the single placeholder
# code 'nan', so "no language" can still rank as its own category.
list_languages = all_data['spoken_languages'].apply(
    lambda raw: [entry['iso_639_1'] for entry in ast.literal_eval(raw)] if pd.notna(raw) else ['nan']
)

# The `top_value` most common language codes, most frequent first.
# Counting over the lists directly avoids building one pandas Series per row.
language_counts = Counter(code for codes in list_languages for code in codes)
top_languages = [code for code, _ in language_counts.most_common(top_value)]

# One 0/1 column per top language, decided by exact membership in the movie's
# own list of codes. The previous substring search ran over the whole
# stringified cell, so a short code also matched wherever its letters happened
# to occur in the keys or in other field values.
for language in top_languages:
    all_data['spoken_language_' + language] = list_languages.apply(lambda codes: int(language in codes))

# Drop the raw spoken languages column.
all_data = all_data.drop(columns=['spoken_languages'])


## 1.12 Status

In [17]:
# One-hot encode status into indicator columns;
# drop_first=True leaves out the column of the first category.
all_data = pd.get_dummies(
    all_data,
    columns=['status'],
    prefix='status',
    drop_first=True,
)

## 1.13 Tagline

In [18]:
# Flag whether the movie has a tagline: 1 when the cell holds a value, 0 when
# it is missing. Series.notna() tests for missing values directly instead of
# comparing the stringified cell against the text 'nan'.
all_data['has_tagline'] = all_data['tagline'].notna().astype('int64')

# Drop the raw tagline column now that the flag exists. `columns=` already
# names the axis, so the redundant `axis=1` argument is not passed.
all_data = all_data.drop(columns=['tagline'])

## 1.14 Keywords

In [19]:
# Parse every non-missing cell as a Python literal and collect the 'name' of
# each entry. ast.literal_eval accepts literals only, so text coming from the CSV
# cannot execute code the way eval() would allow.
# A movie without keywords is represented by the single placeholder name 'nan'.
list_keywords = all_data['Keywords'].apply(
    lambda raw: [entry['name'] for entry in ast.literal_eval(raw)] if pd.notna(raw) else ['nan']
)

# The `top_value` most common keywords, most frequent first.
# Counting over the lists directly avoids building one pandas Series per row.
keyword_counts = Counter(name for names in list_keywords for name in names)
top_keywords = [name for name, _ in keyword_counts.most_common(top_value)]

# One 0/1 column per top keyword, decided by exact membership in the movie's
# own keyword list. The previous substring search of the raw cell text also
# matched any longer keyword that merely contained the searched text.
for keyword in top_keywords:
    all_data['keyword_' + keyword] = list_keywords.apply(lambda names: int(keyword in names))

# Drop the raw Keywords column.
all_data = all_data.drop(columns=['Keywords'])

## 1.15 Cast

In [20]:
# Collect the 'gender' value of every cast entry, parsing each row once.
# ast.literal_eval accepts literals only, so text coming from the CSV cannot
# execute code the way eval() would allow. A missing cast cell yields an empty
# list, which counts as 0 for every gender code.
cast_genders = all_data['cast'].apply(
    lambda raw: [member['gender'] for member in ast.literal_eval(raw)] if pd.notna(raw) else []
)

# Number of cast entries per gender code (0, 1, 2) for each movie.
for gender_code in (0, 1, 2):
    all_data['cast_gender_' + str(gender_code)] = cast_genders.apply(lambda genders: genders.count(gender_code))

In [21]:
# Parse every non-missing cell as a Python literal and collect the 'name' of
# each cast entry. ast.literal_eval accepts literals only, so text coming from
# the CSV cannot execute code the way eval() would allow.
# A movie without cast is represented by the single placeholder name 'nan'.
list_cast_names = all_data['cast'].apply(
    lambda raw: [member['name'] for member in ast.literal_eval(raw)] if pd.notna(raw) else ['nan']
)

# The `top_value` most common cast names, most frequent first.
# Counting over the lists directly avoids building one pandas Series per row.
cast_name_counts = Counter(member for members in list_cast_names for member in members)
top_cast_names = [member for member, _ in cast_name_counts.most_common(top_value)]

# One 0/1 column per top cast name, decided by exact membership in the movie's
# own list of names. The previous substring search ran over the whole
# stringified cell, so it also matched the text inside any other field.
for name in top_cast_names:
    all_data['cast_name_' + name] = list_cast_names.apply(lambda members: int(name in members))

# Drop the raw cast column.
all_data = all_data.drop(columns=['cast'])

## 1.16 Crew

In [22]:
# Collect the 'gender' value of every crew entry, parsing each row once.
# ast.literal_eval accepts literals only, so text coming from the CSV cannot
# execute code the way eval() would allow. A missing crew cell yields an empty
# list, which counts as 0 for every gender code.
crew_genders = all_data['crew'].apply(
    lambda raw: [member['gender'] for member in ast.literal_eval(raw)] if pd.notna(raw) else []
)

# Number of crew entries per gender code (0, 1, 2) for each movie.
for gender_code in (0, 1, 2):
    all_data['crew_gender_' + str(gender_code)] = crew_genders.apply(lambda genders: genders.count(gender_code))

In [23]:
# Parse each crew cell once as a Python literal; missing cells stay missing.
# ast.literal_eval accepts literals only, so text coming from the CSV cannot
# execute code the way eval() would allow.
crew_parsed = all_data['crew'].apply(lambda raw: ast.literal_eval(raw) if pd.notna(raw) else raw)

# For each of the three fields below, a movie without crew is represented by the
# single placeholder value 'nan'. The 0/1 columns are decided by exact
# membership in the movie's own list of values for that field. The previous
# substring search ran over the whole stringified cell, so a department or job
# also matched wherever its text occurred inside any other field.

# Crew names: the `top_value` most common, most frequent first.
list_crew_names = crew_parsed.apply(
    lambda members: [member['name'] for member in members] if isinstance(members, list) else ['nan']
)
top_crew_names = [
    value for value, _ in Counter(v for row in list_crew_names for v in row).most_common(top_value)
]
for crew_name in top_crew_names:
    all_data['crew_name_' + crew_name] = list_crew_names.apply(lambda row: int(crew_name in row))

# Crew departments: the `top_value` most common, most frequent first.
list_crew_departments = crew_parsed.apply(
    lambda members: [member['department'] for member in members] if isinstance(members, list) else ['nan']
)
top_crew_departments = [
    value for value, _ in Counter(v for row in list_crew_departments for v in row).most_common(top_value)
]
for crew_department in top_crew_departments:
    all_data['crew_department_' + crew_department] = list_crew_departments.apply(
        lambda row: int(crew_department in row)
    )

# Crew jobs: the `top_value` most common, most frequent first.
list_crew_jobs = crew_parsed.apply(
    lambda members: [member['job'] for member in members] if isinstance(members, list) else ['nan']
)
top_crew_jobs = [
    value for value, _ in Counter(v for row in list_crew_jobs for v in row).most_common(top_value)
]
for jobs in top_crew_jobs:
    all_data['crew_job_' + jobs] = list_crew_jobs.apply(lambda row: int(jobs in row))

# Drop the raw crew column.
all_data = all_data.drop(columns=['crew'])
