In [1]:
import ast
import datetime

import cpi
import matplotlib.pyplot as plt
# Required imports
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Common functions

# Source: https://stackoverflow.com/questions/43214978/seaborn-barplot-displaying-values
def show_values_on_bars(axs, h_v="v", space=0.4):
    def _show_on_single_plot(ax):
        if h_v == "v":
            for p in ax.patches:
                _x = p.get_x() + p.get_width() / 2
                _y = p.get_y() + p.get_height()
                value = int(p.get_height())
                ax.text(_x, _y, value, ha="center")
        elif h_v == "h":
            for p in ax.patches:
                _x = p.get_x() + p.get_width() + float(space)
                _y = p.get_y() + p.get_height()
                value = int(p.get_width())
                ax.text(_x, _y, value, ha="left")

    if isinstance(axs, np.ndarray):
        for idx, ax in np.ndenumerate(axs):
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)

dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

def text_to_dict(df):
    for column in dict_columns:
        df[column] = df[column].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x) )
    return df

def plot_itemset(title, result, dataset_size, rotation=90):
    plt.figure(figsize=(20,12))
    sns.barplot([", ".join(list(x)) for x in result["itemsets"]], [x * dataset_size for x in result["support"]])
    plt.xticks(fontsize=12,rotation=rotation)
    plt.title(title,fontsize=20)
    plt.show()

def fix_date(date):
    x = pd.to_datetime(date, format="%m/%d/%y")
    if x.year > 2020:
        year = x.year - 100
    else:
        year = x.year
    return datetime.datetime(year,x.month,x.day)

def adjust_price_to_inflation(price, date):
    return int(cpi.inflate(price, date.year))


In [3]:
# Datasets loading
dataset_original = pd.read_csv('./tmdb-box-office-prediction/train.csv')
dataset = dataset_original

oscars = pd.read_csv('./tmdb-box-office-prediction/the_oscar_award.csv')
oscars_winners = oscars[oscars["winner"] == True].drop(columns=["winner", "year_film", "year_ceremony", "ceremony", "category", "name"])
oscars_winners = oscars_winners[oscars_winners["film"].notnull()]
dataset_lc = dataset.copy()
oscars_winners_lc = oscars_winners.copy()
dataset_lc['title'] = dataset_lc['title'].str.lower()
oscars_winners_lc['film'] = oscars_winners_lc['film'].str.lower()
oscar_movies = pd.merge(left=dataset_lc, right=oscars_winners_lc, left_on='title', right_on='film').drop_duplicates(subset='id')
oscar_movies = oscar_movies.drop(columns=['film'])

dataset = text_to_dict(dataset)
dataset["release_date"] = dataset["release_date"].apply(fix_date)
dataset["budget"] = dataset.apply(lambda x: adjust_price_to_inflation(x["budget"], x["release_date"]), axis=1)
dataset["revenue"] = dataset.apply(lambda x: adjust_price_to_inflation(x["revenue"], x["release_date"]), axis=1)

oscar_movies = text_to_dict(oscar_movies)
oscar_movies["release_date"] = oscar_movies["release_date"].apply(fix_date)
oscar_movies["budget"] = oscar_movies.apply(lambda x: adjust_price_to_inflation(x["budget"], x["release_date"]), axis=1)
oscar_movies["revenue"] = oscar_movies.apply(lambda x: adjust_price_to_inflation(x["revenue"], x["release_date"]), axis=1)

In [4]:
print(dataset.info())
print(dataset.describe(include=[np.number]))
dataset.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     3000 non-null   int64         
 1   belongs_to_collection  3000 non-null   object        
 2   budget                 3000 non-null   int64         
 3   genres                 3000 non-null   object        
 4   homepage               946 non-null    object        
 5   imdb_id                3000 non-null   object        
 6   original_language      3000 non-null   object        
 7   original_title         3000 non-null   object        
 8   overview               2992 non-null   object        
 9   popularity             3000 non-null   float64       
 10  poster_path            2999 non-null   object        
 11  production_companies   3000 non-null   object        
 12  production_countries   3000 non-null   object        
 13  rel

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",15101018,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2015-02-20,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",13283126
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",54135944,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,2004-08-06,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",128775114
2,3,{},3563750,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,2014-10-10,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",14138371


In [5]:
feature_vector = ['budget', 'genres', 'homepage', 'belongs_to_collection', 'original_language', 'release_date', 'production_companies', 'production_countries', 'runtime', 'Keywords', 'cast', 'crew', 'revenue']
dataset[feature_vector].head(5)

Unnamed: 0,budget,genres,homepage,belongs_to_collection,original_language,release_date,production_companies,production_countries,runtime,Keywords,cast,crew,revenue
0,15101018,"[{'id': 35, 'name': 'Comedy'}]",,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",en,2015-02-20,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",93.0,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",13283126
1,54135944,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,"[{'id': 107674, 'name': 'The Princess Diaries ...",en,2004-08-06,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",113.0,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",128775114
2,3563750,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,{},en,2014-10-10,"[{'name': 'Bold Films', 'id': 2266}, {'name': ...","[{'iso_3166_1': 'US', 'name': 'United States o...",105.0,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",14138371
3,1336221,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,{},hi,2012-03-09,{},"[{'iso_3166_1': 'IN', 'name': 'India'}]",122.0,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",17816284
4,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,{},ko,2009-02-05,{},"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",118.0,{},"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",4676071


In [6]:
# Data mappers
def check_belongs_to_collection(element):
    return (isinstance(element,list) or isinstance(element,dict)) and len(element) > 0
    
def map_belongs_to_collection(dataset):    
    return dataset['belongs_to_collection'].apply(lambda x: check_belongs_to_collection(x))
    
def check_has_homepage(element):
    return isinstance(element, str) and element.startswith(('http:', 'https:'))
    
def map_has_homepage(dataset):    
    return dataset['homepage'].apply(lambda x: check_has_homepage(x))

def map_release_date_to_release_month(dataset):
    return dataset['release_date'].apply(lambda x: x.month)

def map_runtime(dataset):
    return dataset['runtime'].apply(lambda x: int(x))

def check_production_company(element):
    return (isinstance(element,list) or isinstance(element,dict)) and len(element) > 0
    
def check_production_country(element):
    return (isinstance(element,list) or isinstance(element,dict)) and len(element) > 0

def check_keywords(element):
    return (isinstance(element,list) or isinstance(element,dict)) and len(element) > 0

def check_genres(element):
    return (isinstance(element,list) or isinstance(element,dict)) and len(element) > 0

In [7]:
processed_dataset = dataset.copy()
feature_vector = ['budget', 'genres', 'homepage', 'belongs_to_collection', 'original_language', 'release_date', 'production_companies', 'production_countries', 'runtime', 'Keywords', 'cast', 'crew', 'revenue']
processed_dataset = processed_dataset[feature_vector]
processed_dataset = processed_dataset[processed_dataset['budget'] != 0]
processed_dataset['belongs_to_collection'] = map_belongs_to_collection(processed_dataset)
processed_dataset['homepage'] = map_has_homepage(processed_dataset)
processed_dataset['release_date'] = map_release_date_to_release_month(processed_dataset)
processed_dataset = processed_dataset.rename(columns={'release_date': 'release_month'})
processed_dataset = processed_dataset[processed_dataset['runtime'].notnull()]
processed_dataset['runtime'] = map_runtime(processed_dataset)
#keywordsMask = processed_dataset['Keywords'].apply(lambda x: check_keywords(x))
#processed_dataset = processed_dataset[keywordsMask]
#companiesMask = processed_dataset['production_companies'].apply(lambda x: check_production_company(x))
#processed_dataset = processed_dataset[companiesMask]
#countriesMask = processed_dataset['production_countries'].apply(lambda x: check_production_country(x))
#processed_dataset = processed_dataset[countriesMask]
# genresMask = processed_dataset['genres'].apply(lambda x: check_genres(x))
# processed_dataset = processed_dataset[genresMask]

processed_dataset['genres'] = processed_dataset['genres'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
processed_dataset['production_companies'] = processed_dataset['production_companies'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
processed_dataset['production_countries'] = processed_dataset['production_countries'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
processed_dataset['Keywords'] = processed_dataset['Keywords'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
processed_dataset['cast'] = processed_dataset['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
processed_dataset['crew'] = processed_dataset['crew'].apply(lambda x: [i['name'] for i in x if i['job'] == 'Director'] if x != {} else [])
processed_dataset = processed_dataset.rename(columns={'crew': 'directors'})
processed_dataset = processed_dataset.rename(columns={'Keywords': 'keywords'})
processed_dataset = processed_dataset.reset_index(drop=True)

processed_dataset['production_companies_sizes'] = processed_dataset['production_companies'].apply(lambda x: len(x))
processed_dataset['production_countries_sizes'] = processed_dataset['production_countries'].apply(lambda x: len(x))
processed_dataset['keywords_sizes'] = processed_dataset['keywords'].apply(lambda x: len(x))
processed_dataset['cast_sizes'] = processed_dataset['cast'].apply(lambda x: len(x))
processed_dataset['director_sizes'] = processed_dataset['directors'].apply(lambda x: len(x))

processed_dataset

Unnamed: 0,budget,genres,homepage,belongs_to_collection,original_language,release_month,production_companies,production_countries,runtime,keywords,cast,directors,revenue,production_companies_sizes,production_countries_sizes,keywords_sizes,cast_sizes,director_sizes
0,15101018,[Comedy],False,True,en,2,"[Paramount Pictures, United Artists, Metro-Gol...",[United States of America],93,"[time travel, sequel, hot tub, duringcreditsst...","[Rob Corddry, Craig Robinson, Clark Duke, Adam...",[Steve Pink],13283126,3,1,4,24,1
1,54135944,"[Comedy, Drama, Family, Romance]",False,True,en,8,[Walt Disney Pictures],[United States of America],113,"[coronation, duty, marriage, falling in love]","[Anne Hathaway, Julie Andrews, H√©ctor Elizond...",[Garry Marshall],128775114,1,1,4,20,1
2,3563750,[Drama],True,False,en,10,"[Bold Films, Blumhouse Productions, Right of W...",[United States of America],105,"[jazz, obsession, conservatory, music teacher,...","[Miles Teller, J.K. Simmons, Melissa Benoist, ...",[Damien Chazelle],14138371,3,1,12,51,1
3,1336221,"[Thriller, Drama]",True,False,hi,3,[],[India],122,"[mystery, bollywood, police corruption, crime,...","[Vidya Balan, Nawazuddin Siddiqui, Parambrata ...",[Sujoy Ghosh],17816284,0,1,7,7,1
4,18004014,"[Animation, Adventure, Family]",False,False,en,8,[],[],83,[],"[Scott Grimes, Tom Bosley, Rickie Lee Jones, J...",[Hal Sutherland],7340322,0,0,0,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2182,16978824,"[Action, Adventure, Drama, Family, Fantasy]",False,False,en,3,[Douglas Fairbanks Pictures],[United States of America],149,"[male nudity, treasure, magic, palace, flying ...","[Douglas Fairbanks, Snitz Edwards, Charles Bel...",[Raoul Walsh],18148357,1,1,12,21,1
2183,81203917,"[Comedy, Drama]",True,False,en,6,"[DreamWorks SKG, Amblin Entertainment, Parkes/...",[United States of America],128,"[new york, airport, marriage proposal, transla...","[Tom Hanks, Catherine Zeta-Jones, Stanley Tucc...",[Steven Spielberg],296959010,3,1,18,23,1
2184,105912715,"[Crime, Action, Mystery, Thriller]",False,False,en,10,"[New Line Cinema, Forge, The Steve Tisch Company]",[United States of America],120,"[assassination, amnesia, hostage, chase, dark ...","[Geena Davis, Samuel L. Jackson, Yvonne Zima, ...",[Renny Harlin],145763206,3,1,14,15,1
2185,56842742,"[Comedy, Romance]",True,False,en,1,"[Jersey Films, Loofah Productions]",[United States of America],90,"[beach, honeymoon, bride, chance, risk, relati...","[Ben Stiller, Jennifer Aniston, Philip Seymour...",[John Hamburg],232735009,2,1,22,26,1


In [8]:
r = processed_dataset.sort_values('revenue', ascending=False)["cast"].head(500)
r = r.reset_index(drop=True).tolist()
te = TransactionEncoder()
te_ary = te.fit(r).transform(r)
df = pd.DataFrame(te_ary, columns=te.columns_)
res = fpgrowth(df, min_support=0.0015, max_len=3, use_colnames=True)
a = res[res["itemsets"].str.len() >= 2].sort_values('support', ascending=False).reset_index(drop=True).head(50)
a

Unnamed: 0,support,itemsets
0,0.012,"(Lois Maxwell, Desmond Llewelyn)"
1,0.008,"(Peter Cullen, Mark Ryan)"
2,0.008,"(Geoffrey Keen, Walter Gotell, Desmond Llewelyn)"
3,0.008,"(Chris Wedge, Denis Leary)"
4,0.008,"(Reno Wilson, Jess Harnell)"
5,0.008,"(Reno Wilson, John Turturro)"
6,0.008,"(Reno Wilson, Glenn Morshower)"
7,0.008,"(Reno Wilson, Peter Cullen)"
8,0.008,"(Reno Wilson, Josh Duhamel)"
9,0.008,"(Reno Wilson, Mark Ryan)"


In [9]:
itemset_names = {}
for index, item in enumerate(a.itemsets):
    itemset_names[item] = "actor_itemset_" + str(index)
itemset_names

{frozenset({'Desmond Llewelyn', 'Lois Maxwell'}): 'actor_itemset_0',
 frozenset({'Mark Ryan', 'Peter Cullen'}): 'actor_itemset_1',
 frozenset({'Desmond Llewelyn',
            'Geoffrey Keen',
            'Walter Gotell'}): 'actor_itemset_2',
 frozenset({'Chris Wedge', 'Denis Leary'}): 'actor_itemset_3',
 frozenset({'Jess Harnell', 'Reno Wilson'}): 'actor_itemset_4',
 frozenset({'John Turturro', 'Reno Wilson'}): 'actor_itemset_5',
 frozenset({'Glenn Morshower', 'Reno Wilson'}): 'actor_itemset_6',
 frozenset({'Peter Cullen', 'Reno Wilson'}): 'actor_itemset_7',
 frozenset({'Josh Duhamel', 'Reno Wilson'}): 'actor_itemset_8',
 frozenset({'Mark Ryan', 'Reno Wilson'}): 'actor_itemset_9',
 frozenset({'Cheech Marin', 'Tony Shalhoub'}): 'actor_itemset_10',
 frozenset({'Judi Dench',
            'Pierce Brosnan',
            'Samantha Bond'}): 'actor_itemset_11',
 frozenset({'Judi Dench', 'Samantha Bond'}): 'actor_itemset_12',
 frozenset({'Josh Duhamel', 'Mark Ryan', 'Reno Wilson'}): 'actor_itemse

In [12]:
processed_dataset["actors_itemsets"] = processed_dataset["cast"].apply(lambda x: list(filter(None, [itemset_names[itemset] if all(actor in x for actor in itemset) else [] for itemset in a.itemsets])))
for x in processed_dataset['actors_itemsets']:
    print(x)


[]
[]
[]
[]
[]
[]
[]
['actor_itemset_46', 'actor_itemset_47']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['actor_itemset_3', 'actor_itemset_19', 'actor_itemset_22', 'actor_itemset_27', 'actor_itemset_28', 'actor_itemset_29', 'actor_itemset_30', 'actor_itemset_31', 'actor_itemset_32', 'actor_itemset_43']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['actor_itemset_21', 'actor_itemset_24']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['actor_itemset_49']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['actor_itemset_10']
[]
[]
[]
[]
[]
[]
[]
[

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['actor_itemset_20']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['actor_itemset_10']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['actor_itemset_11', 'actor_itemset_12', 'actor_itemset_14', 'actor_itemset_18']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


In [13]:
processed_dataset['actors_itemsets'] = processed_dataset['actors_itemsets'].apply(lambda x: x if x else ['other'])
processed_dataset = processed_dataset.explode('actors_itemsets').reset_index(drop=True)
processed_dataset.shape

(2349, 19)

In [12]:
def generate_occurrence_dict(source_dataset, column_name):
    exploded_dataset = source_dataset.explode(column_name).reset_index(drop=True)
    count = exploded_dataset.groupby([column_name]).size().reset_index(name='counts')
    count_dict = {}
    
    for idx, element in enumerate(count[column_name]):
        count_dict[element] = count[count[column_name] == element]['counts'].iloc[0]
    
    return count_dict

directors_occurrence_dict = generate_occurrence_dict(processed_dataset, 'directors')
cast_occurrence_dict = generate_occurrence_dict(processed_dataset, 'cast')
keywords_occurrence_dict = generate_occurrence_dict(processed_dataset, 'keywords')
production_countries_occurrence_dict = generate_occurrence_dict(processed_dataset, 'production_countries')
production_companies_occurrence_dict = generate_occurrence_dict(processed_dataset, 'production_companies')

In [13]:
processed_dataset = processed_dataset.explode('directors').reset_index(drop=True)

def checkDirector(director):
    try:
        occurrence_number = directors_occurrence_dict[director]
    except:
        occurrence_number = 0
    return occurrence_number >= 8

processed_dataset['director'] = processed_dataset['directors'].apply(lambda x: x if checkDirector(x) else 'other')
processed_dataset['director'].value_counts()

other                 2228
Michael Bay            100
Carlos Saldanha         22
John Glen               16
Chris Wedge             12
Galen T. Chu            10
Steven Spielberg        10
Peter Jackson           10
Mike Thurmeier          10
Clint Eastwood           9
Martin Scorsese          8
Martin Campbell          8
Wes Craven               8
Alfred Hitchcock         8
Ron Howard               8
Paul W.S. Anderson       8
Brian De Palma           8
Steven Soderbergh        8
Name: director, dtype: int64

In [14]:
processed_dataset = processed_dataset.explode('genres').reset_index(drop=True)
processed_dataset = processed_dataset.rename(columns={'genres': 'genre'})
processed_dataset.shape

(6683, 20)

In [15]:
processed_dataset

Unnamed: 0,budget,genre,homepage,belongs_to_collection,original_language,release_month,production_companies,production_countries,runtime,keywords,cast,directors,revenue,production_companies_sizes,production_countries_sizes,keywords_sizes,cast_sizes,director_sizes,actors_itemsets,director
0,15101018,Comedy,False,True,en,2,"[Paramount Pictures, United Artists, Metro-Gol...",[United States of America],93,"[time travel, sequel, hot tub, duringcreditsst...","[Rob Corddry, Craig Robinson, Clark Duke, Adam...",Steve Pink,13283126,3,1,4,24,1,other,other
1,54135944,Comedy,False,True,en,8,[Walt Disney Pictures],[United States of America],113,"[coronation, duty, marriage, falling in love]","[Anne Hathaway, Julie Andrews, H√©ctor Elizond...",Garry Marshall,128775114,1,1,4,20,1,other,other
2,54135944,Drama,False,True,en,8,[Walt Disney Pictures],[United States of America],113,"[coronation, duty, marriage, falling in love]","[Anne Hathaway, Julie Andrews, H√©ctor Elizond...",Garry Marshall,128775114,1,1,4,20,1,other,other
3,54135944,Family,False,True,en,8,[Walt Disney Pictures],[United States of America],113,"[coronation, duty, marriage, falling in love]","[Anne Hathaway, Julie Andrews, H√©ctor Elizond...",Garry Marshall,128775114,1,1,4,20,1,other,other
4,54135944,Romance,False,True,en,8,[Walt Disney Pictures],[United States of America],113,"[coronation, duty, marriage, falling in love]","[Anne Hathaway, Julie Andrews, H√©ctor Elizond...",Garry Marshall,128775114,1,1,4,20,1,other,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6678,56842742,Comedy,True,False,en,1,"[Jersey Films, Loofah Productions]",[United States of America],90,"[beach, honeymoon, bride, chance, risk, relati...","[Ben Stiller, Jennifer Aniston, Philip Seymour...",John Hamburg,232735009,2,1,22,26,1,other,other
6679,56842742,Romance,True,False,en,1,"[Jersey Films, Loofah Productions]",[United States of America],90,"[beach, honeymoon, bride, chance, risk, relati...","[Ben Stiller, Jennifer Aniston, Philip Seymour...",John Hamburg,232735009,2,1,22,26,1,other,other
6680,39779651,Thriller,True,False,en,9,"[Lions Gate Films, Vertigo Entertainment, Goth...",[United States of America],106,"[cia, airport, hero, fight, kidnapping, time b...","[Taylor Lautner, Lily Collins, Alfred Molina, ...",John Singleton,93297097,6,1,14,34,1,other,other
6681,39779651,Action,True,False,en,9,"[Lions Gate Films, Vertigo Entertainment, Goth...",[United States of America],106,"[cia, airport, hero, fight, kidnapping, time b...","[Taylor Lautner, Lily Collins, Alfred Molina, ...",John Singleton,93297097,6,1,14,34,1,other,other


In [16]:
processed_dataset = processed_dataset.explode('production_countries').reset_index(drop=True)

def checkCountry(production_country):
    try:
        occurrence_number = production_countries_occurrence_dict[production_country]
    except:
        occurrence_number = 0
    return occurrence_number >= 50

processed_dataset['production_country'] = processed_dataset['production_countries'].apply(lambda x: x if checkCountry(x) else 'other')
processed_dataset['production_country'].value_counts()

United States of America    5607
other                       1402
United Kingdom               851
France                       404
Germany                      369
Canada                       247
India                        179
Name: production_country, dtype: int64

In [17]:
processed_dataset = processed_dataset.explode('production_companies').reset_index(drop=True)

def checkCompany(production_company):
    try:
        occurrence_number = production_companies_occurrence_dict[production_company]
    except:
        occurrence_number = 0
    return occurrence_number >= 75

processed_dataset['production_company'] = processed_dataset['production_companies'].apply(lambda x: x if checkCompany(x) else 'other')
processed_dataset['production_company'].value_counts()

other                                     27007
Paramount Pictures                          800
Warner Bros.                                657
Universal Pictures                          594
Twentieth Century Fox Film Corporation      540
Di Bonaventura Pictures                     342
Columbia Pictures                           315
Hasbro Studios                              312
Name: production_company, dtype: int64

In [18]:
processed_dataset = processed_dataset.explode('keywords').reset_index(drop=True)

def checkKeywords(keyword):
    try:
        occurrence_number = keywords_occurrence_dict[keyword]
    except:
        occurrence_number = 0
    return occurrence_number >= 75

processed_dataset['keyword'] = processed_dataset['keywords'].apply(lambda x: x if checkKeywords(x) else 'other')
processed_dataset['keyword'].value_counts()

other                   252796
duringcreditsstinger      3169
transformers              1828
murder                    1464
based on novel            1459
woman director            1373
independent film           832
Name: keyword, dtype: int64

In [19]:
processed_dataset = processed_dataset.explode('cast').reset_index(drop=True)

def checkCast(actor):
    try:
        occurrence_number = cast_occurrence_dict[actor]
    except:
        occurrence_number = 0
    return occurrence_number >= 20

processed_dataset['actor'] = processed_dataset['cast'].apply(lambda x: x if checkCast(x) else 'other')
processed_dataset['actor'].value_counts()

other                     6398907
Hugo Weaving                19262
Glenn Morshower             16247
Jess Harnell                15496
John Turturro               15130
                           ...   
Joost Janssen                 768
Helen Iesha Goldthorpe        768
Krista Schaeffer              768
Alan Pietruszewski            768
Vincent Jerome                768
Name: actor, Length: 222, dtype: int64

In [20]:
processed_dataset

Unnamed: 0,budget,genre,homepage,belongs_to_collection,original_language,release_month,production_companies,production_countries,runtime,keywords,...,production_countries_sizes,keywords_sizes,cast_sizes,director_sizes,actors_itemsets,director,production_country,production_company,keyword,actor
0,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,time travel,...,1,4,24,1,other,other,United States of America,Paramount Pictures,other,other
1,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,time travel,...,1,4,24,1,other,other,United States of America,Paramount Pictures,other,other
2,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,time travel,...,1,4,24,1,other,other,United States of America,Paramount Pictures,other,other
3,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,time travel,...,1,4,24,1,other,other,United States of America,Paramount Pictures,other,other
4,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,time travel,...,1,4,24,1,other,other,United States of America,Paramount Pictures,other,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7183315,39779651,Mystery,True,False,en,9,Quick Six Entertainment,United States of America,106,teenager,...,1,14,34,1,other,other,United States of America,other,other,other
7183316,39779651,Mystery,True,False,en,9,Quick Six Entertainment,United States of America,106,teenager,...,1,14,34,1,other,other,United States of America,other,other,other
7183317,39779651,Mystery,True,False,en,9,Quick Six Entertainment,United States of America,106,teenager,...,1,14,34,1,other,other,United States of America,other,other,other
7183318,39779651,Mystery,True,False,en,9,Quick Six Entertainment,United States of America,106,teenager,...,1,14,34,1,other,other,United States of America,other,other,other


In [21]:
processed_dataset.shape

(7183320, 24)

In [22]:
feature_vector = ['budget', 'genre', 'homepage', 'belongs_to_collection', 'original_language', 'release_month', 'production_company', 'production_country', 'runtime', 'keyword', 'actor', 'director', 'actors_itemsets', 'revenue']
processed_dataset = processed_dataset[feature_vector]
processed_dataset.head(20)

Unnamed: 0,budget,genre,homepage,belongs_to_collection,original_language,release_month,production_company,production_country,runtime,keyword,director,actors_itemsets,revenue
0,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126
1,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126
2,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126
3,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126
4,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126
5,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126
6,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126
7,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126
8,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126
9,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126


In [23]:
computatation_dataset = processed_dataset.copy()

In [24]:
computatation_dataset.drop_duplicates(keep = 'first', inplace = True)
computatation_dataset = computatation_dataset.reset_index(drop=True)
computatation_dataset.shape

(15623, 13)

In [25]:
computatation_dataset.to_csv('feature_vector.csv', index=False)
computatation_dataset.head(20)

Unnamed: 0,budget,genre,homepage,belongs_to_collection,original_language,release_month,production_company,production_country,runtime,keyword,director,actors_itemsets,revenue
0,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,other,other,other,13283126
1,15101018,Comedy,False,True,en,2,Paramount Pictures,United States of America,93,duringcreditsstinger,other,other,13283126
2,15101018,Comedy,False,True,en,2,other,United States of America,93,other,other,other,13283126
3,15101018,Comedy,False,True,en,2,other,United States of America,93,duringcreditsstinger,other,other,13283126
4,54135944,Comedy,False,True,en,8,other,United States of America,113,other,other,other,128775114
5,54135944,Drama,False,True,en,8,other,United States of America,113,other,other,other,128775114
6,54135944,Family,False,True,en,8,other,United States of America,113,other,other,other,128775114
7,54135944,Romance,False,True,en,8,other,United States of America,113,other,other,other,128775114
8,3563750,Drama,True,False,en,10,other,United States of America,105,other,other,other,14138371
9,1336221,Thriller,True,False,hi,3,other,India,122,other,other,other,17816284
