In [1]:
import numpy as np 
import pandas as pd
import os
import gc
import time
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=16)

pd.set_option("max_colwidth", 40)

In [2]:
data_path = "../data/imdb-dataset/"

In [3]:
os.listdir(data_path)

['name.basics.tsv',
 'title.akas.tsv',
 'title.basics.tsv',
 'title.principals.tsv',
 'title.ratings.tsv']

In [4]:
name_basics = pd.read_table(data_path+"name.basics.tsv")

In [5]:
title_akas = pd.read_table(data_path+"title.akas.tsv", low_memory=False)

In [6]:
title_basics = pd.read_table(data_path+"title.basics.tsv", low_memory=False)

In [7]:
title_principals = pd.read_table(data_path+"title.principals.tsv")

In [8]:
title_ratings = pd.read_table(data_path+"title.ratings.tsv")

In [9]:
# convert unknown startYear data to 0 
title_basics.loc[title_basics['startYear']=='\\N','startYear'] = 0
# convert to numeric
title_basics.startYear = title_basics.startYear.astype(np.int64)

In [10]:
df = pd.read_csv("../data/movie_data.tsv", index_col=0)

In [11]:
tmdb = pd.read_csv("../data/tmdb-dataset/movie_tmdb.csv", encoding='utf8', engine='python')

In [12]:
df.tail()

Unnamed: 0,user_name,movie_name,release_year,user_review,user_rating,review_date,rewatched,review_likes
293465,/silentjoe13/,Abduction,2011,I had a feeling this film was going ...,1,"29 Sep, 2011",0,1
293466,/silentjoe13/,GoodFellas,1990,Best gangster film ever. Even surpas...,10,"01 Mar, 2012",0,4
293467,/silentjoe13/,Drive,2011,Probably the best surprise from 2011...,10,"25 Sep, 2011",0,6
293468,/silentjoe13/,Cinema Paradiso,1988,Not only is this one of the greatest...,10,"01 Mar, 2012",0,6
293469,/silentjoe13/,Manhattan,1979,"Witty, whimsical, poignant, and inte...",10,"29 Feb, 2012",0,5


In [13]:
tmdb.loc[~(tmdb.release_date.isnull()),'year'] = tmdb.release_date.apply(lambda x: str(x)[:4])

In [14]:
pd.options.display.max_columns = 50

In [15]:
tmdb['year'] = tmdb.year.fillna(0).astype(np.int64)

In [16]:
tmdb.isnull().any(axis=0)

adult                    False
backdrop_path             True
belongs_to_collection     True
budget                    True
genres                    True
homepage                  True
id                        True
imdb_id                   True
original_language         True
original_title            True
overview                  True
popularity                True
poster_path               True
production_companies      True
production_countries      True
release_date              True
revenue                   True
runtime                   True
spoken_languages          True
status                    True
tagline                   True
title                     True
video                     True
vote_average              True
vote_count                True
year                     False
dtype: bool

In [17]:
name_basics.head(1)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0053137,tt0072308,tt0043044"


In [18]:
title_akas.head(1)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0


In [19]:
title_basics.head(1)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"


In [20]:
title_principals.head(1)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Herself""]"


In [21]:
title_ratings.head(1)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1550


In [22]:
tmdb.columns

Index(['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year'],
      dtype='object')

In [23]:
df_v1 = pd.merge(df, title_basics[title_basics.titleType=='movie'],  how='left',
                  left_on=['movie_name','release_year'], 
                  right_on = ['primaryTitle','startYear'])

In [24]:
sum(df_v1.primaryTitle.isnull())

55520

In [25]:
df_v1.shape

(299379, 17)

In [26]:
df.shape

(293470, 8)

In [27]:
print(f'{df_v1.shape[0] - df.shape[0]} entries added due to duplicate matches')

5909 entries added due to duplicate matches


In [28]:
df_v2 = pd.merge(df_v1, tmdb,  how='left',
                     left_on=['movie_name','release_year','tconst'], 
                     right_on = ['title','year','imdb_id'],
                     suffixes=("_orig","_tmdb"))

In [29]:
df_v2.head(1)

Unnamed: 0,user_name,movie_name,release_year,user_review,user_rating,review_date,rewatched,review_likes,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_orig,adult,backdrop_path,belongs_to_collection,budget,genres_tmdb,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,/worsethan/,Star Wars: The Rise of Skywalker,2019,"""I remember when I was with Special ...",10,"21 Dec, 2019",0,187,tt2527338,movie,Star Wars: The Rise of Skywalker,Star Wars: Episode IX - The Rise of ...,0.0,2019.0,\N,155,"Action,Adventure,Fantasy",False,/jOzrELAzFxtMx2I4uDGHOotdfsS.jpg,"{'id': 10, 'name': 'Star Wars Collec...",0,"[{'id': 28, 'name': 'Action'}, {'id'...",https://www.starwars.com/films/star-...,181812.0,tt2527338,en,Star Wars: The Rise of Skywalker,The next installment in the franchis...,34.725,/db32LaOibwEliAmSL2jjDF6oDdj.jpg,"[{'id': 1, 'logo_path': '/o86Dbpburj...","[{'iso_3166_1': 'JO', 'name': 'Jorda...",2019-12-18,0.0,155.0,"[{'iso_639_1': 'en', 'name': 'Englis...",Post Production,Every generation has a legend,Star Wars: The Rise of Skywalker,False,0.0,0.0,2019.0


In [30]:
df_v2.shape

(299390, 43)

In [31]:
print(f'Total of {df_v2.shape[0] - df.shape[0]} entries added due to duplicate matches after combining with tmdb dataset from Kaggle\n\
[https://www.kaggle.com/mathlasker/tmdballmovies]')

Total of 5920 entries added due to duplicate matches after combining with tmdb dataset from Kaggle
[https://www.kaggle.com/mathlasker/tmdballmovies]


In [32]:
print(f'Total titles missing from tmdb dataset: {df_v2.title.isnull().sum()} \n\
Total titles missing from imdb dataset: {df_v1.primaryTitle.isnull().sum()} \n\
Total titles missing after merging both: {df_v2.title.fillna(df_v1.primaryTitle).isnull().sum()}')

Total titles missing from tmdb dataset: 67637 
Total titles missing from imdb dataset: 55520 
Total titles missing after merging both: 24787


In [33]:
df_v2["title_inferred"] = df_v2["title"].fillna(df_v2["primaryTitle"])


In [34]:
df_v2.title_inferred.nunique(), df_v2["title"].nunique(), df_v2["primaryTitle"].nunique()

(34113, 32909, 33093)

In [35]:
df_v2.loc[[11,24,25,29,74,88,89,97],['user_name','movie_name','primaryTitle','title','imdb_id','tconst','runtimeMinutes',
                                        'runtime','user_review']]

Unnamed: 0,user_name,movie_name,primaryTitle,title,imdb_id,tconst,runtimeMinutes,runtime,user_review
11,/worsethan/,Once Upon a Time… in Hollywood,,,,,,,Blasphemously watched this on a post...
24,/worsethan/,Hotel Mumbai,Hotel Mumbai,,,tt5461944,123.0,,"Got that United 93 ""shit happens"" vibe."
25,/worsethan/,Return of the Living Dead Part II,,,,,,,Inferior to the original in every wa...
29,/worsethan/,68 Kill,68 Kill,,,tt5189894,93.0,,Flagrant anti-thot propaganda.
74,/worsethan/,Molly’s Game,,,,,,,The most annoying thing about Sorkin...
88,/worsethan/,Vice,Vice,Vice,tt6266538,tt6266538,132.0,132.0,Like watching someone try to eat a H...
89,/worsethan/,Zama,Zama,Zama,tt3409848,tt3409848,115.0,115.0,Something in my brain rejects movies...
97,/worsethan/,Triangle,Triangle,Triangle,tt1187064,tt1187064,99.0,99.0,I'm realizing these movies just kind...


In [36]:
# SKIP for NOW
# Lets look at duplicate entries
# duplicates = pd.concat(g for _, g in df_v2.groupby("user_review") if len(g) > 1)
# print(f"The number of duplicates: {duplicates.shape[0]}")
# # Peek into duplicates
# duplicates[duplicates.user_review.str.strip()!=""].head()

Looks like most duplicates are from the same name productions from the same year.

In [37]:
common_movies = list(set(df_v2.tconst.unique()).intersection(set(title_principals.tconst.unique())))

In [38]:
pd.isnull(common_movies).sum()

0

In [39]:
length_list = [len(item) for item in common_movies]

In [40]:
np.unique(length_list)

array([ 9, 10])

In [41]:
name_basics.fillna("no_info", inplace=True)

In [42]:
def get_people(imdb_mov_id):
    if imdb_mov_id not in common_movies:
        return "no_info", "no_info", "no_info", "no_info", "no_info"
    temp = title_principals.loc[title_principals['tconst']==imdb_mov_id]
    people_codes = temp["nconst"].values
    row_logical = name_basics.nconst.isin(people_codes)
    names = " ".join([name.replace(" ","") for name in name_basics.loc[row_logical,"primaryName"]])
    birth_years = " ".join(name_basics.loc[row_logical,"birthYear"])
    death_years = " ".join(name_basics.loc[row_logical,"deathYear"])
    professions = " ".join(name_basics.loc[row_logical,"primaryProfession"])
    known_for_movies = " ".join(name.replace(","," ") for name in name_basics.loc[row_logical,"knownForTitles"])
    return names, birth_years, death_years, professions, known_for_movies

In [44]:
# Using single CPU - SLOW!
def generate_people_feats(df, from_scratch=True, from_=0):
    if from_scratch:
        for col in ["names", "birth_years", "death_years", "professions", "known_for"]:
            df[col] = pd.core.series.Series(dtype='object')
    for i_x, mov_id in enumerate(df['tconst'].unique()[from_:]):
        names, birth_years, death_years, professions, known_for_movies = get_people(mov_id)
        logical_idx = df.tconst==mov_id
        df.loc[logical_idx, 'names'] = names
        df.loc[logical_idx, 'birth_years'] = birth_years
        df.loc[logical_idx, 'death_years'] = death_years
        df.loc[logical_idx, 'professions'] = professions
        df.loc[logical_idx, 'known_for'] = known_for_movies
        print("At: " + str(from_+i_x) + f": {mov_id}")
        gc.collect()

In [None]:
# USED to generate people info features, SLOW, keyborad interrupted to continue
generate_people_feats(df_v2)
# Example:
# generate_people_feats(df_cont,from_scratch=False, from_=16000)

In [None]:
# keyborad interrupted saved and continued from where it left
df_v2.to_csv("data_preprocessed.csv")

In [50]:
df_cont = pd.read_csv("data_preprocessed.csv", index_col=0)
# Example:
# generate_people_feats(df_cont,from_scratch=False, from_=16000)

In [56]:
# Continuing from 16k th movie id
codes = df_cont['tconst'].unique()[16000:]
logical = df_cont['tconst'].isin(codes)

In [167]:
print(f"There were {len(df_cont['tconst'].unique())} movie ids which took about 3/4 of a day with a single GPU \n\
The rest of {len(codes)} movie ids had {logical.sum()} entries and took 2.6 hours with 8 CPU threads using pandarallel")

There were 35960 movie ids which took about 3/4 of a day with a single GPU 
The rest of 19960 movie ids had 34426 entries and took 2.6 hours with 8 CPU threads using pandarallel


In [69]:
t1 = time.time()
df_cont.loc[logical, ['names','birth_years','death_years','professions','known_for']] = df_cont.loc[logical,"tconst"].parallel_apply(lambda x: get_people(x))
t2 = time.time()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4304), Label(value='0 / 4304'))), …

In [166]:
print("Total Time: {:.1f} hours".format(round(t2-t1)/3600))

Total Time: 2.6 hours


In [79]:
df_cont.loc[logical].head()

Unnamed: 0,user_name,movie_name,release_year,user_review,user_rating,review_date,rewatched,review_likes,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_orig,adult,backdrop_path,belongs_to_collection,budget,genres_tmdb,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,title_inferred,names,birth_years,death_years,professions,known_for
68140,/justindecloux/,Night of the Howling Beast,1975,Paul Naschy always looks good as a w...,5,"15 Oct, 2018",0,2,tt0073338,movie,Night of the Howling Beast,La maldición de la bestia,0.0,1975.0,\N,94,"Adventure,Horror",False,/4W9rULtHTBVhKqx4JkkUqaKjsgM.jpg,,0.0,"[{'id': 12, 'name': 'Adventure'}, {'...",,54112.0,tt0073338,en,La maldición de la bestia,"Waldemar, the renowned adventurer, j...",3.462,/z1u43lRSv8EkPNjlhnnTlE6GAch.jpg,"[{'id': 4948, 'logo_path': None, 'na...","[{'iso_3166_1': 'ES', 'name': 'Spain'}]",1975-01-01,0.0,94.0,"[{'iso_639_1': 'es', 'name': 'Españo...",Released,Two bloodthirsty beasts in deadly co...,Night of the Howling Beast,False,5.2,5.0,1975.0,Night of the Howling Beast,(CamEspaña CarmenFábregas MiguelIgle...,(CamEspaña CarmenFábregas MiguelIgle...,(CamEspaña CarmenFábregas MiguelIgle...,(CamEspaña CarmenFábregas MiguelIgle...,(CamEspaña CarmenFábregas MiguelIgle...
68141,/justindecloux/,Ninja Zombie,1992,More funny and competent then I expe...,6,"15 Oct, 2018",0,5,tt8184986,movie,Ninja Zombie,Ninja Zombie,0.0,1992.0,\N,87,Action,False,/bJkrxGHjXtxxIl6k449iLtlNRsM.jpg,,0.0,"[{'id': 27, 'name': 'Horror'}, {'id'...",https://www.facebook.com/ninjazombie...,323371.0,tt8184986,en,Ninja Zombie,Assistant professor Orlan Sands is r...,1.434,/4Owj2GbP1Fk83AF51HijtjlZ5px.jpg,"[{'id': 45765, 'logo_path': None, 'n...","[{'iso_3166_1': 'US', 'name': 'Unite...",1992-02-21,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'Englis...",Released,Guts. Gore. Blood- Babes. All in a d...,Ninja Zombie,False,0.0,0.0,1992.0,Ninja Zombie,(PeterBiagi RonForsythe DonaldRasmus...,(PeterBiagi RonForsythe DonaldRasmus...,(PeterBiagi RonForsythe DonaldRasmus...,(PeterBiagi RonForsythe DonaldRasmus...,(PeterBiagi RonForsythe DonaldRasmus...
68144,/justindecloux/,The Damned,2013,I like that it turns into the seriou...,4,"13 Oct, 2018",0,1,tt2387408,movie,The Damned,Gallows Hill,0.0,2013.0,\N,87,"Horror,Mystery,Thriller",False,/mUjLt8b7dKbLHiaVBpSp7AUJXfn.jpg,,3000000.0,"[{'id': 53, 'name': 'Thriller'}, {'i...",,258086.0,tt2387408,en,The Damned,After suffering the recent loss of h...,7.665,/r7rp9wxy8nkCUwro6A792NB8Hb2.jpg,"[{'id': 13921, 'logo_path': None, 'n...","[{'iso_3166_1': 'US', 'name': 'Unite...",2013-10-17,5758519.0,87.0,"[{'iso_639_1': 'en', 'name': 'Englis...",Released,Every Body Has a Secret.,The Damned,False,5.0,123.0,2013.0,The Damned,(PeterFacinelli RichardD'Ovidio Pete...,(PeterFacinelli RichardD'Ovidio Pete...,(PeterFacinelli RichardD'Ovidio Pete...,(PeterFacinelli RichardD'Ovidio Pete...,(PeterFacinelli RichardD'Ovidio Pete...
68145,/justindecloux/,Deathwatch,2002,It's all right. Looks great. Fine pe...,6,"14 Oct, 2018",0,0,tt0286306,movie,Deathwatch,Deathwatch,0.0,2002.0,\N,94,"Drama,Horror,Thriller",False,/8gHzDs3l0RVBhhgOMrJPXmjcgQ0.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id':...",,12576.0,tt0286306,en,Deathwatch,In the brutal trench fighting of the...,6.164,/58zjT8SeELvGzSMNR9MguJZVkFw.jpg,"[{'id': 45443, 'logo_path': None, 'n...","[{'iso_3166_1': 'DE', 'name': 'Germa...",2002-10-06,2270658.0,94.0,"[{'iso_639_1': 'de', 'name': 'Deutsc...",Released,Deliver them from evil.,Deathwatch,False,5.9,92.0,2002.0,Deathwatch,(JamieBell RúaidhríConroy CurtCress ...,(JamieBell RúaidhríConroy CurtCress ...,(JamieBell RúaidhríConroy CurtCress ...,(JamieBell RúaidhríConroy CurtCress ...,(JamieBell RúaidhríConroy CurtCress ...
68154,/justindecloux/,Mega Scorpions,2003,Lots of scorpions. Lots of murdering...,6,"08 Oct, 2018",0,1,tt0366347,movie,Mega Scorpions,Deadly Stingers,0.0,2003.0,\N,73,"Comedy,Horror,Sci-Fi",False,,,150000.0,"[{'id': 878, 'name': 'Science Fictio...",,109563.0,tt0366347,en,Mega Scorpions,Residents of a half-way house are bo...,1.051,/xV6mQX3Ep3tRZC6BxFFmQgkDaA8.jpg,"[{'id': 15352, 'logo_path': None, 'n...",[],2003-03-28,0.0,73.0,"[{'iso_639_1': 'en', 'name': 'Englis...",Released,We're screwed!,Mega Scorpions,False,3.8,3.0,2003.0,Mega Scorpions,(C.CourtneyJoyner J.R.Bookwalter Mac...,(C.CourtneyJoyner J.R.Bookwalter Mac...,(C.CourtneyJoyner J.R.Bookwalter Mac...,(C.CourtneyJoyner J.R.Bookwalter Mac...,(C.CourtneyJoyner J.R.Bookwalter Mac...


In [72]:
df_v3 = pd.merge(df_cont, title_ratings,
                 how='left', 
                 left_on=['tconst'], 
                 right_on = ['tconst'])


In [None]:
# Correcting the mistake which was assigning all data to all new columns names, birth_years and so on)
df_v3.loc[logical,"names"] = df_v3.loc[logical,"names"].parallel_apply(lambda x: x[0])

In [113]:
df_v3.loc[logical].head()

Unnamed: 0,user_name,movie_name,release_year,user_review,user_rating,review_date,rewatched,review_likes,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_orig,adult,backdrop_path,belongs_to_collection,budget,genres_tmdb,homepage,id,imdb_id,...,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,title_inferred,names,birth_years,death_years,professions,known_for,averageRating,numVotes
68140,/justindecloux/,Night of the Howling Beast,1975,Paul Naschy always looks good as a w...,5,"15 Oct, 2018",0,2,tt0073338,movie,Night of the Howling Beast,La maldición de la bestia,0.0,1975.0,\N,94,"Adventure,Horror",False,/4W9rULtHTBVhKqx4JkkUqaKjsgM.jpg,,0.0,"[{'id': 12, 'name': 'Adventure'}, {'...",,54112.0,tt0073338,...,La maldición de la bestia,"Waldemar, the renowned adventurer, j...",3.462,/z1u43lRSv8EkPNjlhnnTlE6GAch.jpg,"[{'id': 4948, 'logo_path': None, 'na...","[{'iso_3166_1': 'ES', 'name': 'Spain'}]",1975-01-01,0.0,94.0,"[{'iso_639_1': 'es', 'name': 'Españo...",Released,Two bloodthirsty beasts in deadly co...,Night of the Howling Beast,False,5.2,5.0,1975.0,Night of the Howling Beast,CamEspaña CarmenFábregas MiguelIgles...,(CamEspaña CarmenFábregas MiguelIgle...,(CamEspaña CarmenFábregas MiguelIgle...,(CamEspaña CarmenFábregas MiguelIgle...,(CamEspaña CarmenFábregas MiguelIgle...,5.3,538.0
68141,/justindecloux/,Ninja Zombie,1992,More funny and competent then I expe...,6,"15 Oct, 2018",0,5,tt8184986,movie,Ninja Zombie,Ninja Zombie,0.0,1992.0,\N,87,Action,False,/bJkrxGHjXtxxIl6k449iLtlNRsM.jpg,,0.0,"[{'id': 27, 'name': 'Horror'}, {'id'...",https://www.facebook.com/ninjazombie...,323371.0,tt8184986,...,Ninja Zombie,Assistant professor Orlan Sands is r...,1.434,/4Owj2GbP1Fk83AF51HijtjlZ5px.jpg,"[{'id': 45765, 'logo_path': None, 'n...","[{'iso_3166_1': 'US', 'name': 'Unite...",1992-02-21,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'Englis...",Released,Guts. Gore. Blood- Babes. All in a d...,Ninja Zombie,False,0.0,0.0,1992.0,Ninja Zombie,PeterBiagi RonForsythe DonaldRasmuss...,(PeterBiagi RonForsythe DonaldRasmus...,(PeterBiagi RonForsythe DonaldRasmus...,(PeterBiagi RonForsythe DonaldRasmus...,(PeterBiagi RonForsythe DonaldRasmus...,5.8,38.0
68144,/justindecloux/,The Damned,2013,I like that it turns into the seriou...,4,"13 Oct, 2018",0,1,tt2387408,movie,The Damned,Gallows Hill,0.0,2013.0,\N,87,"Horror,Mystery,Thriller",False,/mUjLt8b7dKbLHiaVBpSp7AUJXfn.jpg,,3000000.0,"[{'id': 53, 'name': 'Thriller'}, {'i...",,258086.0,tt2387408,...,The Damned,After suffering the recent loss of h...,7.665,/r7rp9wxy8nkCUwro6A792NB8Hb2.jpg,"[{'id': 13921, 'logo_path': None, 'n...","[{'iso_3166_1': 'US', 'name': 'Unite...",2013-10-17,5758519.0,87.0,"[{'iso_639_1': 'en', 'name': 'Englis...",Released,Every Body Has a Secret.,The Damned,False,5.0,123.0,2013.0,The Damned,PeterFacinelli RichardD'Ovidio Peter...,(PeterFacinelli RichardD'Ovidio Pete...,(PeterFacinelli RichardD'Ovidio Pete...,(PeterFacinelli RichardD'Ovidio Pete...,(PeterFacinelli RichardD'Ovidio Pete...,5.2,5698.0
68145,/justindecloux/,Deathwatch,2002,It's all right. Looks great. Fine pe...,6,"14 Oct, 2018",0,0,tt0286306,movie,Deathwatch,Deathwatch,0.0,2002.0,\N,94,"Drama,Horror,Thriller",False,/8gHzDs3l0RVBhhgOMrJPXmjcgQ0.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id':...",,12576.0,tt0286306,...,Deathwatch,In the brutal trench fighting of the...,6.164,/58zjT8SeELvGzSMNR9MguJZVkFw.jpg,"[{'id': 45443, 'logo_path': None, 'n...","[{'iso_3166_1': 'DE', 'name': 'Germa...",2002-10-06,2270658.0,94.0,"[{'iso_639_1': 'de', 'name': 'Deutsc...",Released,Deliver them from evil.,Deathwatch,False,5.9,92.0,2002.0,Deathwatch,JamieBell RúaidhríConroy CurtCress M...,(JamieBell RúaidhríConroy CurtCress ...,(JamieBell RúaidhríConroy CurtCress ...,(JamieBell RúaidhríConroy CurtCress ...,(JamieBell RúaidhríConroy CurtCress ...,6.0,10187.0
68154,/justindecloux/,Mega Scorpions,2003,Lots of scorpions. Lots of murdering...,6,"08 Oct, 2018",0,1,tt0366347,movie,Mega Scorpions,Deadly Stingers,0.0,2003.0,\N,73,"Comedy,Horror,Sci-Fi",False,,,150000.0,"[{'id': 878, 'name': 'Science Fictio...",,109563.0,tt0366347,...,Mega Scorpions,Residents of a half-way house are bo...,1.051,/xV6mQX3Ep3tRZC6BxFFmQgkDaA8.jpg,"[{'id': 15352, 'logo_path': None, 'n...",[],2003-03-28,0.0,73.0,"[{'iso_639_1': 'en', 'name': 'Englis...",Released,We're screwed!,Mega Scorpions,False,3.8,3.0,2003.0,Mega Scorpions,C.CourtneyJoyner J.R.Bookwalter MacA...,(C.CourtneyJoyner J.R.Bookwalter Mac...,(C.CourtneyJoyner J.R.Bookwalter Mac...,(C.CourtneyJoyner J.R.Bookwalter Mac...,(C.CourtneyJoyner J.R.Bookwalter Mac...,4.4,157.0


In [114]:
# Doing the same for the rest of columns in a loop
for i, col in enumerate(['birth_years','death_years','professions','known_for']):
    df_v3.loc[logical,col] = df_v3.loc[logical,col].parallel_apply(lambda x: x[i+1])

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2152), Label(value='0 / 2152'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2152), Label(value='0 / 2152'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2152), Label(value='0 / 2152'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2152), Label(value='0 / 2152'))), …

In [136]:
# Save as "data_raw.csv" file
df_v3.to_csv("data_raw.csv")

## To Do ...

#1 ELIMINATE DUPLICATES BY A SELECTION FUNCTION BY POPULARITY (MORE NON-NULL DATA, ETC.)
#2 [DONE] GENERATE THE PEOPLE FEATURES FOR THE WHOLE DATASET USING MULTI-THREADING...
#3 COMBINE SAME ENTITIES INTO JUST ONE COLUMNS (REMOVE JUNK)


In [157]:
print(f"Missing people data: {df_v3.names.isnull().sum()}")

Missing people data: 55531


In [158]:
df_v3.shape

(299390, 51)

In [160]:
df_v3["user_review"].drop_duplicates().shape

(291298,)

In [161]:
299390 - 291298

8092