In [46]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
from joblib import Parallel, delayed
from math import floor, ceil
import os
import pickle

In [2]:
# Data locations, relative to the notebook. Built with os.path.join so the
# separator matches the host OS; a literal '\\' is only a separator on Windows.
DATA_DIR = os.path.join('..', 'data', 'raw')
INTERIM_DIR = os.path.join('..', 'data', 'interim')

### Initial processing of files

In [3]:
# Read the raw title tables from DATA_DIR, the same location every later cell uses.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the warning tail in this cell's output looks like a mixed-dtype
# DtypeWarning from title.basics -- confirm it is gone after this change.
df = pd.read_csv(os.path.join(DATA_DIR, 'title.basics.tsv'), sep='\t', low_memory=False)
title_ratings = pd.read_csv(os.path.join(DATA_DIR, 'title.ratings.tsv'), sep='\t')

print("Number of titles : {}".format(len(df)))
print("Number of ratings : {}".format(len(title_ratings)))

  interactivity=interactivity, compiler=compiler, result=result)


Number of titles : 7803001
Number of ratings : 1139448


In [4]:
# Listing all possible titleTypes

df['titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'episode'],
      dtype=object)

#### We use only the titles that have a rating associated with them

In [5]:
# Inner join on the title id: titles without a matching ratings row are dropped

titles = pd.merge(df, title_ratings, on='tconst')
titles

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.6,1696
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",6.0,210
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",6.5,1448
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",6.1,122
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.1,2245
...,...,...,...,...,...,...,...,...,...,...,...
1139443,tt9916580,tvEpisode,Horrid Henry Horrid Boy?,Horrid Henry Horrid Boy?,0,2012,\N,10,"Animation,Comedy,Family",7.7,6
1139444,tt9916690,tvEpisode,Horrid Henry Delivers the Milk,Horrid Henry Delivers the Milk,0,2012,\N,10,"Animation,Comedy,Family",6.6,5
1139445,tt9916720,short,The Nun 2,The Nun 2,0,2019,\N,10,"Comedy,Horror,Mystery",6.4,77
1139446,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,\N,43,"Family,Reality-TV",6.9,16


## Strategy

- Framing the problem as predicting movie ratings for 'movie' and 'tvMovie' 
- Convert the 'startYear' field to numeric values and drop the 'endYear' field as it is irrelevant for movies
- Reducing the scope of the problem by filtering movies to the ones that were released after 2010 and before 2020

#### Treating missing values for startYear:

- Currently, we are setting the startYear as 0 for missing values (*Naive approach*). However, a better approach would be to impute the year of the movie from other tables.  

    **For example**, the title.crew and title.principals files tell us which actors/directors/crew worked on a movie. We could collect those crew ids and look them up in the name.basics file. Then we could compute the period in which all of these people were alive (between [max of their birthYear + some constant] and [min of their deathYear]) and choose a year from that period.  

In [6]:
def process_null_values(df):
    '''
    Dataset has non standard Null value ('\\N')
    Change it to np.nan so that pandas recognizes it
    
    The substitution is a single vectorised DataFrame.replace call rather
    than a Python-level function applied to every cell. Only cells that are
    exactly equal to the marker string are changed; the input frame is left
    untouched.
    
    params:
        - df (pd.DataFrame): Dataframe to process
    
    returns:
        - pd.DataFrame: new Dataframe with np.nan in place of Null
    '''
    # np.nan (lower case) is the spelling that exists in every NumPy release;
    # the np.NaN alias was removed in NumPy 2.0.
    return df.replace('\\N', np.nan)

In [7]:
# Taking care of NaN values
titles = process_null_values(titles)

# endYear is irrelevant for movies. drop(..., errors='ignore') keeps this cell
# re-runnable: `del titles['endYear']` raises KeyError on a second execution.
titles = titles.drop(columns='endYear', errors='ignore')

# Filtering titletypes. The explicit .copy() gives an independent frame, so the
# column assignment below no longer writes into a slice of `titles`
# (the source of the SettingWithCopyWarning this cell used to print).
movies = titles[titles['titleType'].isin(['movie', 'tvMovie'])].copy()

# Processing null values in startYear (Naive method as described above):
# parse to numbers, use 0 for missing years, store as a proper integer column.
movies['startYear'] = pd.to_numeric(movies['startYear']).fillna(0).astype('int64')

# Reducing scope of the problem (2011..2019 inclusive)
movies = movies[(movies['startYear']<2020)&(movies['startYear']>2010)]

# Resetting row numbers; returns a new frame instead of mutating in place
movies = movies.reset_index(drop=True)
movies

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0011216,movie,Spanish Fiesta,La fête espagnole,0,2019,67,Drama,6.3,23
1,tt0016906,movie,Frivolinas,Frivolinas,0,2014,80,"Comedy,Musical",5.6,15
2,tt0019996,movie,Hongxia,Hongxia,0,2011,94,,6.3,52
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama,6.8,6446
4,tt0100275,movie,The Wandering Soap Opera,La Telenovela Errante,0,2017,80,"Comedy,Drama,Fantasy",6.6,266
...,...,...,...,...,...,...,...,...,...,...
90559,tt9916160,movie,Drømmeland,Drømmeland,0,2019,72,Documentary,6.5,42
90560,tt9916192,tvMovie,Danielle Darrieux: Il est poli d'être gai!,Danielle Darrieux: Il est poli d'être gai!,0,2019,53,Biography,7.6,10
90561,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019,,"Adventure,History,War",3.8,12
90562,tt9916460,tvMovie,Pink Taxi,Pink Taxi,0,2019,,Comedy,9.3,15


#### Overview of columns having null values

In [8]:
movies.isnull().sum()

tconst               0
titleType            0
primaryTitle         0
originalTitle        0
isAdult              0
startYear            0
runtimeMinutes    8563
genres             931
averageRating        0
numVotes             0
dtype: int64

#### Treating missing values for genres:

- For treating missing values of genres, the following approaches come to mind:  
  
  
  - **Calculating genres from crew**: (Implementation later in the notebook)
    1. Using the crew and principals file, we could get the people associated (nconsts) with the title
    2. Using the 'knownForTitles' field in the name.basics file, we could get a list of all titles (tconsts) these people are known for.
    3. Using the title.basics file, we now can get the genres associated with the above titles and generate a dictionary of counts of genres that appear in these titles. 
    4. We then take the genres with the top 3 counts (since genres can have at most 3 strings) in that dictionary and associate it with the title.
    
    **NOTE:** We use this approach only when the genres field is empty. We do not apply it to records that list fewer than 3 genres, because in my opinion it would not be logical to pad them with estimated ones. 
  
      
  - **Using Word embeddings with text of reviews**:  
    1. If we had the text reviews of the movies available, we could use a word embedding to associate the reviews with the available genres. In particular, we could take the words occurring in the reviews and calculate their similarity (cosine distance) to each of the genres found in our dataset. This gives a dictionary mapping each genre to its similarity with the reviews. We could again take the genres with the top 3 values in this dictionary and use those genres. 

In [52]:
# Build the set of distinct genre labels: every string entry of the genres
# column is split on commas; non-string entries (NaN) are skipped

genres = set()
for genre_field in titles['genres']:
    if isinstance(genre_field, str):
        genres.update(genre_field.split(','))
genres

{'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western'}

In [54]:
# Sort before writing: iteration order of a set of strings can differ between
# interpreter runs, which made the row order of genres.csv non-reproducible.
pd.DataFrame({'genres': sorted(genres)}).to_csv(os.path.join(INTERIM_DIR, 'genres.csv'))

#### Replacing NaN values in runtimeMinutes by mean of the column

In [11]:
# Parse the column to numbers once; missing runtimes stay NaN.
runtime_minutes = pd.to_numeric(movies['runtimeMinutes'])

# Mean of the known runtimes, truncated to whole minutes (NaN is skipped by mean()).
mean_runtime = int(runtime_minutes.mean())

# fillna replaces the per-row lambda, and the integer cast leaves one uniform
# dtype instead of the original values mixed with an int fill value.
movies['runtimeMinutes'] = runtime_minutes.fillna(mean_runtime).astype('int64')

### Processing the names.basics, title.crew, title.principals files for handling the genre missing values

In [12]:
# Load the people table and turn its '\N' markers into NaN in one expression
names = process_null_values(
    pd.read_csv(os.path.join(DATA_DIR, 'name.basics.tsv'), sep='\t')
)
names

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0031983,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,1934,,"actress,soundtrack,music_department","tt0057345,tt0054452,tt0049189,tt0056404"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0077975,tt0078723,tt0072562,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0060827,tt0050976,tt0069467"
...,...,...,...,...,...,...
10852297,nm9993714,Romeo del Rosario,,,"animation_department,art_department",tt2455546
10852298,nm9993716,Essias Loberg,,,,
10852299,nm9993717,Harikrishnan Rajan,,,cinematographer,tt8736744
10852300,nm9993718,Aayush Nair,,,cinematographer,


In [13]:
names.isnull().sum()

nconst                      0
primaryName                 0
birthYear            10329355
deathYear            10664496
primaryProfession     2294306
knownForTitles        1999277
dtype: int64

In [14]:
# Getting the distribution of the counts of knownforTitles across the dataset

names['knownForTitles'].apply(lambda x: len(x.split(',')) if (not pd.isna(x)) else 0).describe()

count    1.085230e+07
mean     1.562484e+00
std      1.341015e+00
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      6.000000e+00
Name: knownForTitles, dtype: float64

In [15]:
# Load directors/writers per title and turn the '\N' markers into NaN
title_crew = process_null_values(
    pd.read_csv(os.path.join(DATA_DIR, 'title.crew.tsv'), sep='\t')
)
title_crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,
...,...,...,...
7802996,tt9916848,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
7802997,tt9916850,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
7802998,tt9916852,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
7802999,tt9916856,nm10538645,nm6951431


In [16]:
# Load the principal cast/crew rows and turn the '\N' markers into NaN
title_principals = process_null_values(
    pd.read_csv(os.path.join(DATA_DIR, 'title.principals.tsv'), sep='\t')
)
title_principals

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0374658,cinematographer,director of photography,
3,tt0000002,1,nm0721526,director,,
4,tt0000002,2,nm1335271,composer,,
...,...,...,...,...,...,...
44119639,tt9916880,5,nm0996406,director,principal director,
44119640,tt9916880,6,nm1482639,writer,,
44119641,tt9916880,7,nm2586970,writer,books,
44119642,tt9916880,8,nm1594058,producer,producer,


#### We do not need all the records from the title.crew and title.principals files, we just need the ones associated with the movies we are considering

In [17]:
# Restrict the crew and principals tables to the movies kept above

# Title ids in scope, as a set for fast membership tests
considered_titles = set(movies['tconst'].values)

principals_in_scope = title_principals['tconst'].isin(considered_titles)
title_principals = title_principals[principals_in_scope]

crew_in_scope = title_crew['tconst'].isin(considered_titles)
title_crew = title_crew[crew_in_scope]

print("Length of processed title_crew: {}".format(len(title_crew)))
print("Length of processed title_principals: {}".format(len(title_principals)))

Length of processed title_crew: 90564
Length of processed title_principals: 770180


In [18]:
# Prepare title_principals so that only the first 5 principals of each movie are kept

# Order rows by title and then by billing position, so the per-title head(5)
# picks the lowest 'ordering' values
title_principals = title_principals.sort_values(['tconst', 'ordering'])

# Number of rows that survive the top-5 cut
title_principals.groupby("tconst").head(5).shape[0]

434627

In [19]:
# Keep the first 5 rows of every title group (rows are already sorted by 'ordering').
# These are principals of any category, not only actors.
title_principals = title_principals.groupby(by="tconst").head(n=5)
title_principals

Unnamed: 0,tconst,ordering,nconst,category,job,characters
75890,tt0011216,1,nm0290157,actress,,"[""Soledad""]"
75891,tt0011216,2,nm0300388,actor,,
75892,tt0011216,3,nm0869559,actor,,"[""Miguélan""]"
75893,tt0011216,4,nm0595321,actor,,"[""Réal""]"
75894,tt0011216,5,nm0241273,director,,
...,...,...,...,...,...,...
44118753,tt9916538,1,nm8678236,actress,,"[""Sinta""]"
44118754,tt9916538,2,nm1417182,actress,,"[""Widhi""]"
44118755,tt9916538,3,nm10041459,actor,,"[""Vikash""]"
44118756,tt9916538,4,nm1266058,actress,,"[""Dewi""]"


In [20]:
# Write the filtered crew and principals tables to the interim folder (no index column)

for frame, file_name in ((title_crew, 'title.crew.csv'),
                         (title_principals, 'title.principals.csv')):
    frame.to_csv(os.path.join(INTERIM_DIR, file_name), index=False)

#### Filtering the names.basics dataframe for including only the names associated with considered movies for faster processing

**NOTE**: This filtering only saves time while prototyping the later stages of the pipeline, where I plan to calculate features for each person. In production, I would retain the entire name list and calculate the features on it as well, in case a movie has a crew member who hasn't worked on a movie before (e.g. an actor who has only done TV series so far).

**Observation:** IMDB lists the top billed actors ('star cast' or protagonist) first and the list decreases in order of importance

**Design decision:** From the cast of the movie, for genres computation, we only consider the **top 5 names** ('star cast' or actors playing the most important characters) to reduce our calculations significantly. We already trimmed _title_principals_ to the top 5 entries per title for this purpose.

In [21]:
considered_name_ids = set()

def split_nameIDs(nconst_str):
    '''
    Break a comma separated string into its non-empty pieces
    
    params:
        - nconst_str (str): String of comma separated nameIDs
    
    returns:
        - (list): List of nameIDs, in their original order, without
          the empty strings produced by leading/trailing/double commas
    '''
    # filter(None, ...) drops falsy items; for strings that is only ''
    return list(filter(None, nconst_str.split(',')))

# Adding all directors and writers (both columns hold comma separated nameIDs)
for crew_column in ('directors', 'writers'):
    for id_string in title_crew[crew_column].dropna():
        considered_name_ids.update(split_nameIDs(id_string))

# Adding principal crew members (one nameID per row)
principal_ids = title_principals['nconst'].dropna().values
considered_name_ids.update(principal_ids)

print(len(considered_name_ids))

301313


In [22]:
# Keep only the people attached to the considered movies and renumber the rows
names = names[names['nconst'].isin(considered_name_ids)].reset_index(drop=True)
names

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0031983,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,1934,,"actress,soundtrack,music_department","tt0057345,tt0054452,tt0049189,tt0056404"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0077975,tt0078723,tt0072562,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0060827,tt0050976,tt0069467"
...,...,...,...,...,...,...
301300,nm9993616,Ryan Mac Lennan,,,actor,tt4844148
301301,nm9993650,Marcin Balcerak,,,actor,tt8739208
301302,nm9993690,David Jewell,,,,tt7888884
301303,nm9993691,Ursula Gehrmann,,,,tt7888884


In [23]:
names[names['primaryName']=='Bradley Cooper']

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
13054,nm0177896,Bradley Cooper,1975,,"actor,producer,soundtrack","tt1045658,tt1517451,tt1800241,tt2179136"


In [24]:
# Persist the filtered names table; get_genres reads this file back later.
# index=False is not passed, so the row index is written as an extra unnamed first column.
names.to_csv(os.path.join(INTERIM_DIR, 'names.basics.csv'))

### Pipeline for processing NaN values in genres

In [25]:
genre_null_index = movies[movies['genres'].isna()].index

In [26]:
# genre_null_index holds index *labels*, so select with .loc. The previous
# .iloc lookup was only correct while labels and positions coincide, i.e.
# right after reset_index.
movies.loc[genre_null_index]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes
2,tt0019996,movie,Hongxia,Hongxia,0,2011,94,,6.3,52
10,tt0137818,movie,Housesitter: The Night They Saved Siegfried's ...,Housesitter: The Night They Saved Siegfried's ...,0,2018,95,,4.1,21
18,tt0195933,movie,Mysteries,Mysteries,0,2019,92,,7.0,6
26,tt0282252,movie,Voulez-vous coucher avec God?,Voulez-vous coucher avec God?,0,2011,69,,5.6,13
33,tt0306058,movie,Second Coming,Second Coming,0,2012,95,,5.7,21
...,...,...,...,...,...,...,...,...,...,...
90450,tt9885292,movie,Mi querida España,Mi querida España,0,2015,92,,7.5,11
90494,tt9897894,movie,Footloose in Ireland: Dingle Way & Dublin,Footloose in Ireland: Dingle Way & Dublin,0,2018,111,,5.5,6
90510,tt9902734,movie,Josh Groban Bridges Live from Madison Square G...,Josh Groban Bridges Live from Madison Square G...,0,2019,92,,7.2,13
90525,tt9905932,movie,Footloose in London: All the Best Sights of ou...,Footloose in London: All the Best Sights of ou...,0,2016,106,,7.0,5


In [28]:
df = process_null_values(df)

In [29]:
def get_genres(tconst, debug=False, title_crew=None, title_principals=None,
               names=None, title_basics=None):
    '''
    Given a tconst that has no genre associated with it, 
    return a string with up to 3 comma-separated genres, estimated
    from the genres of the titles its cast and crew are known for.
    
    Steps:
    1. Get the list of people involved in the movie
       (first 5 principals listed for it, plus its writers and directors)
    2. Get the titles that they are known for from the names data
    3. Get the list of genres that these titles represent from titles data
    4. Pick the 3 genres that are most common in this list and return
    
    params:
        - tconst (str): alphanumeric titleID
        - debug (bool): debug flag to enable print statements
        - title_crew (pd.DataFrame, optional): columns tconst, directors,
          writers. Read from INTERIM_DIR/title.crew.csv when omitted.
        - title_principals (pd.DataFrame, optional): columns tconst, nconst.
          Read from INTERIM_DIR/title.principals.csv when omitted.
        - names (pd.DataFrame, optional): columns nconst, knownForTitles.
          Read from INTERIM_DIR/names.basics.csv when omitted.
        - title_basics (pd.DataFrame, optional): columns tconst, genres.
          Defaults to the notebook-level `df` (title.basics) when omitted.
        
    returns:
        - (str): up to 3 genres, comma-separated, most common first.
          'Drama' when no genre can be derived, matching get_genres_core.
    '''
    
    # Passing the frames in avoids re-reading three CSV files on every call;
    # calls with only (tconst, debug) behave as before and load them here.
    if title_crew is None:
        title_crew = pd.read_csv(os.path.join(INTERIM_DIR, 'title.crew.csv'), usecols=['tconst', 'directors', 'writers'])
    if title_principals is None:
        title_principals = pd.read_csv(os.path.join(INTERIM_DIR, 'title.principals.csv'), usecols=['tconst', 'nconst'])
    if names is None:
        names = pd.read_csv(os.path.join(INTERIM_DIR, 'names.basics.csv'), usecols=['nconst', 'knownForTitles'])
    if title_basics is None:
        title_basics = df
    
    def split_csv(value):
        # Comma separated string -> list of its non-empty pieces
        return [part for part in value.split(',') if part != '']
    
    # Set for storing relevant people
    people_involved = set()
    
    # Adding the first 5 principals listed for the movie (any category, not only actors)
    people_involved.update(title_principals[title_principals['tconst']==tconst]['nconst'].values[:5])
    
    # Adding writers and directors
    crew = title_crew[title_crew['tconst']==tconst]
    
    # A title may have no row in the crew table; .iloc[0] on the empty
    # selection would raise IndexError, so only read it when present.
    if not crew.empty:
        for role in ('writers', 'directors'):
            credited = crew[role].iloc[0]
            if not pd.isna(credited):
                people_involved.update(split_csv(credited))
    
    # Choose the people involved in the names list
    people_involved_df = names[names['nconst'].isin(people_involved)]
    
    if debug:
        print(people_involved_df)
    
    # We just need the titles that they are collectively known for
    people_titles_set = set()
    for known_for in people_involved_df['knownForTitles'].dropna():
        people_titles_set.update(split_csv(known_for))
    
    if debug:
        print(people_titles_set)
    
    # Filter these records from title.basics to get all the genres associated with them
    matched_genres = title_basics[title_basics['tconst'].isin(people_titles_set)]['genres'].dropna()
    genres_list = []
    for genre_field in matched_genres:
        genres_list.extend(split_csv(genre_field))
    genres = Counter(genres_list)
    
    if debug:
        print(genres)
        print(genres.most_common(3))
    
    # Pick 3 most common genres 
    genres = [genre for (genre, count) in genres.most_common(3)]
    
    # If there are no genres associated with the crew assign 'Drama'
    if len(genres)==0:
        genres = ['Drama']
    
    return ','.join(genres)

#### Trying out the get_genres() function

In [30]:
get_genres('tt0019996', debug=True)

          nconst                             knownForTitles
17512  nm0266662    tt0057713,tt5348132,tt0355037,tt0206738
47791  nm0920761   tt0205557,tt11439364,tt0205573,tt0349368
48898  nm0946300  tt0369552,tt10452668,tt13770570,tt0348490
49389  nm0955817    tt0192690,tt1851091,tt0453321,tt0198850
{'tt0057713', 'tt0205573', 'tt10452668', 'tt0348490', 'tt0453321', 'tt0349368', 'tt0192690', 'tt0205557', 'tt5348132', 'tt11439364', 'tt13770570', 'tt0198850', 'tt1851091', 'tt0206738', 'tt0369552', 'tt0355037'}
Counter({'Drama': 12, 'Action': 2, 'Romance': 2, 'Thriller': 1, 'History': 1, 'Mystery': 1, 'War': 1, 'Musical': 1})
[('Drama', 12), ('Action', 2), ('Romance', 2)]


'Drama,Action,Romance'

#### Parallel implementation using joblib

In [31]:
import itertools 
def get_genres_parallel(tconst_list, verbose=30 ,debug=False):
    '''
    Run get_genres_core in parallel over slices of tconst_list
    
    The list is cut into consecutive chunks (at most one per logical CPU
    core), each chunk is handed to get_genres_core in a joblib task, and
    the per-chunk results are concatenated in chunk order.
    
    params:
        - tconst_list (list): List of titleID tuples (index, tconst)
        - verbose      (int): verbosity of Parallel execution
        - debug       (bool): when True, print the size of every chunk
    
    returns:
        - result (list of tuples) where each tuple is (index, str)
          Index and string representing 3 genres estimated for the movie
          comma-separated. Empty list when tconst_list is empty.
    '''
    # Nothing to estimate: skip the CSV reads and the worker start-up
    if len(tconst_list) == 0:
        return []
    
    # Get the number of logical CPU cores; os.cpu_count() may return None
    # when the count cannot be determined, so fall back to a single core.
    num_cores = os.cpu_count() or 1
    
    # Determine load for each core
    items_per_core = ceil(len(tconst_list)/num_cores)
    
    # Distribute load on each core. Stepping through the list by chunk size
    # yields the same chunks as before, minus the empty trailing ones that
    # appeared when there were fewer items than cores.
    list_per_core = [tconst_list[start: start+items_per_core]
                     for start in range(0, len(tconst_list), items_per_core)]
    
    if debug:
        print([len(list_) for list_ in list_per_core])
    
    # Read relevant data from csv files
    title_crew = pd.read_csv(os.path.join(INTERIM_DIR, 'title.crew.csv'), usecols=['tconst', 'directors', 'writers'])
    title_principals = pd.read_csv(os.path.join(INTERIM_DIR, 'title.principals.csv'), usecols=['tconst', 'nconst'])
    names = pd.read_csv(os.path.join(INTERIM_DIR, 'names.basics.csv'), usecols=['nconst', 'knownForTitles'])
    
    # Parallel execution. The frames are passed as-is: get_genres_core only
    # reads them, so the former per-task .copy() calls merely duplicated
    # the data in this process before it was sent to the workers.
    genres_mapping = Parallel(n_jobs=-1, verbose=verbose, max_nbytes=None)(
        delayed(get_genres_core)(tc_list, title_crew, title_principals, names)
        for tc_list in list_per_core)
    
    # Collect and aggregate results from all parallel tasks
    result = []
    for list_ in genres_mapping:
        result.extend(list_)
    
    return result 

In [32]:
def get_genres_core(tconst_list, title_crew, title_principals, names, debug=False, title_basics=None):
    '''
    For every title in tconst_list (titles with no genre of their own),
    estimate a string of up to 3 comma-separated genres from the
    genres of the titles its cast and crew are known for.
    
    Steps: (for each titleID)
    1. Get the list of people involved in the movie
       (first 5 principals listed for it, plus its writers and directors)
    2. Get the titles that they are known for from the names data
    3. Get the list of genres that these titles represent from titles data
    4. Pick the 3 genres that are most common in this list and return
    
    params:
        - tconst_list (list of tuples): list of (index, titleID)
        - title_crew       (DataFrame): dataframe containing crew for title
        - title_principals (DataFrame): dataframe containing cast for title
        - names            (DataFrame): dataframe containing info for names
        - debug                 (bool): debug flag for print statements
        - title_basics     (DataFrame): optional dataframe with columns
          tconst and genres; defaults to the notebook-level `df`
        
    returns:
        - output      (list of tuples): list of tuples (index, genres string),
          one per input tuple and in the same order. The genres string is
          'Drama' when nothing can be derived for a title.
    '''
    
    if title_basics is None:
        title_basics = df
    
    # Loop-invariant: only these two columns are used, and rows without
    # genres never contribute. Row order is kept, so ties between equally
    # common genres are resolved exactly as before.
    genre_source = title_basics[['tconst', 'genres']].dropna(subset=['genres'])
    
    output = []
    
    for (ind, tconst) in tconst_list:
        
        people_involved = set()
    
        # Adding the first 5 principals listed for the movie (any category, not only actors)
        people_involved.update(title_principals[title_principals['tconst']==tconst]['nconst'].values[:5])

        # Adding writers and directors
        crew = title_crew[title_crew['tconst']==tconst]

        # A title may have no row in the crew table; .iloc[0] on the empty
        # selection would raise IndexError and abort the whole chunk.
        if not crew.empty:
            for role in ('writers', 'directors'):
                credited = crew[role].iloc[0]
                if not pd.isna(credited):
                    people_involved.update(part for part in credited.split(',') if part != '')

        # Choose the people involved in the names list
        people_involved_df = names[names['nconst'].isin(people_involved)]

        if debug:
            print(people_involved_df)

        # We just need the titles that they are collectively known for
        people_titles_set = set()
        for known_for in people_involved_df['knownForTitles'].dropna():
            people_titles_set.update(part for part in known_for.split(',') if part != '')

        if debug:
            print(people_titles_set)

        # Filter these records from title.basics to get all the genres associated with them
        genres_list = []
        for genre_field in genre_source[genre_source['tconst'].isin(people_titles_set)]['genres']:
            genres_list.extend(part for part in genre_field.split(',') if part != '')
        genres = Counter(genres_list)

        if debug:
            print(genres)
            print(genres.most_common(3))

        genres = [genre for (genre, count) in genres.most_common(3)]
        
        # If there are no genres associated with the crew assign 'Drama'
        if len(genres)==0:
            genres = ['Drama']
        
        output.append((ind, ','.join(genres)))
    
    return output

#### Executing code

In [33]:
# Row labels of the movies whose genres field is still empty
missing_genres_mask = movies['genres'].isna()
no_gen_ind = movies.index[missing_genres_mask].tolist()

# Pair every row label with the titleID found on that row
missing_tconsts = movies.loc[no_gen_ind, 'tconst']
tconst_list = list(zip(no_gen_ind, missing_tconsts))

genres_mapping = get_genres_parallel(tconst_list, debug=True)

[78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 73]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed:  1.3min remaining:  6.6min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:  1.6min remaining:  4.9min
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:  1.8min remaining:  3.7min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:  2.1min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:  2.4min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  2.4min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed:  2.6min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:  2.8min remaining:   56.2s
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  3.0min remaining:   36.2s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  3.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapse

In [34]:
# Transpose the list of (row label, genres string) pairs into two tuples
unzipped = list(zip(*genres_mapping))
index, genres = unzipped

print(list(genres[:10]))

['Drama,Action,Romance', 'Drama,Comedy,Adventure', 'Drama', 'Comedy,Family,Animation', 'Drama,Action,Comedy', 'Drama,Biography,Thriller', 'Action,Drama,Crime', 'Drama,Comedy,Short', 'Drama,Comedy,Adventure', 'Drama']


In [36]:
# Set the estimated genres in the dataframe and check the values.
# `index` holds row labels, so both the write and the read-back use .loc;
# the previous .iloc read-back was only correct while labels equal positions.
# The tuples are converted to lists so the row key is a plain list of labels.
# NOTE(review): how pandas interprets a tuple row key in .loc differs from a
# list -- confirm against the pandas version in use.
row_labels = list(index)
movies.loc[row_labels, "genres"] = list(genres)
movies.loc[row_labels, "genres"]

2            Drama,Action,Romance
10         Drama,Comedy,Adventure
18                          Drama
26        Comedy,Family,Animation
33            Drama,Action,Comedy
                   ...           
90450    Documentary,Drama,Action
90494                       Drama
90510      Drama,Comedy,Adventure
90525                       Drama
90531                       Drama
Name: genres, Length: 931, dtype: object

In [37]:
# Check NaN values in the movies df
movies.isnull().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
runtimeMinutes    0
genres            0
averageRating     0
numVotes          0
dtype: int64

In [38]:
# Persist the cleaned movies table to the interim folder.
# index=False is not passed, so the row index is written as an extra unnamed first column.
movies.to_csv(os.path.join(INTERIM_DIR, 'movies.csv'))

#### We need all the ratings available because the plan is to calculate features for each person using the titles they are known for

In [39]:
# Load the complete ratings table (all title types) and turn '\N' markers into NaN
title_ratings = process_null_values(
    pd.read_csv(os.path.join(DATA_DIR, 'title.ratings.tsv'), sep='\t')
)
title_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1696
1,tt0000002,6.0,210
2,tt0000003,6.5,1448
3,tt0000004,6.1,122
4,tt0000005,6.1,2245
...,...,...,...
1139443,tt9916580,7.7,6
1139444,tt9916690,6.6,5
1139445,tt9916720,6.4,77
1139446,tt9916766,6.9,16


In [40]:
title_ratings.isnull().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [41]:
# Persist the full ratings table to the interim folder.
# index=False is not passed, so the row index is written as an extra unnamed first column.
title_ratings.to_csv(os.path.join(INTERIM_DIR, 'title_ratings.csv'))

In [42]:
# Load the alternative-titles table and turn '\N' markers into NaN
title_akas = process_null_values(
    pd.read_csv(os.path.join(DATA_DIR, 'title.akas.tsv'), sep='\t')
)
title_akas

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0
...,...,...,...,...,...,...,...,...
25897377,tt9916852,4,エピソード #3.20,JP,ja,,,0
25897378,tt9916852,5,Episódio #3.20,PT,pt,,,0
25897379,tt9916852,6,Episodio #3.20,IT,it,,,0
25897380,tt9916852,7,एपिसोड #3.20,IN,hi,,,0


In [43]:
title_akas.isnull().sum()

titleId                   0
ordering                  0
title                     3
region               617749
language            4377488
types              23865156
attributes         25678193
isOriginalTitle        2189
dtype: int64

In [44]:
# Keep only the rows of the considered titles, and only the two columns needed
akas_in_scope = title_akas['titleId'].isin(considered_titles)
title_akas = title_akas.loc[akas_in_scope, ['titleId', 'ordering']]
title_akas

Unnamed: 0,titleId,ordering
28907,tt0011216,1
28908,tt0011216,2
28909,tt0011216,3
28910,tt0011216,4
28911,tt0011216,5
...,...,...
25896725,tt9916428,2
25896726,tt9916428,3
25896770,tt9916460,1
25896771,tt9916460,2


In [45]:
# Persist the filtered (titleId, ordering) table to the interim folder.
# index=False is not passed, so the row index is written as an extra unnamed first column.
title_akas.to_csv(os.path.join(INTERIM_DIR, 'title.akas.csv'))