In [1]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
from math import floor, ceil
import os

from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
INTERIM_DIR = '..\\data\\interim'
PROCESSED_DIR = '..\\data\\processed'

#### Reading all data files

In [3]:
movies_df = pd.read_csv(os.path.join(INTERIM_DIR, 'movies.csv'))
title_ratings_df = pd.read_csv(os.path.join(INTERIM_DIR, 'title_ratings.csv'))
title_akas_df = pd.read_csv(os.path.join(INTERIM_DIR, 'title.akas.csv'))
names_df = pd.read_csv(os.path.join(INTERIM_DIR, 'names.basics.csv'))

In [4]:
title_principals_df = pd.read_csv(os.path.join(INTERIM_DIR, 'title.principals.csv'))
title_crew_df = pd.read_csv(os.path.join(INTERIM_DIR, 'title.crew.csv'))

### Feature 1: Popularity Index of Actors/Directors/Crew

#### Popularity Index of a Person

**Description:**  
  
  1. For each person in the names_df file, we have a list of titleIDs that they are known for.
  2. Using that list of titleIDs, we get a list of their ratings from the title_ratings file.
  3. We save this list as a list of strings in a column and we calculate multiple features from these numbers.
  4. These will be the **mean, median, and standard deviation** of these values and collectively call them the 'popularity index' of the person.
  
**Assumption:**
  1. While calculating the popularity index as described above, the main assumption is that the ratings associated with the title are a reliable measure of how popular the movie is.
  2. We also are going to infer the popularity of the person from the popularity of the titles they are known for which assumes that the titles that they are known for represent their popularity entirely.

**Intuition:**
  
  1. **Mean** of the list of ratings gives us an idea of how popular the person is. Even if the person generally is average, but they have done one/two movies that perform exceptionally well (according to ratings), the mean will capture that.
  2. The **median** of the list of ratings captures the general consensus of the person's performance that is how they are usually expected to perform.
  3. The **standard deviation** of these numbers will characterize the distribution of the ratings. For Eg. if the mean of the ratings is high, and the standard deviation is low, it means that the person consistently has performed exceptionally well. Similarly if the mean is low but standard deviation is high, it means that the person has been 'average' in their performance sometimes.
  
**Improvement:**
 We could calculate these values over all the titles a person is associated with. However, for the interest of time, we currently only consider the titles that they are known for


In [5]:
names_df

Unnamed: 0.1,Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0050419,tt0031983,tt0053137,tt0072308"
1,1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,music_department","tt0057345,tt0054452,tt0049189,tt0056404"
3,3,nm0000004,John Belushi,1949.0,1982.0,"actor,soundtrack,writer","tt0077975,tt0078723,tt0072562,tt0080455"
4,4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0060827,tt0050976,tt0069467"
...,...,...,...,...,...,...,...
301300,301300,nm9993616,Ryan Mac Lennan,,,actor,tt4844148
301301,301301,nm9993650,Marcin Balcerak,,,actor,tt8739208
301302,301302,nm9993690,David Jewell,,,,tt7888884
301303,301303,nm9993691,Ursula Gehrmann,,,,tt7888884


In [6]:
def calculate_pop_ind_parallel(names_df, title_ratings_df):
    '''
    Calculate the popularity index of a person using titles they are known for
    '''
    name_IDs = names_df['nconst'].tolist()
    
    num_cores = os.cpu_count()
    names_per_core = ceil(len(name_IDs)/num_cores)
    print("names_per_core: {}".format(names_per_core))
    
    name_list_per_core = [name_IDs[i*names_per_core: (i+1)*names_per_core]
                          for i in range(num_cores)]
    
    output_list = Parallel(n_jobs=-1, verbose=30, max_nbytes=None)(delayed(get_pop_ind)(nameID_list, 
                                                                                        names_df.copy(), 
                                                                                        title_ratings_df.copy())
                                                                  for nameID_list in name_list_per_core)
    
    result = []
    [result.extend(list_) for list_ in output_list if list_!=[]]
    return result

In [7]:
def get_pop_ind(nameID_list, names_df, title_ratings_df):
    '''
    Calculate the popularity index values from the names_df and title ratings 
    '''
    
    pop_inds = []
    
    for nameID in nameID_list:
        
        pop_ind_mean, pop_ind_med, pop_ind_std = 0,0,0
        
        # get the record for nameID from names_df
        knownfor = names_df[names_df['nconst']==nameID]['knownForTitles'].iloc[0]
        
        if not pd.isna(knownfor):
            
            titles = [title for title in knownfor.split(',') if title!='']
            ratings = title_ratings_df[title_ratings_df['tconst'].isin(titles)]['averageRating']
            
            pop_ind_mean = ratings.mean()
            pop_ind_std = ratings.std()
            pop_ind_med = ratings.median()
        
        pop_inds.append((pop_ind_mean, pop_ind_med, pop_ind_std))
    
    return pop_inds

In [8]:
popularity_index = calculate_pop_ind_parallel(names_df, title_ratings_df)

names_per_core: 25109


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 39.7min
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed: 44.1min remaining: 220.5min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed: 48.4min remaining: 145.2min
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed: 49.6min remaining: 99.1min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed: 50.1min remaining: 70.1min
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed: 50.7min remaining: 50.7min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed: 51.8min remaining: 37.0min
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed: 52.0min remaining: 26.0min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed: 52.3min remaining: 17.4min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed: 52.3min remaining: 10.5min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 52.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elap

In [9]:
popularity_index

[(7.025, 7.0, 0.12583057392117913),
 (7.425, 7.55, 0.55),
 (6.800000000000001, 7.0, 0.9797958971132712),
 (7.300000000000001, 7.7, 1.023067283548187),
 (8.15, 8.149999999999999, 0.05773502691896237),
 (8.024999999999999, 8.0, 0.4031128874149275),
 (8.05, 8.0, 0.3316624790355399),
 (8.174999999999999, 8.25, 0.9105858919765154),
 (7.625, 7.699999999999999, 0.3862210075418824),
 (7.6, 7.65, 0.29439202887759497),
 (7.925, 8.1, 0.5737304826019504),
 (7.75, 7.75, 0.31091263510296036),
 (8.0, 8.0, 0.08164965809277232),
 (7.799999999999999, 7.9, 0.3829708431025351),
 (7.75, 7.65, 0.4358898943540675),
 (4.1000000000000005, 3.8499999999999996, 1.003327796219494),
 (7.825, 7.85, 0.17078251276599354),
 (8.275, 8.3, 0.20615528128088306),
 (8.049999999999999, 8.05, 0.2886751345948129),
 (7.125, 7.25, 0.8057087976847878),
 (8.025, 8.0, 0.14999999999999947),
 (7.574999999999999, 7.55, 0.17078251276599307),
 (8.025, 8.2, 0.42720018726587666),
 (8.225, 8.149999999999999, 0.2629955639676582),
 (7.125, 7.

In [10]:
mean, median, std = zip(*popularity_index)

In [11]:
print(len(mean), len(median), len(std))

301305 301305 301305


#### Setting the values in the Dataframe

In [12]:
names_df['pop_ind_mean'] = list(mean)
names_df['pop_ind_median'] = list(median)
names_df['pop_ind_std'] = list(std)

In [13]:
names_df[['pop_ind_mean', 'pop_ind_median','pop_ind_std']]

Unnamed: 0,pop_ind_mean,pop_ind_median,pop_ind_std
0,7.025,7.00,0.125831
1,7.425,7.55,0.550000
2,6.800,7.00,0.979796
3,7.300,7.70,1.023067
4,8.150,8.15,0.057735
...,...,...,...
301300,6.500,6.50,
301301,5.300,5.30,
301302,6.400,6.40,
301303,6.400,6.40,


#### Let us test how these values behave for extremely popular actors

In [15]:
stars = ['Robert Downey Jr.', 'Anne Hathaway', 'Scarlett Johansson', \
         'Bradley Cooper', 'Chris Hemsworth']

names_df[names_df['primaryName'].isin(stars)][['primaryName','pop_ind_mean', 'pop_ind_median','pop_ind_std']]

Unnamed: 0,primaryName,pop_ind_mean,pop_ind_median,pop_ind_std
306,Robert Downey Jr.,7.65,7.75,0.404145
1914,Anne Hathaway,7.225,7.15,0.974252
13054,Bradley Cooper,7.45,7.45,0.238048
25085,Scarlett Johansson,7.75,7.85,0.331662
73056,Chris Hemsworth,7.25,7.45,0.888819


In [23]:
names_df.to_csv(os.path.join(PROCESSED_DIR, 'names.basics.csv'), index=False)

### Infering the popularity of the movie from the popularity of its cast and crew

In [20]:
def split_nameIDs(nconst_str):
    '''
    Split the nconst string by commas and filter out empty strings
    '''
    return [nameID for nameID in nconst_str.split(',') if nameID!='']

In [39]:
def get_movie_popind(titleID_list, 
                     title_principals=title_principals_df, 
                     title_crew=title_crew_df, 
                     names_df=names_df):
    '''
    Get a feature vector representing the popularity index of movie based
    on the popularity indices of its cast and crew 
    '''
    pop_indexes = []
    
    for titleID in titleID_list:
    
        pi_mean_mean, pi_mean_std = 0,0
        pi_med_mean, pi_med_std = 0,0
        pi_std_mean, pi_std_std = 0,0

        people_involved = set()

        # Adding top 5 actors associated with the movie to the set
        people_involved.update(title_principals[title_principals['tconst']==titleID]['nconst'].values)

        # Adding writers and directors
        crew = title_crew[title_crew['tconst']==titleID]

        if not pd.isna(crew['writers'].iloc[0]):
            people_involved.update(split_nameIDs(crew['writers'].iloc[0]))

        if not pd.isna(crew['directors'].iloc[0]):
            people_involved.update(split_nameIDs(crew['directors'].iloc[0]))

        # Choose the people involved in the names list
        people_involved_df = names_df[names_df['nconst'].isin(people_involved)]

        # Calculate the statistics of the people's popularity indexes
        pi_mean = people_involved_df['pop_ind_mean']
        pi_med = people_involved_df['pop_ind_median']
        pi_std = people_involved_df['pop_ind_std']

        pi_mean_mean, pi_mean_std = pi_mean.mean(), pi_mean.std()
        pi_med_mean, pi_med_std = pi_med.mean(), pi_med.std()
        pi_std_mean, pi_std_std = pi_std.mean(), pi_std.std()
        
        pop_indexes.append((pi_mean_mean, pi_mean_std,\
                           pi_med_mean, pi_med_std,\
                           pi_std_mean, pi_std_std))
    
    return pop_indexes

In [40]:
def calculate__movie_pop_ind_parallel(movies_df,
                                      title_principals=title_principals_df, 
                                      title_crew=title_crew_df, 
                                      names_df=names_df):
    '''
    Calculate the popularity index of a person using titles they are known for
    '''
    title_IDs = movies_df['tconst'].tolist()
    
    num_cores = os.cpu_count()
    titles_per_core = ceil(len(title_IDs)/num_cores)
    print("titles_per_core: {}".format(titles_per_core))
    
    titles_list_per_core = [title_IDs[i*titles_per_core: (i+1)*titles_per_core]
                          for i in range(num_cores)]
    
    output_list = Parallel(n_jobs=-1, verbose=30, max_nbytes=None)(delayed(get_movie_popind)(titleID_list,
                                                                                        title_principals_df.copy(),
                                                                                        title_crew_df.copy(),
                                                                                        names_df.copy()) 
                                                                  for titleID_list in titles_list_per_core)
    
    result = []
    [result.extend(list_) for list_ in output_list if list_!=[]]
    return result

In [41]:
movie_pop_ind = calculate__movie_pop_ind_parallel(movies_df)

titles_per_core: 7547


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed: 12.7min remaining: 63.7min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed: 12.8min remaining: 38.3min
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed: 12.8min remaining: 25.6min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed: 12.8min remaining: 17.9min
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed: 12.8min remaining: 12.8min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed: 12.9min remaining:  9.2min
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed: 12.9min remaining:  6.5min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed: 12.9min remaining:  4.3min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed: 12.9min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 13.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapse

In [42]:
pi_mean_mean, pi_mean_std,\
pi_med_mean, pi_med_std,\
pi_std_mean, pi_std_std = zip(*movie_pop_ind)

In [44]:
variables = [pi_mean_mean, pi_mean_std,\
             pi_med_mean, pi_med_std,\
             pi_std_mean, pi_std_std]

names = ['pi_mean_mean', 'pi_mean_std',\
         'pi_med_mean', 'pi_med_std',\
         'pi_std_mean', 'pi_std_std']

pi_dict = list(zip(names, variables))

In [45]:
for (column_name, values) in pi_dict:
    
    movies_df[column_name] = values

movies_df.head()

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,pi_mean_mean,pi_mean_std,pi_med_mean,pi_med_std,pi_std_mean,pi_std_std
0,0,tt0011216,movie,Spanish Fiesta,La fête espagnole,0,2019,67,Drama,6.3,23,7.098611,0.603472,7.058333,0.593647,0.441101,0.240865
1,1,tt0016906,movie,Frivolinas,Frivolinas,0,2014,80,"Comedy,Musical",5.6,15,5.91,0.371484,5.9,0.369121,0.388909,0.05
2,2,tt0019996,movie,Hongxia,Hongxia,0,2011,94,"Drama,Action,Romance",6.3,52,6.85,0.353553,6.9,0.424264,0.264575,
3,3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama,6.8,6446,7.31,0.747621,7.36,0.752828,0.513928,0.171555
4,4,tt0100275,movie,The Wandering Soap Opera,La Telenovela Errante,0,2017,80,"Comedy,Drama,Fantasy",6.6,266,6.882143,0.633889,7.1,0.606905,1.025872,0.640175


In [53]:
na_values = dict(zip(names, [0]*len(names)))
print(na_values)
movies_df = movies_df.fillna(value=na_values)
movies_df.isnull().sum()

{'pi_mean_mean': 0, 'pi_mean_std': 0, 'pi_med_mean': 0, 'pi_med_std': 0, 'pi_std_mean': 0, 'pi_std_std': 0}


Unnamed: 0        0
tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
runtimeMinutes    0
genres            0
averageRating     0
numVotes          0
pi_mean_mean      0
pi_mean_std       0
pi_med_mean       0
pi_med_std        0
pi_std_mean       0
pi_std_std        0
dtype: int64

In [55]:
movies_df.to_csv(os.path.join(PROCESSED_DIR, 'movies.csv'), index=False)

### Feature 2: Number of versions of the movie (different languages and countries)

**Description:**

 1. Using the title.akas file, we can infer how many versions of the movie were released. 
 2. We use the titleID of the movie to query the title.akas table and find out the number of rows associated with the current titleID which gives us the number of versions of the movie
 
**Intuition:**
 1. A higher number of movie versions across languages and countries suggests some level of confidence that the movie will do well since releasing more versions requires more effort and production cost, etc.

In [None]:
# Using group by and aggregating the number of records using the count aggregator
# Number of records in the title_akas file denotes number of versions of the movie

title_akas = title_akas.groupby('titleId').count()
title_akas

In [None]:
# Get the titleIDs of the movies we have 
movie_list = movies_df['tconst'].values

# Set the feature of 'numVersions' as the number of versions as described above
movies_df['numVersions'] = 

In [None]:
def get_num_versions(titleID_list, title_akas_df):
    
    num_versions = []
    
    for titleID in tqdm(titleID_list):
        
        num_versions.append(len(title_akas_df[title_akas_df['titleId']==titleID].values))
    
    return num_versions

In [None]:
def num_versions_parallel(titleID_list, title_akas_df):
    
    num_cores = os.cpu_count()
    movies_per_core = ceil(len(titleID_list)/num_cores)
    
    movie_list_per_core = [titleID_list[i*movies_per_core: (i+1)*movies_per_core]
                          for i in range(num_cores)]  
    
    output_list = Parallel(n_jobs=-1, verbose=30, max_nbytes=None)(delayed(get_num_versions)(titleIDs,
                                                                                             title_akas_df.copy())
                                                                  for nameID_list in name_list_per_core)
    result = []
    [result.extend(list_) for list_ in output_list if list_!=[]]
    return result

In [None]:
movies_df['num_versions'] = get_num_versions_parallel(movies_df['tconst'].values, title_akas_df)
movies_df['num_versions']