In [1]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
from math import floor, ceil
import os

from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Data directories, relative to this notebook's location.
# Built with os.path.join so the separator matches the host OS: a hard-coded
# '\\' only resolves on Windows and yields a nonexistent path elsewhere.
INTERIM_DIR = os.path.join('..', 'data', 'interim')      # intermediate CSVs read below
PROCESSED_DIR = os.path.join('..', 'data', 'processed')  # destination for processed data

#### Reading all data files

In [4]:
def _read_interim(file_name):
    '''Read one CSV file from INTERIM_DIR into a DataFrame.'''
    return pd.read_csv(os.path.join(INTERIM_DIR, file_name))


# Load every interim table used in this notebook.
movies_df = _read_interim('movies.csv')
title_ratings_df = _read_interim('title_ratings.csv')
title_akas_df = _read_interim('title.akas.csv')
names_df = _read_interim('names.basics.csv')

### Feature 1: Popularity Index of Actors/Directors/Crew

#### Popularity Index of a Person

**Description:**  
  
  1. For each person in the names_df file, we have a list of titleIDs that they are known for.
  2. Using that list of titleIDs, we get a list of their ratings from the title_ratings file.
  3. We save this list as a list of strings in a column and we calculate multiple features from these numbers.
  4. These will be the **mean, median, and standard deviation** of these values, and we collectively call them the 'popularity index' of the person.
  
**Intuition:**
  
  1. The **mean** of the list of ratings gives us an idea of how popular the person is. Even if the person is generally average, the mean will capture it if they have done one or two movies that performed exceptionally well (according to ratings).
  2. The **median** of the list of ratings captures the general consensus on the person's performance, that is, how they are usually expected to perform.
  3. The **standard deviation** of these numbers characterizes the spread of the ratings. For example, if the mean of the ratings is high and the standard deviation is low, it means that the person has consistently performed exceptionally well. Similarly, if the mean is low but the standard deviation is high, it means that the person's performance has sometimes been 'average'.
  
**Improvement:**
 We could calculate these values over all the titles a person is associated with. However, in the interest of time, we currently only consider the titles that they are known for.


In [13]:
# Preview the names table; pandas abbreviates the middle rows of a frame this large
# in the rendered output. The column used below is 'knownForTitles', a comma-separated
# string of title IDs (or NaN when missing).
names_df

Unnamed: 0.1,Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0050419,tt0031983,tt0053137,tt0072308"
1,1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,music_department","tt0057345,tt0054452,tt0049189,tt0056404"
3,3,nm0000004,John Belushi,1949.0,1982.0,"actor,soundtrack,writer","tt0077975,tt0078723,tt0072562,tt0080455"
4,4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0060827,tt0050976,tt0069467"
...,...,...,...,...,...,...,...
1433317,1433317,nm9993650,Marcin Balcerak,,,actor,tt8739208
1433318,1433318,nm9993680,Christopher-Lawson Palmer,,,actor,"tt10427366,tt10979852,tt8295580"
1433319,1433319,nm9993690,David Jewell,,,,tt7888884
1433320,1433320,nm9993691,Ursula Gehrmann,,,,tt7888884


In [None]:
def calculate_pop_ind_parallel(names_df, title_ratings_df):
    '''
    Calculate the popularity index of a person using titles they are known for

    The IDs in names_df['nconst'] are split into contiguous chunks (at most one
    per CPU core) and each chunk is processed by get_pop_ind in its own joblib task.

    Parameters
    ----------
    names_df : pandas.DataFrame
        Must contain the 'nconst' column; forwarded as-is to get_pop_ind.
    title_ratings_df : pandas.DataFrame
        Forwarded as-is to get_pop_ind.

    Returns
    -------
    list
        The per-chunk outputs of get_pop_ind concatenated in chunk order, i.e. in
        the same order as names_df['nconst'] as long as get_pop_ind returns one
        entry per ID.
    '''
    name_IDs = names_df['nconst'].values

    # os.cpu_count() returns None when the core count cannot be determined;
    # fall back to a single chunk instead of failing on the division below.
    num_cores = os.cpu_count() or 1
    names_per_core = ceil(len(name_IDs)/num_cores)
    print(names_per_core)

    # Nothing to process (also avoids a zero step in the range below).
    if names_per_core == 0:
        return []

    # Stepping by the chunk size yields only non-empty chunks, so no idle task is
    # dispatched when there are fewer names than cores.
    name_list_per_core = [name_IDs[start: start + names_per_core]
                          for start in range(0, len(name_IDs), names_per_core)]

    # The frames are passed without .copy(): the workers only read them, so an
    # extra full copy per task costs memory and time without any benefit.
    output_list = Parallel(n_jobs=-1, verbose=30, max_nbytes=None)(
        delayed(get_pop_ind)(nameID_list, names_df, title_ratings_df)
        for nameID_list in name_list_per_core)

    # Flatten the per-chunk lists while keeping chunk order.
    return [pop_ind for chunk_result in output_list for pop_ind in chunk_result]

In [None]:
def get_pop_ind(nameID_list, names_df, title_ratings_df):
    '''
    Calculate the popularity index values from the names_df and title ratings

    Parameters
    ----------
    nameID_list : sequence
        Person IDs to process; each must occur in names_df['nconst'].
    names_df : pandas.DataFrame
        Needs the columns 'nconst' and 'knownForTitles' (comma-separated title
        IDs, or NaN). If an ID occurs more than once, its first row is used.
    title_ratings_df : pandas.DataFrame
        Needs the columns 'tconst' and 'averageRating'.

    Returns
    -------
    list of tuple
        One (mean, median, std) tuple per ID, in input order. The tuple is
        (0, 0, 0) when 'knownForTitles' is missing. Values are NaN when none of
        the known titles has a rating; std is also NaN when only one rating is found.

    Raises
    ------
    IndexError
        If an ID from nameID_list is not present in names_df['nconst'].
    '''
    if len(nameID_list) == 0:
        return []

    # Build both lookups once. Filtering the full frames with a boolean mask for
    # every ID costs a complete scan per person, which is quadratic overall.
    first_rows = names_df.drop_duplicates(subset='nconst', keep='first')
    known_for_by_name = dict(zip(first_rows['nconst'], first_rows['knownForTitles']))

    title_index = pd.Index(title_ratings_df['tconst'])
    rating_values = title_ratings_df['averageRating']

    pop_inds = []

    for nameID in nameID_list:

        pop_ind_mean, pop_ind_med, pop_ind_std = 0,0,0

        # get the record for nameID from names_df
        try:
            knownfor = known_for_by_name[nameID]
        except KeyError:
            # Keep the exception type of the previous positional lookup.
            raise IndexError(f'nameID {nameID!r} not found in names_df') from None

        if not pd.isna(knownfor):

            # De-duplicate while keeping order: a title listed twice must select
            # its rating rows only once (set semantics, as with isin).
            titles = list(dict.fromkeys(title for title in knownfor.split(',') if title!=''))

            # Row positions of every rating row for these titles; -1 marks a
            # title without a rating. Sorting restores the frame's row order.
            found = title_index.get_indexer_for(titles) if titles else []
            positions = sorted(pos for pos in found if pos >= 0)
            ratings = rating_values.iloc[positions]

            pop_ind_mean = ratings.mean()
            pop_ind_std = ratings.std()
            pop_ind_med = ratings.median()

        pop_inds.append((pop_ind_mean, pop_ind_med, pop_ind_std))

    return pop_inds

In [None]:
# Compute the (mean, median, std) rating tuple for every person in names_df.
# The result is a flat list ordered like names_df['nconst'].
popularity_index = calculate_pop_ind_parallel(names_df, title_ratings_df)