In [1]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
from math import floor, ceil
import os

from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
INTERIM_DIR = '..\\data\\interim'
PROCESSED_DIR = '..\\data\\processed'

#### Reading all data files

In [3]:
movies_df = pd.read_csv(os.path.join(INTERIM_DIR, 'movies.csv'))
title_ratings_df = pd.read_csv(os.path.join(INTERIM_DIR, 'title_ratings.csv'))
title_akas_df = pd.read_csv(os.path.join(INTERIM_DIR, 'title.akas.csv'))
names_df = pd.read_csv(os.path.join(INTERIM_DIR, 'names.basics.csv'))

### Feature 1: Popularity Index of Actors/Directors/Crew

#### Popularity Index of a Person

**Description:**  
  
  1. For each person in the names_df file, we have a list of titleIDs that they are known for.
  2. Using that list of titleIDs, we get a list of their ratings from the title_ratings file.
  3. We save this list as a list of strings in a column and we calculate multiple features from these numbers.
  4. These will be the **mean, median, and standard deviation** of these values and collectively call them the 'popularity index' of the person.
  
**Intuition:**
  
  1. **Mean** of the list of ratings gives us an idea of how popular the person is. Even if the person generally is average, but they have done one/two movies that perform exceptionally well (according to ratings), the mean will capture that.
  2. The **median** of the list of ratings captures the general consensus of the person's performance that is how they are usually expected to perform.
  3. The **standard deviation** of these numbers will characterize the distribution of the ratings. For Eg. if the mean of the ratings is high, and the standard deviation is low, it means that the person consistently has performed exceptionally well. Similarly if the mean is low but standard deviation is high, it means that the person has been 'average' in their performance sometimes.
  
**Improvement:**
 We could calculate these values over all the titles a person is associated with. However, for the interest of time, we currently only consider the titles that they are known for


In [4]:
names_df

Unnamed: 0.1,Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0050419,tt0031983,tt0053137,tt0072308"
1,1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,music_department","tt0057345,tt0054452,tt0049189,tt0056404"
3,3,nm0000004,John Belushi,1949.0,1982.0,"actor,soundtrack,writer","tt0077975,tt0078723,tt0072562,tt0080455"
4,4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0060827,tt0050976,tt0069467"
...,...,...,...,...,...,...,...
1433317,1433317,nm9993650,Marcin Balcerak,,,actor,tt8739208
1433318,1433318,nm9993680,Christopher-Lawson Palmer,,,actor,"tt10427366,tt10979852,tt8295580"
1433319,1433319,nm9993690,David Jewell,,,,tt7888884
1433320,1433320,nm9993691,Ursula Gehrmann,,,,tt7888884


In [20]:
names_df[(names_df['deathYear']>2010)|(names_df['deathYear'].isnull())]

Unnamed: 0.1,Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
1,1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,music_department","tt0057345,tt0054452,tt0049189,tt0056404"
12,12,nm0000013,Doris Day,1922.0,2019.0,"soundtrack,actress,producer","tt0045591,tt0053172,tt0049470,tt0048317"
13,13,nm0000014,Olivia de Havilland,1916.0,2020.0,"actress,soundtrack","tt0040806,tt0029843,tt0041452,tt0031381"
17,17,nm0000018,Kirk Douglas,1916.0,2020.0,"actor,producer,soundtrack","tt0043338,tt0054331,tt0049456,tt0043465"
...,...,...,...,...,...,...,...
1433317,1433317,nm9993650,Marcin Balcerak,,,actor,tt8739208
1433318,1433318,nm9993680,Christopher-Lawson Palmer,,,actor,"tt10427366,tt10979852,tt8295580"
1433319,1433319,nm9993690,David Jewell,,,,tt7888884
1433320,1433320,nm9993691,Ursula Gehrmann,,,,tt7888884


In [5]:
def calculate_pop_ind_parallel(names_df, title_ratings_df):
    '''
    Calculate the popularity index of a person using titles they are known for
    '''
    name_IDs = names_df['nconst'].tolist()[:30]
    
    num_cores = os.cpu_count()
    names_per_core = ceil(len(name_IDs)/num_cores)
    print(names_per_core)
    
    name_list_per_core = [name_IDs[i*names_per_core: (i+1)*names_per_core]
                          for i in range(num_cores)]
    
    output_list = Parallel(n_jobs=-1, verbose=30, max_nbytes=None)(delayed(get_pop_ind)(nameID_list, 
                                                                                        names_df.copy(), 
                                                                                        title_ratings_df.copy())
                                                                  for nameID_list in name_list_per_core)
    
    result = []
    [result.extend(list_) for list_ in output_list if list_!=[]]
    return result

In [6]:
def get_pop_ind(nameID_list, names_df, title_ratings_df):
    '''
    Calculate the popularity index values from the names_df and title ratings 
    '''
    
    pop_inds = []
    
    for nameID in nameID_list:
        
        pop_ind_mean, pop_ind_med, pop_ind_std = 0,0,0
        
        # get the record for nameID from names_df
        knownfor = names_df[names_df['nconst']==nameID]['knownForTitles'].iloc[0]
        
        if not pd.isna(knownfor):
            
            titles = [title for title in knownfor.split(',') if title!='']
            ratings = title_ratings_df[title_ratings_df['tconst'].isin(titles)]['averageRating']
            
            pop_ind_mean = ratings.mean()
            pop_ind_std = ratings.std()
            pop_ind_med = ratings.median()
        
        pop_inds.append((pop_ind_mean, pop_ind_med, pop_ind_std))
    
    return pop_inds

In [7]:
popularity_index = calculate_pop_ind_parallel(names_df, title_ratings_df)

3


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:   10.5s remaining:   31.7s
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:   10.6s remaining:   21.2s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:   13.8s remaining:   19.3s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:   15.8s remaining:   15.8s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:   17.5s remaining:   12.5s
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed:   18.2s remaining:    9.0s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:   20.1s remaining:    6.6s
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:   23.3s remaining:    4.6s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   25.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   25.2s finished

In [8]:
popularity_index

[(7.025, 7.0, 0.12583057392117913),
 (7.425, 7.55, 0.55),
 (6.800000000000001, 7.0, 0.9797958971132712),
 (7.300000000000001, 7.7, 1.023067283548187),
 (8.15, 8.149999999999999, 0.05773502691896237),
 (8.024999999999999, 8.0, 0.4031128874149275),
 (8.05, 8.0, 0.3316624790355399),
 (8.174999999999999, 8.25, 0.9105858919765154),
 (7.625, 7.699999999999999, 0.3862210075418824),
 (7.925000000000001, 7.95, 0.17078251276599307),
 (7.6, 7.800000000000001, 0.4760952285695233),
 (7.925000000000001, 8.0, 0.3095695936834449),
 (7.299999999999999, 7.35, 0.14142135623730984),
 (7.925000000000001, 8.0, 0.2362907813126304),
 (7.6, 7.65, 0.29439202887759497),
 (6.949999999999999, 6.85, 1.0598742063723097),
 (7.925, 8.1, 0.5737304826019504),
 (7.75, 7.75, 0.31091263510296036),
 (8.0, 8.0, 0.08164965809277232),
 (8.0, 7.8, 0.7118052168020874),
 (7.575, 7.7, 0.6020797289396147),
 (7.799999999999999, 7.9, 0.3829708431025351),
 (7.825, 7.8, 0.33040379335998327),
 (7.3, 7.3, 0.3265986323710903),
 (7.0249999

In [9]:
mean, median, std = zip(*popularity_index)

### Infering the popularity of the movie from the popularity of its crew

In [11]:
# find a way to aggregate the popularity index of the cast and crew to produce popularity value(s) for the movie

### Feature 2: Number of versions of the movie (different languages and countries)

**Description:**

 1. Using the title.akas file, we can infer how many versions of the movie were released. 
 2. We use the titleID of the movie to query the title.akas table and find out the number of rows associated with the current titleID which gives us the number of versions of the movie
 
**Intuition:**
 1. A higher number of movie versions across languages and countries suggests some level of confidence that the movie will do well since releasing more versions requires more effort and production cost, etc.

In [16]:
def get_num_versions(titleID_list, title_akas_df):
    
    num_versions = []
    
    for titleID in tqdm(titleID_list):
        
        num_versions.append(len(title_akas_df[title_akas_df['titleId']==titleID].values))
    
    return num_versions

In [None]:
def num_versions_parallel(titleID_list, title_akas_df):
    
    

In [17]:
movies_df['num_versions'] = get_num_versions(movies_df['tconst'].values, title_akas_df)
movies_df['num_versions']

  8%|█████▎                                                                | 42797/557381 [1:17:07<15:08:43,  9.44it/s]

KeyboardInterrupt: 