![Duke AIPI Logo](https://storage.googleapis.com/aipi_datasets/Duke-AIPI-Logo.png)

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import kagglehub

  from .autonotebook import tqdm as notebook_tqdm


## Dataset Generation

### Sourcing Data

In [2]:
# Preparing the directory
Path("./.data").mkdir(parents=True, exist_ok=True)

#### IMDB Datasets

In [3]:
def source_imdb(name):
    try:
        return pd.read_csv(f'./.data/{name}.csv')
    except:
        print(f'Local copy not found for {name}, so sourcing from the web...')
        df = pd.read_csv(f'https://datasets.imdbws.com/{name}.tsv.gz', compression='gzip', delimiter='\t')
        df.to_csv(f'./.data/{name}.csv')

name_basics_df = source_imdb('name.basics')
title_basics_df = source_imdb('title.basics')
title_principals_df = source_imdb('title.principals')
title_ratings_df = source_imdb('title.ratings')

  return pd.read_csv(f'./.data/{name}.csv')


In [4]:
name_basics_df.head()

Unnamed: 0.1,Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0050419,tt0072308,tt0053137,tt0027125"
1,1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [5]:
title_basics_df.head()

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
title_principals_df.head()

Unnamed: 0.1,Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,1,tt0000001,2,nm0005690,director,\N,\N
2,2,tt0000001,3,nm0005690,producer,producer,\N
3,3,tt0000001,4,nm0374658,cinematographer,director of photography,\N
4,4,tt0000002,1,nm0721526,director,\N,\N


In [7]:
title_principals_df['category'].unique()

array(['self', 'director', 'producer', 'cinematographer', 'composer',
       'editor', 'actor', 'actress', 'writer', 'production_designer',
       'archive_footage', 'casting_director', 'archive_sound'],
      dtype=object)

In [8]:
title_ratings_df.head()

Unnamed: 0.1,Unnamed: 0,tconst,averageRating,numVotes
0,0,tt0000001,5.7,2104
1,1,tt0000002,5.6,282
2,2,tt0000003,6.5,2123
3,3,tt0000004,5.4,182
4,4,tt0000005,6.2,2852


### Feature Engineering

In [9]:
title_principals_df = pd.merge(title_principals_df[['tconst', 'nconst', 'category']], title_ratings_df[['tconst', 'averageRating']], left_on='tconst', right_on='tconst', how='left')

In [10]:
name_mean_ratings_df = title_principals_df.groupby('nconst').aggregate({'averageRating': 'mean'})
name_mean_ratings_df.head()

Unnamed: 0_level_0,averageRating
nconst,Unnamed: 1_level_1
nm0000001,7.140278
nm0000002,7.217376
nm0000003,6.316964
nm0000004,7.088562
nm0000005,7.109607


In [11]:
name_basics_df = pd.merge(name_basics_df, name_mean_ratings_df, left_on='nconst', right_on='nconst', how='left')
name_basics_df.head()

Unnamed: 0.1,Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,averageRating
0,0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0050419,tt0072308,tt0053137,tt0027125",7.140278
1,1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355",7.217376
2,2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452",6.316964
3,3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723",7.088562
4,4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922",7.109607


In [12]:
principal_ratings_agg_df = title_principals_df.groupby(['nconst', 'category']).aggregate({ 'averageRating': 'mean' })
principal_ratings_agg_df = principal_ratings_agg_df.reset_index()
principal_ratings_agg_df.head()

Unnamed: 0,nconst,category,averageRating
0,nm0000001,actor,6.907692
1,nm0000001,archive_footage,7.063218
2,nm0000001,archive_sound,6.4
3,nm0000001,producer,8.3
4,nm0000001,self,7.472131


In [13]:
principal_ratings_agg_df = principal_ratings_agg_df.pivot(index='nconst', columns='category', values='averageRating').reset_index()
principal_ratings_agg_df.head()

category,nconst,actor,actress,archive_footage,archive_sound,casting_director,cinematographer,composer,director,editor,producer,production_designer,self,writer
0,nm0000001,6.907692,,7.063218,6.4,,,,,,8.3,,7.472131,
1,nm0000002,,6.749333,7.330882,7.5,,,,,,,,7.413768,
2,nm0000003,,5.875,6.48,,,,,,,,,6.75,
3,nm0000004,7.053234,,7.152703,,,,,,,,,7.194737,7.116667
4,nm0000005,7.438462,,6.9375,7.3,,6.45,,7.023944,,6.644444,,7.175,7.225926


In [14]:
principal_ratings_agg_df.columns.name = None
principal_ratings_agg_df.columns = [f'averageRating_{col}' if col != 'nconst' else 'nconst' for col in principal_ratings_agg_df.columns]
principal_ratings_agg_df.head()

Unnamed: 0,nconst,averageRating_actor,averageRating_actress,averageRating_archive_footage,averageRating_archive_sound,averageRating_casting_director,averageRating_cinematographer,averageRating_composer,averageRating_director,averageRating_editor,averageRating_producer,averageRating_production_designer,averageRating_self,averageRating_writer
0,nm0000001,6.907692,,7.063218,6.4,,,,,,8.3,,7.472131,
1,nm0000002,,6.749333,7.330882,7.5,,,,,,,,7.413768,
2,nm0000003,,5.875,6.48,,,,,,,,,6.75,
3,nm0000004,7.053234,,7.152703,,,,,,,,,7.194737,7.116667
4,nm0000005,7.438462,,6.9375,7.3,,6.45,,7.023944,,6.644444,,7.175,7.225926


In [15]:
name_basics_df = pd.merge(name_basics_df, principal_ratings_agg_df, left_on='nconst', right_on='nconst', how='left')
name_basics_df.head()

Unnamed: 0.1,Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,averageRating,averageRating_actor,averageRating_actress,...,averageRating_archive_sound,averageRating_casting_director,averageRating_cinematographer,averageRating_composer,averageRating_director,averageRating_editor,averageRating_producer,averageRating_production_designer,averageRating_self,averageRating_writer
0,0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0050419,tt0072308,tt0053137,tt0027125",7.140278,6.907692,,...,6.4,,,,,,8.3,,7.472131,
1,1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355",7.217376,,6.749333,...,7.5,,,,,,,,7.413768,
2,2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452",6.316964,,5.875,...,,,,,,,,,6.75,
3,3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723",7.088562,7.053234,,...,,,,,,,,,7.194737,7.116667
4,4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922",7.109607,7.438462,,...,7.3,,6.45,,7.023944,,6.644444,,7.175,7.225926


In [16]:
name_basics_df['averageRating_principal'] = name_basics_df['averageRating']
name_basics_df = name_basics_df.drop('averageRating', axis=1)
average_rating_column_names = [col for col in name_basics_df.columns if col.startswith('averageRating')]
average_rating_column_names

['averageRating_actor',
 'averageRating_actress',
 'averageRating_archive_footage',
 'averageRating_archive_sound',
 'averageRating_casting_director',
 'averageRating_cinematographer',
 'averageRating_composer',
 'averageRating_director',
 'averageRating_editor',
 'averageRating_producer',
 'averageRating_production_designer',
 'averageRating_self',
 'averageRating_writer',
 'averageRating_principal']

In [18]:
title_basics_df = pd.merge(title_basics_df, title_ratings_df[['tconst', 'averageRating']], left_on='tconst', right_on='tconst', how='left')
title_basics_df

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating
0,0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7
1,1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.6
2,2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance",6.5
3,3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",5.4
4,4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.2
...,...,...,...,...,...,...,...,...,...,...,...
11261930,11261930,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family",
11261931,11261931,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family",
11261932,11261932,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family",
11261933,11261933,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short,


In [None]:
temp_df = title_principals_df[['tconst', 'nconst']]
right_cols = ['nconst']
right_cols.extend(average_rating_column_names)
temp_df = pd.merge(temp_df, name_basics_df[right_cols], left_on='nconst', right_on='nconst', how='left')
agg_cols = ['tconst']
agg_cols.extend(average_rating_column_names)
temp_df = temp_df[agg_cols].groupby('tconst').mean()

TypeError: agg function failed [how->mean,dtype->object]

### Dataset Export