In [0]:
from functools import reduce
import numpy as np
import os.path  # So we can save time recalculating a whole bunch of data.

## Download Datasets

In [0]:
!wget https://datasets.imdbws.com/name.basics.tsv.gz
!wget https://datasets.imdbws.com/title.akas.tsv.gz
!wget https://datasets.imdbws.com/title.basics.tsv.gz
!wget https://datasets.imdbws.com/title.crew.tsv.gz
!wget https://datasets.imdbws.com/title.principals.tsv.gz
!wget https://datasets.imdbws.com/title.ratings.tsv.gz

--2020-03-16 13:48:53--  https://datasets.imdbws.com/name.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 13.226.42.60, 13.226.42.104, 13.226.42.43, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|13.226.42.60|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 216255859 (206M) [binary/octet-stream]
Saving to: ‘name.basics.tsv.gz’


2020-03-16 13:48:56 (81.2 MB/s) - ‘name.basics.tsv.gz’ saved [216255859/216255859]

--2020-03-16 13:48:57--  https://datasets.imdbws.com/title.akas.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 13.35.118.115, 13.35.118.120, 13.35.118.25, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|13.35.118.115|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 179448700 (171M) [binary/octet-stream]
Saving to: ‘title.akas.tsv.gz’


2020-03-16 13:49:00 (65.9 MB/s) - ‘title.akas.tsv.gz’ saved [179448700/179448700]

--2020-03-16 13:49:00--  https://datasets.imdbws.com/

In [0]:
!rm -rf *.tsv
!gzip -d *.gz


In [0]:
!ls -lah

total 4.7G
drwxr-xr-x 1 root root 4.0K Mar 16 13:50 .
drwxr-xr-x 1 root root 4.0K Mar 16 13:44 ..
drwxr-xr-x 1 root root 4.0K Mar  3 18:11 .config
drwx------ 3 root root 4.0K Mar 16 13:48 drive
-rw-r--r-- 1 root root 1.1G Mar 15 13:20 name.basics.tsv
drwxr-xr-x 1 root root 4.0K Mar  3 18:11 sample_data
-rw-r--r-- 1 root root 995M Mar 15 13:21 title.akas.tsv
-rw-r--r-- 1 root root 538M Mar 15 13:21 title.basics.tsv
-rw-r--r-- 1 root root 206M Mar 15 13:20 title.crew.tsv
-rw-r--r-- 1 root root 2.0G Mar 15 13:21 title.principals.tsv
-rw-r--r-- 1 root root  17M Mar 15 13:21 title.ratings.tsv


In [0]:
!sed 's/titleId/tconst/g' title.akas.tsv > title.akas-out.tsv

In [0]:
!mv title.akas-out.tsv title.akas.tsv

In [0]:
print("title.akas")
!head -n 10 title.akas.tsv
print()

print("title.basics")
!head -n 10 title.basics.tsv
print()

print("title.crew")
!head -n 10 title.crew.tsv
print()

print("title.principals")
!head -n 10 title.principals.tsv
print()

print("title.ratings")
!head -n 10 title.ratings.tsv
print()

print("name.basics")
!head -n 10 name.basics.tsv

title.akas
tconst	ordering	title	region	language	types	attributes	isOriginalTitle
tt0000001	1	Carmencita	DE	\N	\N	literal title	0
tt0000001	2	Carmencita - spanyol tánc	HU	\N	imdbDisplay	\N	0
tt0000001	3	Καρμενσίτα	GR	\N	imdbDisplay	\N	0
tt0000001	4	Карменсита	RU	\N	imdbDisplay	\N	0
tt0000001	5	Carmencita	US	\N	\N	\N	0
tt0000001	6	Carmencita	\N	\N	original	\N	1
tt0000001	7	カルメンチータ	JP	ja	imdbDisplay	\N	0
tt0000002	1	Le clown et ses chiens	\N	\N	original	\N	1
tt0000002	2	A bohóc és kutyái	HU	\N	imdbDisplay	\N	0

title.basics
tconst	titleType	primaryTitle	originalTitle	isAdult	startYear	endYear	runtimeMinutes	genres
tt0000001	short	Carmencita	Carmencita	0	1894	\N	1	Documentary,Short
tt0000002	short	Le clown et ses chiens	Le clown et ses chiens	0	1892	\N	5	Animation,Short
tt0000003	short	Pauvre Pierrot	Pauvre Pierrot	0	1892	\N	4	Animation,Comedy,Romance
tt0000004	short	Un bon bock	Un bon bock	0	1892	\N	12	Animation,Short
tt0000005	short	Blacksmith Scene	Blacksmith Scene	0	1893	\N	1	Comedy,S

## Import Data

### Features
For each title (unique `tconst`,) we are interested in the following features:
<!-- - `region` categorical, one per title. -->
<!-- - `language` categorical, one per title. -->
- `startYear` discrete integer.
- `runtimeMinutes` discrete integer.
- `genres` categorical, multiple per title.
- The average rating of the director/directors.
- The average rating of the writer/writers.
- The average rating of the top three actors.

We want to predict:
- `averageRating` continuous between $0$ and $10$.


In [0]:
import pandas as pd
from functools import reduce

# Reading TSV files: https://stackoverflow.com/a/34548894
datasheets = []

# We shall ignore the \N entries in numerical columns, and keep those columns numerical.
title_basics = pd.read_csv("title.basics.tsv", sep='\t', usecols=["tconst", "genres", "runtimeMinutes", "startYear"])
for numerical_column in ["runtimeMinutes", "startYear"]:
  title_basics[numerical_column] = pd.to_numeric(title_basics[numerical_column], errors='coerce')

title_ratings = pd.read_csv("title.ratings.tsv", sep='\t', usecols=["tconst", "averageRating", "numVotes"])

datasheets.append(title_basics)
datasheets.append(pd.read_csv("title.crew.tsv", sep='\t', usecols=["tconst", "directors", "writers"]))
datasheets.append(title_ratings)


  interactivity=interactivity, compiler=compiler, result=result)


### Merge the movie datasheets

In [0]:
title_datasheet = reduce((lambda left, right: pd.merge(left, right, left_on='tconst', right_on='tconst')), datasheets)

In [0]:
display(title_datasheet)

Unnamed: 0,tconst,startYear,runtimeMinutes,genres,directors,writers,averageRating,numVotes
0,tt0000001,1894.0,1.0,"Documentary,Short",nm0005690,\N,5.6,1590
1,tt0000002,1892.0,5.0,"Animation,Short",nm0721526,\N,6.1,192
2,tt0000003,1892.0,4.0,"Animation,Comedy,Romance",nm0721526,\N,6.5,1257
3,tt0000004,1892.0,12.0,"Animation,Short",nm0721526,\N,6.2,119
4,tt0000005,1893.0,1.0,"Comedy,Short",nm0005690,\N,6.1,2018
...,...,...,...,...,...,...,...,...
1038790,tt9916576,2019.0,85.0,Reality-TV,\N,\N,6.4,10
1038791,tt9916578,2019.0,,"Adventure,Biography,Comedy",nm0373673,"nm1485603,nm1485604,nm1866876,nm0909144",8.5,16
1038792,tt9916720,2019.0,10.0,"Comedy,Horror,Mystery",nm10538600,\N,5.5,47
1038793,tt9916766,2019.0,43.0,"Family,Reality-TV",\N,\N,6.7,11


### Merge the name datasheet by rows
Each `tconst` (title) appears multiple times in the `title.principals` datasheet. We *merge* these rows by concatnating the corresponding `nconst` strings (separate each entry with a comma.) We call the output dataframe `principals_merged`.

In [0]:
def concatnate(strings):
  output = str()
  for string in strings:
    output += string + ","
  
  return output

principals = pd.read_csv("title.principals.tsv", sep='\t', usecols=['tconst', 'nconst'])
print("Successfully loaded {} princpal entry/entries.".format(len(principals) + 1))
principals_merged = principals.groupby('tconst').agg(concatnate)
principals_merged = principals_merged.rename(columns={"nconst": "principals"})

Successfully loaded 46675738 princpal entry/entries.


In [0]:
display(principals_merged)

Unnamed: 0_level_0,principals
tconst,Unnamed: 1_level_1
tt0000001,"nm1588970,nm0005690,nm0374658,"
tt0000002,"nm0721526,nm1335271,"
tt0000003,"nm0721526,nm5442194,nm1335271,nm5442200,"
tt0000004,"nm0721526,nm1335271,"
tt0000005,"nm0443482,nm0443482,nm0653042,nm0653042,nm0005..."
...,...
tt9916848,"nm2900202,nm5519557,nm8825009,nm5262613,nm8690..."
tt9916850,"nm2900202,nm5519557,nm8825009,nm5262613,nm8690..."
tt9916852,"nm2900202,nm2900202,nm5519557,nm5519557,nm8825..."
tt9916856,"nm3394271,nm10538650,nm10538646,nm10538647,nm1..."


In [0]:
title_datasheet = pd.merge(title_datasheet, principals_merged, left_on='tconst', right_on='tconst')

## Dim-reduce Names

### Average Rating for Names


Create a dictionary `title_rating_dictionary` of $\verb!tconst! \to (\verb!averageRating! \ast \verb!numVotes!)$

In [0]:
title_rating_dictionary = dict(zip(title_ratings["tconst"], zip(title_ratings["averageRating"], title_ratings["numVotes"])))

Each `name` is known for a list of titles, where they might either be an actor or a directory. 

We calculate the average rating for each `name` and weigh the rating using the number of votes.

In [0]:
# Preprocess actors
def calculate_average(titles):
  if titles == "\\N":
    return 0
  else:
    titles = titles.split(",")
    num_titles = len(titles)
    title_array = np.zeros((num_titles, 2))
    for i in range(num_titles):
      if titles[i][0] == "\\N":
        continue
      
      if titles[i] not in title_rating_dictionary.keys():
        continue

      title_array[i,:] = title_rating_dictionary[titles[i]]
    
    if np.sum(title_array[:,1]) == 0:
      return 0
    
    else:
      return np.average(title_array[:,0], weights=title_array[:,1])

names = pd.read_csv("name.basics.tsv", sep='\t', usecols=["nconst", "knownForTitles"])
print("Successfully loaded {} name(s).".format(len(names) - 1))

names["averageRatings"] = names["knownForTitles"].apply(calculate_average)

# Discard any actor with a rating of zero (i.e., no data.)
names = names[names["averageRatings"] != 0]
print("Successfully generated an averageRatings dataset with {} valid entry/entries.".format(len(names) + 1))

Successfully loaded 18519183 name(s).
Successfully generated an averageRatings dataset with 10669322 valid entry/entries.


### Avg. Rating for Actors/Crews in a title
Create a dictionary `name_rating_dictionary` of $\verb!nconst! \to \verb!averageRatings!$

In [0]:
name_rating_dictionary = dict(zip(names["nconst"], names["averageRatings"]))

For each title, we lookup/calculate the following:
- `averageRating` of the director. 
  - If there is no director, discard this title.
  - If there are multiple directors, take the average of their `averageRating` values.
  - If none of the directors have an `averageRating` value, discard this title.

- `averageRating` of the writer. 
  - If there is no writer, or if the `averageRating` value of the writer is not available, use the `averageRating` of the director/directors instead.

- `averageRating` of the first three actors, one for each actor. 
  - If there is no actor information for this title, use the value for the director/directors instead.
  - If there are fewer than three actors, use the average for the first actors (or first actor and second actor) instead.

The `lookup_calculate_names_in_title` function takes in three comma-separated strings, one for directors, one for writers

In [0]:
def lookup_calculate_names_in_title(directors, writers, principals):
  directors = directors.split(",")
  writers = writers.split(",")
  principals = principals.split(",")

  director_ratings = list()
  for director in directors:
    if director in name_rating_dictionary.keys():
      director_ratings.append(name_rating_dictionary[director])

  # Directors
  # We shall discard this title if there is no directors.
  # In case that none of the directors has rating information, we shall also discard this title.
  if len(director_ratings) == 0:
    return (0, 0, 0)
  else:
    director_average_rating = np.average(director_ratings)

  # Writers
  # If writer information is not available, we shall take the average for the director/directors as a placeholder.
  writer_ratings = list()
  for writer in writers:
    if writer in name_rating_dictionary.keys():
      writer_ratings.append(name_rating_dictionary[writer])

  if len(writer_ratings) == 0:
    writer_average_rating = director_average_rating
  else:
    writer_average_rating = np.average(writer_ratings)

  # Principals
  # If principal information is not available, we shall take the average for the director/directors as a placeholder.
  principal_ratings = list()
  for principal in principals:
    if principal in name_rating_dictionary.keys():
      principal_ratings.append(name_rating_dictionary[principal])

  if len(principal_ratings) == 0:
    principal_average_rating = director_average_rating
  else:
    principal_average_rating = np.average(principal_ratings)
    
  return (director_average_rating, writer_average_rating, principal_average_rating)


averages = \
  title_datasheet.apply(lambda row: 
                        lookup_calculate_names_in_title(str(row["directors"]), str(row["writers"]), str(row["principals"])), 
                        axis=1, result_type='expand')

title_datasheet["directorAverage"], title_datasheet["writerAverage"], title_datasheet['principalAverage'] \
  = averages[0], averages[1], averages[2]

In [0]:
display(averages[0])

0          5.606510
1          6.419985
2          6.419985
3          6.419985
4          5.606510
             ...   
1023081    6.622179
1023082    0.000000
1023083    6.834996
1023084    3.971053
1023085    0.000000
Name: 0, Length: 1023086, dtype: float64

In [0]:
display(title_datasheet)

Unnamed: 0,tconst,startYear,runtimeMinutes,genres,directors,writers,averageRating,numVotes,principals,directorAverage,writerAverage,principalAverage
0,tt0000001,1894.0,1.0,"Documentary,Short",nm0005690,\N,5.6,1590,"nm1588970,nm0005690,nm0374658,",5.606510,5.606510,5.267911
1,tt0000002,1892.0,5.0,"Animation,Short",nm0721526,\N,6.1,192,"nm0721526,nm1335271,",6.419985,6.419985,6.421509
2,tt0000003,1892.0,4.0,"Animation,Comedy,Romance",nm0721526,\N,6.5,1257,"nm0721526,nm5442194,nm1335271,nm5442200,",6.419985,6.419985,6.460755
3,tt0000004,1892.0,12.0,"Animation,Short",nm0721526,\N,6.2,119,"nm0721526,nm1335271,",6.419985,6.419985,6.421509
4,tt0000005,1893.0,1.0,"Comedy,Short",nm0005690,\N,6.1,2018,"nm0443482,nm0443482,nm0653042,nm0653042,nm0005...",5.606510,5.606510,5.365407
...,...,...,...,...,...,...,...,...,...,...,...,...
1023081,tt9916544,2019.0,12.0,"Drama,Short",nm3219235,nm3219235,7.2,15,"nm9424215,nm9873304,nm7395788,nm7571612,nm3219...",6.622179,6.622179,6.427478
1023082,tt9916576,2019.0,85.0,Reality-TV,\N,\N,6.4,10,"nm3939894,nm3282613,nm1700240,nm2277838,nm2353...",0.000000,0.000000,0.000000
1023083,tt9916578,2019.0,,"Adventure,Biography,Comedy",nm0373673,"nm1485603,nm1485604,nm1866876,nm0909144",8.5,16,"nm1876061,nm0104787,nm0006522,nm0358200,nm0372...",6.834996,6.179873,6.932343
1023084,tt9916720,2019.0,10.0,"Comedy,Horror,Mystery",nm10538600,\N,5.5,47,"nm10678584,nm10538601,nm10538600,nm10538603,nm...",3.971053,3.971053,5.653593


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!ls -l /content/drive/'My Drive'/'Colab Notebooks'/Dataset

In [0]:
title_datasheet.to_pickle("/content/drive/My Drive/Colab Notebooks/Dataset/2020-03-16-titles-with-ratings.pkl")