# Data preprocessing

# 1st -- Movies dataset

In [None]:
# Libraries imports and function declarations
import pandas as pd
import numpy as np
import json

## Movielens dataset

In [60]:
def convert_genres_list(x):
    if not x:
        return np.NaN
    # If we rearch this point, it means we've got a string
    # with at least 1 genre.
    return x.lower().split("|")

df_main = pd.read_csv(
    "datasets/ml-25m/movies.csv", sep=",",
    converters= {
        'genres': convert_genres_list
    })

In [61]:
df_main.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]"
1,2,Jumanji (1995),"[adventure, children, fantasy]"


In [62]:
df_main.shape

(62423, 3)

In [63]:
df_main.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [64]:
def process_year(movie):
    # Format: movie_name (year)
    # We'll try to get the highest index for the parenthesis.
    # If any of them are not found or the cast has an invalid input return -1 as year.
    try:
        start = movie.rindex('(')
        end = movie.rindex(')')
        
        year = int(movie[start+1:end])
        return year
    except:
        return -1    
    
df_main['year'] = df_main['title'].apply(lambda x: process_year(x))

In [65]:
-1 in df_main['year'] # All years have been successfully converted!

False

In [66]:
# Delete year from name
def process_name(movie):
    # Format: movie_name (year)
    # We'll try to get the highest index for the parenthesis.
    try:
        start = movie.rindex('(')
        new_movie = movie[:start].strip()
        return new_movie
    except:
        return movie

df_main['title'] = df_main['title'].apply(lambda x: process_name(x))

In [67]:
df_main.head(5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[adventure, animation, children, comedy, fantasy]",1995
1,2,Jumanji,"[adventure, children, fantasy]",1995
2,3,Grumpier Old Men,"[comedy, romance]",1995
3,4,Waiting to Exhale,"[comedy, drama, romance]",1995
4,5,Father of the Bride Part II,[comedy],1995


Once we have this dataset cleaned, we need to aggregate information from the IMDb dataset.

Informatio such as titleType, director, writer and main actors will try to be added.

---
## IMDb dataset

In [68]:
df_im = pd.read_csv(
    "datasets/IMDb/title_basics.tsv", sep="\t",
    usecols= ['tconst', 'titleType', 'primaryTitle']
    )

In [69]:
df_im.shape

(8699991, 3)

In [70]:
df_im.isnull().sum()

tconst          0
titleType       0
primaryTitle    8
dtype: int64

In [71]:
df_im['titleType'].unique()

array(['short', 'movie', 'tvEpisode', 'tvSeries', 'tvShort', 'tvMovie',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [72]:
# We must drop all columns whose type is not short, movie, tvShort or tvMovie.
# (we're unsure whether Movielense's dataset includes shorts, we'll keep them just in case).
df_im = df_im.loc[df_im['titleType'].isin([
        'short', 'movie', 'tvShort', 'tvMovie'
    ])]

In [73]:
df_im.shape # ~7 million rows dropped.

(1603640, 3)

In [74]:
df_im.isnull().sum()

tconst          0
titleType       0
primaryTitle    0
dtype: int64

In [75]:
def convert_list(x):
    if not x:
        return np.NaN
    if x == '\\N':
        return np.NaN
    return x.split(',')[0]

df_crew = pd.read_csv(
    "datasets/IMDb/title_crew.tsv", sep="\t",
    converters= {
        'directors': convert_list,
        'writers': convert_list
    })

In [76]:
df_crew.head(3)

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,


In [77]:
df_im = pd.merge(df_im, df_crew, how='left', on=['tconst'])

In [78]:
df_im.head(3)

Unnamed: 0,tconst,titleType,primaryTitle,directors,writers
0,tt0000001,short,Carmencita,nm0005690,
1,tt0000002,short,Le clown et ses chiens,nm0721526,
2,tt0000003,short,Pauvre Pierrot,nm0721526,


In [79]:
df_im.shape

(1603640, 5)

In [80]:
df_name = pd.read_csv(
    "datasets/IMDb/name_basics.tsv", sep="\t",
    usecols=[
        'nconst',
        'primaryName'
    ])

In [81]:
df_name.head(3)

Unnamed: 0,nconst,primaryName
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot


In [82]:
df_im = pd.merge(df_im, df_name,
    how='left',
    left_on=['directors'], right_on=['nconst']
    )

# Drop extra column, rename director's column name
df_im = df_im.drop(columns='nconst')
df_im = df_im.rename(columns={'primaryName':'directorName'})

In [83]:
df_im = pd.merge(df_im, df_name,
    how='left',
    left_on=['writers'], right_on=['nconst']
    )

df_im = df_im.drop(columns='nconst')
df_im = df_im.rename(columns={'primaryName':'writerName'})


In [84]:
df_im = df_im.drop(columns=['directors','writers'])
df_im = df_im.rename(columns={
    'directorName':'director',
    'writerName':'writer'
    })


In [85]:
df_im.head(10)

Unnamed: 0,tconst,titleType,primaryTitle,director,writer
0,tt0000001,short,Carmencita,William K.L. Dickson,
1,tt0000002,short,Le clown et ses chiens,Émile Reynaud,
2,tt0000003,short,Pauvre Pierrot,Émile Reynaud,
3,tt0000004,short,Un bon bock,Émile Reynaud,
4,tt0000005,short,Blacksmith Scene,William K.L. Dickson,
5,tt0000006,short,Chinese Opium Den,William K.L. Dickson,
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,William Heise,
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,William K.L. Dickson,
8,tt0000009,short,Miss Jerry,Alexander Black,Alexander Black
9,tt0000010,short,Leaving the Factory,Louis Lumière,


## Merge of the two datasets

In [86]:
df_main.shape[0]

62423

In [87]:
len(set(df_main['title']).intersection(set(df_im['primaryTitle'])))

40916

IMDb's dataset contains information for 40916 of the 62423 movies from the Movielense dataset, or around 65% of the movies.
Since the dataset will be reduced anyway for memory reasons, we can drop the movies with no extra information.

In [88]:
df_main = df_main.merge(df_im.drop_duplicates(subset=['primaryTitle']),
    left_on='title', right_on='primaryTitle',
    how='left')

In [89]:
df_main.shape

(62423, 9)

In [90]:
df_main.isnull().sum()

movieId             0
title               0
genres              0
year                0
tconst          17842
titleType       17842
primaryTitle    17842
director        18455
writer          21563
dtype: int64

In [100]:
# We can delete the rows there the tconst is NaN.
# df_main = df_main[df_main['tconst'].notna()]
df_main = df_main.dropna() # We drop all rows that have any NaN

In [104]:
df_main.columns

Index(['movieId', 'title', 'genres', 'year', 'tconst', 'titleType',
       'primaryTitle', 'director', 'writer'],
      dtype='object')

In [110]:
# Now, delete the useless extra columns and write the dataset to file.
df_main = df_main.drop(['tconst', 'primaryTitle'], axis=1)

In [111]:
df_main.head(3)

Unnamed: 0,movieId,title,genres,year,titleType,director,writer
0,1,Toy Story,"[adventure, animation, children, comedy, fantasy]",1995,movie,John Lasseter,John Lasseter
1,2,Jumanji,"[adventure, children, fantasy]",1995,movie,Joe Johnston,Jonathan Hensleigh
2,3,Grumpier Old Men,"[comedy, romance]",1995,movie,Howard Deutch,Mark Steven Johnson


In [112]:
df_json = df_main.to_json(orient="records")
parsed = json.loads(df_json)

with open("datasets/movies_final.json", "w", encoding="utf-8") as f:
    json.dump(parsed, f, indent=4)