In [1]:
import requests

import pandas as pd
import pickle as pkl
import numpy as np

from collections import Counter
import re
import os
import json

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


For this challenge we will need the following files:

1. https://datasets.imdbws.com/title.basics.tsv.gz 
<!-- 2. https://datasets.imdbws.com/title.ratings.tsv.gz -->
3. IMDB_movie_details.json from https://www.kaggle.com/rmisra/imdb-spoiler-dataset/data (requires authorization; already downloaded in 'data' folder)
4. http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz

In [2]:
data_dir = 'data' # folder containing original datasets
tmp_dir = 'tmp' # temporal folder containing intermediate results of Jupyter notebooks

if not os.path.exists(tmp_dir):
    os.mkdir(tmp_dir)

In [3]:
def download_file(url, path):
        
    file_name = f"{path}/{url.split('/')[-1]}"
    
    if os.path.exists(file_name):
        return file_name 
    
    resp = requests.get(url)

    with open(file_name, 'wb') as f: 
        f.write(resp.content)
    
    if file_name[-7:] == '.tar.gz':
        os.system(f'tar -xf {file_name} -C {path}')    
        
    return file_name     

In [4]:
%%time

urls = 'https://datasets.imdbws.com/title.basics.tsv.gz', 'http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz'

[download_file(url, data_dir) for url in urls]

CPU times: user 24 µs, sys: 14 µs, total: 38 µs
Wall time: 39.6 µs


['data/title.basics.tsv.gz', 'data/MovieSummaries.tar.gz']

### Downloading plot summaries

In [5]:
plot_summaries = pd.read_csv(f'{data_dir}/MovieSummaries/plot_summaries.txt', 
                             sep='\t', 
                             names=['wiki_id', 'synopsis'], 
                             index_col='wiki_id')
len(plot_summaries)

42303

### Downloading movies metadata

In [6]:
col_names = ['wiki_id', 'freebase_id', 'name', 'release date', 
             'revenue', 'runtime', 'languages', 'countries', 'genres']

movie_metadata = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', 
                             sep='\t', 
                             names=col_names,
                             index_col='wiki_id')

In [7]:
no_metadata_ids = list(set(plot_summaries.index).difference(set(movie_metadata.index)))
len(no_metadata_ids)

99

### Merging plot summaries and metadata

In [8]:
plot_data = pd.merge(plot_summaries, movie_metadata, 
                     left_index=True, right_index=True)

plot_data['genres'] = plot_data['genres'].apply(lambda x : list(eval(x).values()))

no_genre_ids = plot_data['genres'][plot_data['genres'].apply(len) == 0].index.to_list()

plot_data = plot_data.drop(no_genre_ids)

len(no_genre_ids)

411

In [9]:
_ = ~plot_data.duplicated(subset=['synopsis'])

plot_data = plot_data[_]
plot_data

Unnamed: 0_level_0,synopsis,freebase_id,name,release date,revenue,runtime,languages,countries,genres
wiki_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,Taxi Blues,1990-09-07,,110.0,"{""/m/06b_j"": ""Russian Language""}","{""/m/0f8l9c"": ""France"", ""/m/05vz3zq"": ""Soviet ...","[Drama, World cinema]"
31186339,The nation of Panem consists of a wealthy Capi...,/m/0gkz15s,The Hunger Games,2012-03-12,686533290.0,142.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Action/Adventure, Science Fiction, Action, Dr..."
20663735,Poovalli Induchoodan is sentenced for six yea...,/m/051zjwb,Narasimham,2000,,175.0,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}","[Musical, Action, Drama, Bollywood]"
2231378,"The Lemon Drop Kid , a New York City swindler,...",/m/06xtz3,The Lemon Drop Kid,1951-03-08,2300000.0,91.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Screwball comedy, Comedy]"
595909,Seventh-day Adventist Church pastor Michael Ch...,/m/02tqm5,A Cry in the Dark,1988-11-03,6908797.0,121.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","[Crime Fiction, Drama, Docudrama, World cinema..."
...,...,...,...,...,...,...,...,...,...
34808485,"The story is about Reema , a young Muslim scho...",/m/0j3dcl6,Oomakkuyil Padumbol,2012-02-17,,,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}",[Children's]
1096473,"In 1928 Hollywood, director Leo Andreyev look...",/m/045pct,The Last Command,1928,,88.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","[Silent film, Indie, Black-and-white, Period p..."
35102018,American Luthier focuses on Randy Parsons’ tra...,/m/0j6777g,Randy Parsons: American Luthier,2011-10-04,,8.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Short Film, Music, Biographical film, Documen..."
8628195,"Abdur Rehman Khan , a middle-aged dry fruit se...",/m/04f7jfs,Kabuliwala,1961-12-14,,134.0,"{""/m/03k50"": ""Hindi Language""}","{""/m/03rk0"": ""India""}",[Drama]


In [10]:
genres_data = plot_data['genres'].to_list()
genres_data[:10]

[['Drama', 'World cinema'],
 ['Action/Adventure', 'Science Fiction', 'Action', 'Drama'],
 ['Musical', 'Action', 'Drama', 'Bollywood'],
 ['Screwball comedy', 'Comedy'],
 ['Crime Fiction', 'Drama', 'Docudrama', 'World cinema', 'Courtroom Drama'],
 ['Thriller', 'Action/Adventure', 'Action', 'Drama'],
 ['Thriller', 'Drama', 'Horror'],
 ['Drama', 'Teen'],
 ['Romantic comedy',
  'Media Satire',
  'Americana',
  'Comedy-drama',
  'Black-and-white',
  'Drama',
  'Comedy',
  'Romance Film'],
 ['Short Film', 'Family Film', 'Animation']]

### The dataset contains too many genres. Let us look at the genres used in imdb dataset

In [11]:
%time title_basics = pd.read_csv('data/title.basics.tsv.gz', sep='\t', low_memory=False)
genres_imdb = title_basics['genres']
genres_imdb

CPU times: user 15.3 s, sys: 1.61 s, total: 16.9 s
Wall time: 16.9 s


0                   Documentary,Short
1                     Animation,Short
2            Animation,Comedy,Romance
3                     Animation,Short
4                        Comedy,Short
                      ...            
9553993           Action,Drama,Family
9553994           Action,Drama,Family
9553995           Action,Drama,Family
9553996                         Short
9553997    Adventure,Animation,Comedy
Name: genres, Length: 9553998, dtype: object

In [12]:
genres_imdb = genres_imdb.dropna().to_list()
genres_imdb = [list(set(_.split(','))) for _ in genres_imdb]

In [13]:
tally = lambda x : sorted(Counter(x).items(), key=lambda _: _[1])[::-1]

In [14]:
genres_imdb_unique_tally = tally(np.concatenate(genres_imdb))

### Also let us look at the genres used in imdb spoiler dataset

In [15]:
def read_json_file(file):
    '''
    from https://stackoverflow.com/a/68942058/5242222
    '''
    with open(file, "r") as r:
        response = r.read()
        response = response.replace('\n', '')
        response = response.replace('}{', '},{')
        response = "[" + response + "]"
        return json.loads(response)

In [16]:
fn = f'{data_dir}/IMDB_movie_details.json'
IMDB_movie_details = read_json_file(fn) #json.load(open(imdb_plots_file, 'rt'))

In [17]:
_ = [_['genre'] for _ in IMDB_movie_details]

IMDB_movie_details_unique_tally = tally(np.concatenate(_))

### Let remove the genres not contained in spoiler dataset

In [18]:
set(np.transpose(genres_imdb_unique_tally)[0]).symmetric_difference(set(np.transpose(IMDB_movie_details_unique_tally)[0]))

{'Adult',
 'Documentary',
 'Experimental',
 'Game-Show',
 'News',
 'Reality-TV',
 'Short',
 'Talk-Show',
 '\\N'}

In [19]:
genres = np.transpose(IMDB_movie_details_unique_tally)[0].tolist()
genres.sort()

keys = [_.lower() for _ in genres]
genres = dict(zip(genres, keys))
genres['Film-Noir'] = 'noir'
genres['Sci-Fi'] = 'scifi'
genres = {v:k for k, v in genres.items()}

genres, len(genres)

({'action': 'Action',
  'adventure': 'Adventure',
  'animation': 'Animation',
  'biography': 'Biography',
  'comedy': 'Comedy',
  'crime': 'Crime',
  'drama': 'Drama',
  'family': 'Family',
  'fantasy': 'Fantasy',
  'noir': 'Film-Noir',
  'history': 'History',
  'horror': 'Horror',
  'music': 'Music',
  'musical': 'Musical',
  'mystery': 'Mystery',
  'romance': 'Romance',
  'scifi': 'Sci-Fi',
  'sport': 'Sport',
  'thriller': 'Thriller',
  'war': 'War',
  'western': 'Western'},
 21)

### Now we need to replace the genres in our dataset by 21 genres that we found. We can semiautomatize the process, however, a manual inspection and correction of the labels is done (example of such correction is 'Science fiction' that needed to be replaced by 'sci-fi')

In [20]:
genres_data_unique = set(np.concatenate(genres_data))
genres_data_unique = sorted([_.lower() for _ in genres_data_unique])

data2imdb = {_:','.join(sorted(set(re.findall(r'[\w]+', _)).intersection(set(genres.keys())))) for _ in genres_data_unique}

data2imdb

{'absurdism': '',
 'acid western': 'western',
 'action': 'action',
 'action comedy': 'action,comedy',
 'action thrillers': 'action',
 'action/adventure': 'action,adventure',
 'addiction drama': 'drama',
 'adult': '',
 'adventure': 'adventure',
 'adventure comedy': 'adventure,comedy',
 'airplanes and airports': '',
 'albino bias': '',
 'alien film': '',
 'alien invasion': '',
 'americana': '',
 'animal picture': '',
 'animals': '',
 'animated cartoon': '',
 'animated musical': 'musical',
 'animation': 'animation',
 'anime': '',
 'anthology': '',
 'anthropology': '',
 'anti-war': 'war',
 'anti-war film': 'war',
 'apocalyptic and post-apocalyptic fiction': '',
 'archaeology': '',
 'archives and records': '',
 'art film': '',
 'auto racing': '',
 'avant-garde': '',
 'b-movie': '',
 'b-western': 'western',
 'backstage musical': 'musical',
 'baseball': '',
 'beach film': '',
 'beach party film': '',
 'bengali cinema': '',
 'biker film': '',
 'biographical film': '',
 'biography': 'biography'

In [21]:
data2imdb_mod = {'absurdism': '',
                 'acid western': 'western',
                 'action': 'action',
                 'action comedy': 'action,comedy',
                 'action thrillers': 'action,thriller', #
                 'action/adventure': 'action,adventure',
                 'addiction drama': 'drama',
                 'adult': '',
                 'adventure': 'adventure',
                 'adventure comedy': 'adventure,comedy',
                 'airplanes and airports': '',
                 'albino bias': '',
                 'alien film': 'scifi', #
                 'alien invasion': 'scifi', #
                 'americana': '',
                 'animal picture': '',
                 'animals': '',
                 'animated cartoon': 'animation', #
                 'animated musical': 'animation,musical', #
                 'animation': 'animation',
                 'anime': 'animation', #
                 'anthology': '',
                 'anthropology': '',
                 'anti-war': 'war',
                 'anti-war film': 'war',
                 'apocalyptic and post-apocalyptic fiction': '',
                 'archaeology': '',
                 'archives and records': '',
                 'art film': '',
                 'auto racing': '',
                 'avant-garde': '',
                 'b-movie': '',
                 'b-western': 'western',
                 'backstage musical': 'musical',
                 'baseball': '',
                 'beach film': '',
                 'beach party film': '',
                 'bengali cinema': '',
                 'biker film': '',
                 'biographical film': 'biography', #
                 'biography': 'biography',
                 'biopic [feature]': 'biography', #
                 'black comedy': 'comedy',
                 'black-and-white': '',
                 'blaxploitation': '',
                 'bloopers & candid camera': '',
                 'bollywood': '',
                 'boxing': '',
                 'breakdance': '',
                 'british empire film': '',
                 'british new wave': '',
                 'bruceploitation': '',
                 'buddy cop': '',
                 'buddy film': '',
                 'buddy picture': '',
                 'business': '',
                 'c-movie': '',
                 'camp': '',
                 'caper story': '',
                 'cavalry film': '',
                 'chase movie': '',
                 'chick flick': '',
                 'childhood drama': 'drama',
                 "children's": '',
                 "children's entertainment": '',
                 "children's fantasy": 'fantasy',
                 "children's issues": '',
                 "children's/family": 'family',
                 'chinese movies': '',
                 'christian film': '',
                 'christmas movie': '',
                 'clay animation': 'animation',
                 'cold war': 'war',
                 'combat films': '',
                 'comdedy': 'comedy', #
                 'comedy': 'comedy',
                 'comedy film': 'comedy',
                 'comedy horror': 'comedy,horror',
                 'comedy of errors': 'comedy',
                 'comedy of manners': 'comedy',
                 'comedy thriller': 'comedy,thriller',
                 'comedy western': 'comedy,western',
                 'comedy-drama': 'comedy,drama',
                 'coming of age': '',
                 'coming-of-age film': '',
                 'computer animation': 'animation',
                 'computers': '',
                 'concert film': '',
                 'conspiracy fiction': '',
                 'costume adventure': 'adventure',
                 'costume drama': 'drama',
                 'costume horror': 'horror',
                 'courtroom comedy': 'comedy',
                 'courtroom drama': 'drama',
                 'creature film': '',
                 'crime': 'crime',
                 'crime comedy': 'comedy,crime',
                 'crime drama': 'crime,drama',
                 'crime fiction': 'crime',
                 'crime thriller': 'crime,thriller',
                 'cult': '',
                 'culture & society': '',
                 'cyberpunk': '',
                 'czechoslovak new wave': '',
                 'dance': '',
                 'demonic child': '',
                 'detective': '',
                 'detective fiction': '',
                 'disaster': '',
                 'docudrama': '',
                 'documentary': '',
                 'dogme 95': '',
                 'domestic comedy': 'comedy',
                 'doomsday film': '',
                 'drama': 'drama',
                 'dystopia': '',
                 'ealing comedies': 'comedy', #
                 'early black cinema': '',
                 'education': '',
                 'educational': '',
                 'ensemble film': '',
                 'environmental science': '',
                 'epic': '',
                 'epic western': 'western',
                 'erotic drama': 'drama',
                 'erotic thriller': 'thriller',
                 'erotica': '',
                 'escape film': '',
                 'essay film': '',
                 'existentialism': '',
                 'experimental film': '',
                 'exploitation': '',
                 'expressionism': '',
                 'extreme sports': 'sport', #
                 'fairy tale': '',
                 'family & personal relationships': 'family',
                 'family drama': 'drama,family',
                 'family film': 'family',
                 'family-oriented adventure': 'adventure,family',
                 'fan film': '',
                 'fantasy': 'fantasy',
                 'fantasy adventure': 'adventure,fantasy',
                 'fantasy comedy': 'comedy,fantasy',
                 'fantasy drama': 'drama,fantasy',
                 'feature film': '',
                 'female buddy film': '',
                 'feminist film': '',
                 'fictional film': '',
                 'filipino': '',
                 'filipino movies': '',
                 'film': '',
                 'film & television history': 'history',
                 'film adaptation': '',
                 'film noir': 'noir',
                 'film à clef': '',
                 'film-opera': '',
                 'filmed play': '',
                 'finance & investing': '',
                 'foreign legion': '',
                 'future noir': 'noir',
                 'gangster film': '',
                 'gay': '',
                 'gay interest': '',
                 'gay pornography': '',
                 'gay themed': '',
                 'gender issues': '',
                 'giallo': '',
                 'glamorized spy film': '',
                 'goat gland': '',
                 'gothic film': '',
                 'graphic & applied arts': '',
                 'gross out': '',
                 'gross-out film': '',
                 'gulf war': 'war',
                 'hagiography': '',
                 'hardcore pornography': '',
                 'haunted house film': '',
                 'health & fitness': '',
                 'heaven-can-wait fantasies': '',
                 'heavenly comedy': 'comedy',
                 'heist': '',
                 'hip hop movies': '', 
                 'historical documentaries': 'history', #
                 'historical drama': 'drama,history', #
                 'historical epic': 'history', #
                 'historical fiction': 'history', #
                 'history': 'history',
                 'holiday film': '',
                 'homoeroticism': '',
                 'horror': 'horror',
                 'horror comedy': 'comedy,horror',
                 'horse racing': '',
                 'humour': '',
                 'hybrid western': 'western',
                 'illnesses & disabilities': '',
                 'indian western': 'western',
                 'indie': '',
                 'inspirational drama': 'drama',
                 'instrumental music': 'music',
                 'interpersonal relationships': '',
                 'inventions & innovations': '',
                 'japanese movies': '',
                 'journalism': '',
                 'jukebox musical': 'musical',
                 'jungle film': '',
                 'juvenile delinquency film': '',
                 'kafkaesque': '',
                 'kitchen sink realism': '',
                 'language & literature': '',
                 'latino': '',
                 'law & crime': 'crime',
                 'legal drama': 'drama',
                 'lgbt': '',
                 'libraries and librarians': '',
                 'linguistics': '',
                 'live action': 'action',
                 'malayalam cinema': '',
                 'marriage drama': 'drama',
                 'martial arts film': '',
                 'master criminal films': '',
                 'media satire': '',
                 'media studies': '',
                 'medical fiction': '',
                 'melodrama': '', 
                 'mockumentary': '',
                 'mondo film': '',
                 'monster': '',
                 'monster movie': '',
                 'movie serial': '',
                 'movies about gladiators': '',
                 'mumblecore': '',
                 'music': 'music',
                 'musical': 'musical',
                 'musical comedy': 'comedy,musical',
                 'musical drama': 'drama,musical',
                 'mystery': 'mystery',
                 'mythological fantasy': 'fantasy',
                 'natural disaster': '',
                 'natural horror films': 'horror',
                 'nature': '',
                 'neo-noir': 'noir',
                 'neorealism': '',
                 'new hollywood': '',
                 'new queer cinema': '',
                 'news': '',
                 'ninja movie': '',
                 'northern': '',
                 'nuclear warfare': '',
                 'operetta': '',
                 'outlaw': '',
                 'outlaw biker film': '',
                 'parkour in popular culture': '',
                 'parody': '',
                 'patriotic film': '',
                 'period horror': 'horror',
                 'period piece': '',
                 'pinku eiga': '',
                 'plague': '',
                 'point of view shot': '',
                 'political cinema': '',
                 'political documetary': '',
                 'political drama': 'drama',
                 'political satire': '',
                 'political thriller': 'thriller',
                 'pornographic movie': '',
                 'pornography': '',
                 'pre-code': '',
                 'prison': '',
                 'prison escape': '',
                 'prison film': '',
                 'private military company': '',
                 'propaganda film': '',
                 'psycho-biddy': '',
                 'psychological horror': 'horror',
                 'psychological thriller': 'thriller',
                 'punk rock': '',
                 'race movie': '',
                 'reboot': '',
                 'religious film': '',
                 'remake': '',
                 'revenge': '',
                 'revisionist fairy tale': '',
                 'revisionist western': 'western',
                 'road movie': '',
                 'road-horror': 'horror',
                 'roadshow theatrical release': '',
                 'roadshow/carny': '',
                 'rockumentary': '',
                 'romance film': 'romance',
                 'romantic comedy': 'comedy,romance', #
                 'romantic drama': 'drama,romance', #
                 'romantic fantasy': 'fantasy,romance', #
                 'romantic thriller': 'thriller,romance', #
                 'samurai cinema': '',
                 'satire': '',
                 'school story': '',
                 'sci fi pictures original films': 'scifi', #
                 'sci-fi adventure': 'adventure,scifi', #
                 'sci-fi horror': 'horror,scifi', #
                 'sci-fi thriller': 'scifi,thriller', #
                 'science fiction': 'scifi', #
                 'science fiction western': 'scifi,western', #
                 'screwball comedy': 'comedy',
                 'sex comedy': 'comedy',
                 'sexploitation': '',
                 'short film': '',
                 'silent film': '',
                 'silhouette animation': 'animation',
                 'singing cowboy': '',
                 'slapstick': 'comedy', #
                 'slasher': '',
                 'slice of life story': '',
                 'social issues': '',
                 'social problem film': '',
                 'softcore porn': '',
                 'space opera': '',
                 'space western': 'western',
                 'spaghetti western': 'western',
                 'splatter film': '',
                 'sponsored film': '',
                 'sports': 'sport', #
                 'spy': '',
                 'stand-up comedy': 'comedy',
                 'star vehicle': '',
                 'statutory rape': '',
                 'steampunk': '',
                 'stoner film': '',
                 'stop motion': '',
                 'superhero': '',
                 'superhero movie': '',
                 'supermarionation': '',
                 'supernatural': '',
                 'surrealism': '',
                 'suspense': '',
                 'swashbuckler films': '',
                 'sword and sandal': '',
                 'sword and sorcery': '',
                 'sword and sorcery films': '',
                 'tamil cinema': '',
                 'teen': '',
                 'television movie': '',
                 'the netherlands in world war ii': 'war',
                 'therimin music': 'music',
                 'thriller': 'thriller',
                 'time travel': 'scifi', #
                 'tokusatsu': '',
                 'tollywood': '',
                 'tragedy': 'drama', #
                 'tragicomedy': 'comedy,drama', #
                 'travel': '',
                 'vampire movies': '',
                 'war effort': 'war',
                 'war film': 'war',
                 'werewolf fiction': '',
                 'western': 'western',
                 'whodunit': '',
                 'women in prison films': '',
                 'workplace comedy': 'comedy',
                 'world cinema': '',
                 'world history': 'history',
                 'wuxia': '',
                 'z movie': '',
                 'zombie film': ''
                }

In [22]:
genres_data2 = []

for d in genres_data:
    
    d2 = [data2imdb_mod[_.lower()] for _ in d]
    
    d2 = sorted(set(','.join(d2).split(',')))
    
    d2 = [_ for _ in d2 if _!='']
    
    genres_data2.append(d2)


genres_data2

[['drama'],
 ['action', 'adventure', 'drama', 'scifi'],
 ['action', 'drama', 'musical'],
 ['comedy'],
 ['crime', 'drama'],
 ['action', 'adventure', 'drama', 'thriller'],
 ['drama', 'horror', 'thriller'],
 ['drama'],
 ['comedy', 'drama', 'romance'],
 ['animation', 'family'],
 ['comedy'],
 ['comedy', 'crime', 'drama'],
 ['comedy'],
 ['action', 'adventure', 'drama', 'war'],
 ['comedy'],
 ['horror'],
 ['comedy', 'crime', 'horror', 'mystery', 'thriller'],
 ['drama'],
 ['action', 'crime', 'romance', 'thriller'],
 ['drama'],
 ['drama', 'war'],
 ['animation', 'family'],
 ['drama'],
 ['action', 'adventure', 'drama'],
 ['drama'],
 ['drama'],
 ['comedy'],
 ['comedy', 'romance'],
 ['comedy', 'musical', 'romance'],
 ['action', 'adventure', 'drama', 'history', 'romance'],
 ['action', 'adventure', 'romance', 'thriller'],
 ['biography', 'drama'],
 ['drama', 'family'],
 ['horror', 'mystery'],
 ['action', 'adventure', 'drama', 'western'],
 ['comedy', 'drama'],
 ['animation', 'comedy', 'family'],
 ['dram

In [23]:
plot_data['year'] = [str(_).split('-')[0] for _ in plot_data['release date']]

plot_data

Unnamed: 0_level_0,synopsis,freebase_id,name,release date,revenue,runtime,languages,countries,genres,year
wiki_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,Taxi Blues,1990-09-07,,110.0,"{""/m/06b_j"": ""Russian Language""}","{""/m/0f8l9c"": ""France"", ""/m/05vz3zq"": ""Soviet ...","[Drama, World cinema]",1990
31186339,The nation of Panem consists of a wealthy Capi...,/m/0gkz15s,The Hunger Games,2012-03-12,686533290.0,142.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Action/Adventure, Science Fiction, Action, Dr...",2012
20663735,Poovalli Induchoodan is sentenced for six yea...,/m/051zjwb,Narasimham,2000,,175.0,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}","[Musical, Action, Drama, Bollywood]",2000
2231378,"The Lemon Drop Kid , a New York City swindler,...",/m/06xtz3,The Lemon Drop Kid,1951-03-08,2300000.0,91.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Screwball comedy, Comedy]",1951
595909,Seventh-day Adventist Church pastor Michael Ch...,/m/02tqm5,A Cry in the Dark,1988-11-03,6908797.0,121.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","[Crime Fiction, Drama, Docudrama, World cinema...",1988
...,...,...,...,...,...,...,...,...,...,...
34808485,"The story is about Reema , a young Muslim scho...",/m/0j3dcl6,Oomakkuyil Padumbol,2012-02-17,,,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}",[Children's],2012
1096473,"In 1928 Hollywood, director Leo Andreyev look...",/m/045pct,The Last Command,1928,,88.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","[Silent film, Indie, Black-and-white, Period p...",1928
35102018,American Luthier focuses on Randy Parsons’ tra...,/m/0j6777g,Randy Parsons: American Luthier,2011-10-04,,8.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Short Film, Music, Biographical film, Documen...",2011
8628195,"Abdur Rehman Khan , a middle-aged dry fruit se...",/m/04f7jfs,Kabuliwala,1961-12-14,,134.0,"{""/m/03k50"": ""Hindi Language""}","{""/m/03rk0"": ""India""}",[Drama],1961


In [24]:
lens = np.array([len(_) for _ in genres_data2])

plot_data['genres'] = genres_data2

idx = np.where(lens!=0)[0].tolist()

plot_data = plot_data.iloc[idx]

In [25]:
plot_data['genres'] = [[genres[_] for _ in _] for _ in plot_data['genres']]

plot_data = plot_data[['name', 'year', 'revenue', 'genres', 'synopsis']]

plot_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plot_data['genres'] = [[genres[_] for _ in _] for _ in plot_data['genres']]


Unnamed: 0_level_0,name,year,revenue,genres,synopsis
wiki_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
23890098,Taxi Blues,1990,,[Drama],"Shlykov, a hard-working taxi driver and Lyosha..."
31186339,The Hunger Games,2012,686533290.0,"[Action, Adventure, Drama, Sci-Fi]",The nation of Panem consists of a wealthy Capi...
20663735,Narasimham,2000,,"[Action, Drama, Musical]",Poovalli Induchoodan is sentenced for six yea...
2231378,The Lemon Drop Kid,1951,2300000.0,[Comedy],"The Lemon Drop Kid , a New York City swindler,..."
595909,A Cry in the Dark,1988,6908797.0,"[Crime, Drama]",Seventh-day Adventist Church pastor Michael Ch...
...,...,...,...,...,...
2867597,Mr. Bill's Real Life Adventures,1986,,[Comedy],"An attempt to bring the famed ""Mr. Bill"" clay ..."
1096473,The Last Command,1928,,"[Drama, War]","In 1928 Hollywood, director Leo Andreyev look..."
35102018,Randy Parsons: American Luthier,2011,,"[Biography, Music]",American Luthier focuses on Randy Parsons’ tra...
8628195,Kabuliwala,1961,,[Drama],"Abdur Rehman Khan , a middle-aged dry fruit se..."


### Cleaning synopses.

In [26]:
cleaners = {
    'text_num': lambda _ : re.sub(r'\S*[^a-zA-Z\s\.\,0-9-]+\S*', '', _),
    'caps_words': lambda _ : re.sub(r'\S*[A-Z]+\S*', '', _),
    'punct': lambda _ : re.sub(r'[^\w\s]+', '', _),
    'numbers': lambda _ : re.sub(r'[0-9]+', '', _),
    'spaces': lambda _ : re.sub(r'\s+', ' ', _),
    'space_comma': lambda _ : re.sub(r'\s,', ',', _),
    'space_period': lambda _ : re.sub(r'\s\.', '.', _)
}

In [27]:
def clean(text):
    
    for k in ['text_num', 'spaces', 'space_comma', 'space_period']:
        
        text = cleaners[k](text)
        
    return text    

In [28]:
plot_data['clean'] = plot_data['synopsis'].parallel_apply(clean)
plot_data.head()

Unnamed: 0_level_0,name,year,revenue,genres,synopsis,clean
wiki_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23890098,Taxi Blues,1990,,[Drama],"Shlykov, a hard-working taxi driver and Lyosha...","Shlykov, a hard-working taxi driver and Lyosha..."
31186339,The Hunger Games,2012,686533290.0,"[Action, Adventure, Drama, Sci-Fi]",The nation of Panem consists of a wealthy Capi...,The nation of Panem consists of a wealthy Capi...
20663735,Narasimham,2000,,"[Action, Drama, Musical]",Poovalli Induchoodan is sentenced for six yea...,Poovalli Induchoodan is sentenced for six year...
2231378,The Lemon Drop Kid,1951,2300000.0,[Comedy],"The Lemon Drop Kid , a New York City swindler,...","The Lemon Drop Kid, a New York City swindler, ..."
595909,A Cry in the Dark,1988,6908797.0,"[Crime, Drama]",Seventh-day Adventist Church pastor Michael Ch...,Seventh-day Adventist Church pastor Michael Ch...


### Saving plot_data and cleaned IMDB_movie_details.json from spoilers dataset

In [29]:
pkl.dump(plot_data, open(f'{tmp_dir}/data.pkl', 'wb'))

In [30]:
def read_json_file(file):
    '''
    from https://stackoverflow.com/a/68942058/5242222
    '''
    with open(file, "r") as r:
        response = r.read()
        response = response.replace('\n', '')
        response = response.replace('}{', '},{')
        response = "[" + response + "]"
        return json.loads(response)

In [31]:
fn = f'{data_dir}/IMDB_movie_details.json'
df_imdb = pd.DataFrame.from_records(read_json_file(fn))
df_imdb = df_imdb[df_imdb['plot_synopsis']!=''].reset_index(drop=True)

df_imdb['clean'] = df_imdb['plot_synopsis'].parallel_apply(clean)
# df_imdb.insert(2, 'clean', df_imdb.pop('clean'))

df_imdb

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis,clean
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"[Action, Thriller]",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in...",Jack Ryan is on a in London with his family. H...
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,[Comedy],6.6,2013-11-01,Four boys around the age of 10 are friends in ...,Four boys around the age of 10 are friends in ...
2,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"[Adventure, Drama, Western]",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...,Fred Dobbs and Bob Curtin are down on their lu...
3,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"[Comedy, Drama, Romance]",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...,Jim McAllister is a much-admired high school h...
4,tt0286716,"Bruce Banner, a brilliant scientist with a clo...",2h 18min,"[Action, Sci-Fi]",5.7,2003-06-20,Bruce Banner (Eric Bana) is a research scienti...,Bruce Banner is a research scientist at a Berk...
...,...,...,...,...,...,...,...,...
1334,tt0120655,An abortion clinic worker with a special herit...,2h 10min,"[Adventure, Comedy, Drama]",7.3,1999-11-12,The film opens with a homeless man (Bud Cort) ...,The film opens with a homeless man on a desert...
1335,tt0276751,Twelve year old Marcus Brewer lives with his c...,1h 41min,"[Comedy, Drama, Romance]",7.1,2002-05-17,Will Freeman (Hugh Grant) is a 38-year-old bac...,Will Freeman is a 38-year-old bachelor who pri...
1336,tt0289879,Evan Treborn grows up in a small town with his...,1h 53min,"[Sci-Fi, Thriller]",7.7,2004-01-23,"In the year 1998, Evan Treborn (Ashton Kutcher...","In the year 1998, Evan Treborn who suffered se..."
1337,tt1723811,Brandon is a 30-something man living in New Yo...,1h 41min,[Drama],7.2,2012-01-13,"Brandon (Michael Fassbender) is a successful, ...","Brandon is a successful, handsome thirty-somet..."


In [32]:
pkl.dump(df_imdb, open(f'{tmp_dir}/df_imdb.pkl', 'wb'))