In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from tqdm import tqdm
import json
import concurrent.futures
import multiprocessing
import time

MAX_THREADS = 30

In [2]:
title_df = pd.read_csv('zippedData/title.basics.tsv.gz', delimiter='\t', error_bad_lines=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
title_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
title_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950519 entries, 0 to 8950518
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 614.6+ MB


In [5]:
title_df = title_df.query('titleType == "movie"')

In [6]:
title_df.startYear = title_df.startYear.apply(lambda x: np.nan if x == '\\N' else int(x))

In [7]:
title_df.dropna(inplace=True)

In [8]:
title_df.reset_index(drop=True, inplace=True)
title_df = title_df.query('isAdult == 0')
title_df.drop(labels=['endYear', 'isAdult'], axis=1, inplace=True)
title_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
0,tt0000502,movie,Bohemios,Bohemios,1905.0,100,\N
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,1906.0,70,"Action,Adventure,Biography"
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,1907.0,90,Drama
3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,1907.0,\N,Drama
4,tt0000630,movie,Hamlet,Amleto,1908.0,\N,Drama


In [9]:
title_df = title_df.query('startYear >= 2010 and startYear < 2022').sort_values('startYear').reset_index(drop=True)
title_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
0,tt1816571,movie,Lyubov pod prikrytiem,Lyubov pod prikrytiem,2010.0,94,Drama
1,tt1082075,movie,Pusher,Pusher,2010.0,107,"Action,Crime,Drama"
2,tt1774264,movie,23,23,2010.0,104,"Action,Comedy"
3,tt1774269,movie,A Backyard Story,A Backyard Story,2010.0,70,Drama
4,tt1774281,movie,Aku atau Dia?,Aku atau Dia?,2010.0,97,"Comedy,Romance"


In [10]:
title_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190364 entries, 0 to 190363
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          190364 non-null  object 
 1   titleType       190364 non-null  object 
 2   primaryTitle    190364 non-null  object 
 3   originalTitle   190364 non-null  object 
 4   startYear       190364 non-null  float64
 5   runtimeMinutes  190364 non-null  object 
 6   genres          190364 non-null  object 
dtypes: float64(1), object(6)
memory usage: 10.2+ MB


In [24]:
full_details_df = title_df
full_details_df['budget'] = np.nan
full_details_df['domestic_box_office'] = np.nan
full_details_df['worldwide_box_office'] = np.nan
full_details_df['origin'] = np.nan
full_details_df['genres'] = np.nan
full_details_df['user_rating'] = np.nan
full_details_df['principals'] =  np.nan
full_details_df['director'] =  np.nan
full_details_df['rating'] =  np.nan
full_details_df['published_date'] =  np.nan
full_details_df['creator'] =  np.nan

In [25]:
def get_data_from_dataid(dataid, final_data_holder, soup, isNumber=False):
    data_li = soup.select(dataid)
    data = [y.get_text() for x in data_li for y in x.find_all(final_data_holder, {'class':"ipc-metadata-list-item__list-content-item"})]
    
    if len(data) > 0:
        if len(data) > 4:
            data = data[:4]
        if isNumber:
            data = int(''.join([x for x in data[0] if x.isnumeric()]))
        return data
    else:
        return

In [26]:
def download_url(index):
    row = full_details_df.iloc[index]
    title_id = row.tconst
    url = f'https://imdb.com/title/{title_id}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    metadata = soup.find('script')
    try:
        meta_json = json.loads(metadata.contents[0])
        meta_keys = meta_json.keys()
        row['genres'] = meta_json['genre'] if 'genre' in meta_keys else np.nan
        row['user_rating'] = meta_json['aggregateRating']['ratingValue'] if 'aggregateRating' in meta_keys else np.nan
        row['principals'] = [{'name':x['name'], 'id':x['url'][-10:-1]} for x in meta_json['actor']] if 'actor' in meta_keys else np.nan
        row['director'] = [{'name':x['name'], 'id':x['url'][-10:-1]} for x in meta_json['director']] if 'director' in meta_keys else np.nan
        row['rating'] = meta_json['contentRating'] if 'contentRating' in meta_keys else np.nan
        row['published_date'] = meta_json['datePublished'] if 'datePublished' in meta_keys else np.nan
        row['creator'] = [{'id':x['url'][-10:-1]} for x in meta_json['creator']] if 'creator' in meta_keys else np.nan
    except:
        pass
    

    row['budget'] = get_data_from_dataid('li[data-testid="title-boxoffice-budget"]', 'span', soup, isNumber = True)
    row['domestic_box_office'] = get_data_from_dataid('li[data-testid="title-boxoffice-grossdomestic"]', 'span', soup, isNumber = True)
    row['worldwide_box_office'] = get_data_from_dataid('li[data-testid="title-boxoffice-cumulativeworldwidegross"]', 'span', soup, isNumber = True)
    row['origin'] = get_data_from_dataid('li[data-testid="title-details-origin"]', 'a', soup)
    
    
    print(index, end="\r")
    full_details_df.iloc[index] = row
    
    time.sleep(0.25)
    
    

In [27]:
num_processes = multiprocessing.cpu_count()
num_processes

8

In [30]:
pd.options.mode.chained_assignment = None

indices = [range(10)]

threads = min(MAX_THREADS, len(indices))
    
with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
    executor.map(download_url, indices)

range(0, 10)

In [33]:
full_details_df.iloc[:10]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,budget,domestic_box_office,worldwide_box_office,origin,user_rating,principals,director,rating,published_date,creator
0,tt1816571,movie,Lyubov pod prikrytiem,Lyubov pod prikrytiem,2010.0,94,,,,,,,,,,,
1,tt1082075,movie,Pusher,Pusher,2010.0,107,,,,,,,,,,,
2,tt1774264,movie,23,23,2010.0,104,,,,,,,,,,,
3,tt1774269,movie,A Backyard Story,A Backyard Story,2010.0,70,,,,,,,,,,,
4,tt1774281,movie,Aku atau Dia?,Aku atau Dia?,2010.0,97,,,,,,,,,,,
5,tt1774283,movie,Am I Alone?,Am I Alone?,2010.0,102,,,,,,,,,,,
6,tt1774286,movie,Ángel caído,Ángel caído,2010.0,101,,,,,,,,,,,
7,tt1774295,movie,Awaken,Awaken,2010.0,84,,,,,,,,,,,
8,tt1774298,movie,Balls Count Anywhere,Balls Count Anywhere,2010.0,\N,,,,,,,,,,,
9,tt1774306,movie,Bohlol Dana - A Sage of Baghdad,Bohlol Dana - A Sage of Baghdad,2010.0,117,,,,,,,,,,,
