# Preprocess film input data


In [14]:
# setup cell
import time
import uuid
import json
import os
import requests
import random
import gzip
import calendar
from datetime import datetime
import pandas as pd
import urllib.request
from urllib.parse import urlparse
import re

def download_file(url, file_path):
    """Download ``url`` to ``file_path`` unless that file already exists.

    The payload is first written to ``<file_path>.part`` and only moved to
    ``file_path`` once the transfer has finished. Because the existence of
    ``file_path`` is what makes later calls skip the download, this keeps an
    interrupted transfer from leaving a truncated file that would be treated
    as complete.

    Args:
        url: Address of the resource to fetch.
        file_path: Destination path on the local filesystem.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on network or I/O
        failure; the partial file is removed before the error propagates.
    """
    if os.path.exists(file_path):
        print("File already exists. Skipping download.")
        return

    print(f"Downloading {url}...")
    partial_path = f"{file_path}.part"
    try:
        # Fetch exactly once (the previous version issued the request twice).
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download complete.")

def get_keyword_url(film_id):
    """Return the IMDb keywords-page URL for the given title id.

    ``film_id`` must be truthy; an empty or ``None`` id trips the assertion.
    """
    assert film_id
    url_template = "https://www.imdb.com/title/{}/keywords/?ref_=tt_stry_kw"
    return url_template.format(film_id)

def get_location_url(film_id):
    """Return the IMDb filming-locations-page URL for the given title id.

    ``film_id`` must be truthy; an empty or ``None`` id trips the assertion.
    """
    assert film_id
    url_template = "https://www.imdb.com/title/{}/locations/?ref_=tt_dt_loc"
    return url_template.format(film_id)
    
def load_large_tsv_gz(file_path):
    """Read a gzip-compressed, tab-separated file into a DataFrame.

    The archive is opened in text mode as UTF-8 and parsed with
    ``low_memory=False`` so pandas infers each column's dtype from the whole
    file rather than chunk by chunk.

    Args:
        file_path: Path to the ``.tsv.gz`` file.

    Returns:
        pandas.DataFrame with one column per tab-separated field.
    """
    print(f"Loading {file_path}...")
    with gzip.open(file_path, mode='rt', encoding="utf8") as handle:
        return pd.read_csv(handle, sep='\t', low_memory=False)

def get_film_id_from_url(url):
    """Extract the title id from a URL by position.

    The URL is split on ``/`` and the fifth piece (index 4) is taken, which
    is where the id sits in ``https://www.imdb.com/title/<id>/...``. The
    piece must start with ``tt``; otherwise the assertion fails. URLs with
    fewer than five pieces raise ``IndexError``.
    """
    segments = url.split('/')
    candidate = segments[4]
    assert candidate.startswith('tt')
    return candidate

## Download data

In [2]:
# IMDb non-commercial datasets: https://developer.imdb.com/non-commercial-datasets/
#
# title.basics.tsv.gz columns:
#   tconst, titleType, primaryTitle, originalTitle, isAdult, startYear,
#   endYear, runtimeMinutes, genres
IMDB_DATASETS_URL = "https://datasets.imdbws.com"

# Files are stored next to the notebook (paths relative to the working
# directory). The previous os.path.dirname(__name__) always evaluated to ""
# because __name__ is a module name, not a path, so the resulting paths are
# unchanged.
film_fn = "title.basics.tsv.gz"
ratings_fn = "title.ratings.tsv.gz"
names_fn = "name.basics.tsv.gz"

# Each remote file has the same name as its local copy.
for dataset_fn in (film_fn, ratings_fn, names_fn):
    download_file(f"{IMDB_DATASETS_URL}/{dataset_fn}", dataset_fn)


File already exists. Skipping download.
File already exists. Skipping download.
File already exists. Skipping download.


## Preprocess film data for scraping

In [3]:
# Read the full title.basics dataset, then show how many rows fall under
# each titleType so the filter in the next step can be chosen.
all_movies_df = load_large_tsv_gz(film_fn)
title_type_counts = all_movies_df['titleType'].value_counts()
print(title_type_counts)

Loading title.basics.tsv.gz...
titleType
tvEpisode       8210245
short            991346
movie            678796
video            291238
tvSeries         261721
tvMovie          145235
tvMiniSeries      53894
tvSpecial         47388
videoGame         38089
tvShort           10293
tvPilot               1
Name: count, dtype: int64


In [4]:
# Keep only feature films and TV series; every other titleType is dropped.
cond = all_movies_df["titleType"].isin(['movie', 'tvSeries'])
movies_df = all_movies_df.loc[cond]
movies_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama


In [7]:
# Read the title.ratings dataset and report its columns and row count.
all_films_df = load_large_tsv_gz(ratings_fn)
print(all_films_df.columns)
print(all_films_df.shape[0])

Loading title.ratings.tsv.gz...
Index(['tconst', 'averageRating', 'numVotes'], dtype='object')
1430814


In [8]:
# Restrict the ratings to titles that survived the titleType filter
# (movie and tvSeries), matching rows on tconst.
is_selected_title = all_films_df['tconst'].isin(movies_df['tconst'])
films_df = all_films_df[is_selected_title]
print(len(films_df))
films_df.head()

404007


Unnamed: 0,tconst,averageRating,numVotes
8,tt0000009,5.4,211
144,tt0000147,5.2,512
338,tt0000502,4.4,17
372,tt0000574,6.0,891
380,tt0000591,5.4,24


In [9]:
# Quantiles of the vote counts, denser in the upper tail, to help pick a
# minimum-votes cut-off for scraping.
vote_quantile_levels = [0, 0.25, 0.5, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99, 1]
print("NUM VOTES distribution:")
films_df['numVotes'].quantile(vote_quantile_levels)

NUM VOTES distribution:


0.00          5.0
0.25         17.0
0.50         53.0
0.75        265.0
0.80        419.0
0.85        728.0
0.90       1536.0
0.95       4892.0
0.99      60552.4
1.00    2887541.0
Name: numVotes, dtype: float64

In [10]:
# Generate the files that drive the scraper.
MIN_VOTES = 200      # titles need strictly more votes than this to be scraped
SAMPLE_N = 5000      # size of the smaller sample file
SAMPLE_SEED = 42     # fixed seed so re-running the cell yields the same sample

films_to_scrape_df = (
    films_df[films_df['numVotes'] > MIN_VOTES]
    .sort_values(by='numVotes', ascending=False)
)
print("films n =", len(films_to_scrape_df))

# to_csv does not create missing directories, so make sure data/ exists.
os.makedirs("data", exist_ok=True)

# The frame index is written as the first (unnamed) CSV column, as before.
films_to_scrape_df.to_csv("data/films_to_scrape.csv")

# Cap the sample size so the cell still works when fewer than SAMPLE_N
# titles pass the vote threshold.
films_to_scrape_sample_df = (
    films_to_scrape_df
    .sample(n=min(SAMPLE_N, len(films_to_scrape_df)), random_state=SAMPLE_SEED)
    .sort_values(by='numVotes', ascending=False)
)
films_to_scrape_sample_df.to_csv("data/films_to_scrape_sample.csv")

films n = 115219


**Movie and TV series distribution in the sample file**

In [12]:
# Re-read the sample file and attach the title metadata to each sampled
# row (right join on tconst keeps exactly the sampled titles), then count
# how many of them are movies versus TV series.
sample_csv_path = os.path.join(os.path.dirname(__name__), "data/films_to_scrape_sample.csv")
temp = pd.read_csv(sample_csv_path)
right_merged = all_movies_df.merge(temp, how="right", on="tconst")
print(right_merged.shape)
right_merged['titleType'].value_counts()

(5000, 12)


titleType
movie       4193
tvSeries     807
Name: count, dtype: int64

## Create an empty SQLite database

In [15]:
# NOTE(review): this import sits outside the setup cell; consider moving it
# there so all dependencies are declared in one place.
import sqlite3

# Open a connection to data/films.db. sqlite3.connect creates the database
# file if it does not exist yet, but not the data/ directory itself.
db_conn = sqlite3.connect(os.path.join(os.path.dirname(__name__), "data/films.db"))