# Get the data

In [2]:
import csv
import io
import os

import pandas as pd
import requests

## Best movies from IMDB

A simple scrape of IMDb's top-rated movies chart. Note that scraping is on the edge of what the website's terms of use allow.

In [None]:
# Fetch the page with `requests` instead of letting pandas download it:
# without an explicit Accept-Language header the chart comes back with Czech titles.
url = "https://www.imdb.com/chart/top/"
response = requests.get(url, headers={"Accept-Language": "en"}, timeout=30)
# Stop here on an HTTP error rather than failing later inside the HTML parser.
response.raise_for_status()
# read_html expects a file-like object; passing literal HTML is deprecated in pandas >= 2.1.
raw_df = pd.read_html(io.BytesIO(response.content))[0]
raw_df

Unnamed: 0.1,Unnamed: 0,Rank & Title,IMDb Rating,Your Rating,Unnamed: 4
0,,1. The Shawshank Redemption (1994),9.2,12345678910 NOT YET RELEASED Seen,
1,,2. The Godfather (1972),9.2,12345678910 NOT YET RELEASED Seen,
2,,3. The Dark Knight (2008),9.0,12345678910 NOT YET RELEASED Seen,
3,,4. The Godfather Part II (1974),9.0,12345678910 NOT YET RELEASED Seen,
4,,5. 12 Angry Men (1957),8.9,12345678910 NOT YET RELEASED Seen,
...,...,...,...,...,...
245,,246. Dersu Uzala (1975),8.0,12345678910 NOT YET RELEASED Seen,
246,,247. Aladdin (1992),8.0,12345678910 NOT YET RELEASED Seen,
247,,248. Gandhi (1982),8.0,12345678910 NOT YET RELEASED Seen,
248,,249. The Help (2011),8.0,12345678910 NOT YET RELEASED Seen,


In [None]:
# Split "<rank>. <title> (<year>)" into separate columns with vectorized string
# methods, then keep only the tidy columns (everything else is discarded by the
# final selection, so no explicit drop of the scraped helper columns is needed).
df = (
    raw_df
    .assign(
        # Split on the first "." only, so dots inside a title are left alone.
        rank=lambda d: d["Rank & Title"].str.split(".", n=1).str[0],
        title_and_year=lambda d: d["Rank & Title"].str.split(".", n=1).str[1],
        # The year is whatever follows the last "(", minus the closing ")".
        year=lambda d: d["title_and_year"].str.rsplit("(", n=1).str[1].str[:-1],
        title=lambda d: d["title_and_year"].str.rsplit("(", n=1).str[0].str.strip(),
    )
    .rename(columns={"IMDb Rating": "rating"})
    .loc[:, ["rank", "title", "year", "rating"]]
)
# An entry that does not follow the expected pattern shows up as a missing value.
assert df[["rank", "title", "year"]].notna().all().all(), "unexpected 'Rank & Title' format"
df

Unnamed: 0,rank,title,year,rating
0,1,The Shawshank Redemption,1994,9.2
1,2,The Godfather,1972,9.2
2,3,The Dark Knight,2008,9.0
3,4,The Godfather Part II,1974,9.0
4,5,12 Angry Men,1957,8.9
...,...,...,...,...
245,246,Dersu Uzala,1975,8.0
246,247,Aladdin,1992,8.0
247,248,Gandhi,1982,8.0
248,249,The Help,2011,8.0


In [None]:
# Save the cleaned ranking as CSV; index=False leaves the row index out of the file.
df.to_csv("top_movies.csv", index=False)

## All movie data from IMDB exports

The datasets are downloaded from IMDb's official exports, as described at https://www.imdb.com/interfaces/

In [14]:
def download_titles():
    """Download IMDb's title basics export and keep non-adult movies and TV movies.

    Reads ``title.basics.tsv.gz`` from datasets.imdbws.com, keeps the rows whose
    ``titleType`` is ``movie`` or ``tvMovie`` and whose ``isAdult`` is 0, drops
    the ``isAdult`` and ``endYear`` columns, renames ``startYear`` to ``year``
    and writes the result to ``imdb_movie_titles.parquet``.

    Returns:
        pandas.DataFrame: the filtered titles with a fresh 0..n-1 index and
        nullable dtypes (``year`` and ``runtimeMinutes`` as ``Int64``).
    """
    title_basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
    title_basics_raw = pd.read_csv(
        title_basics_url,
        sep="\t",
        low_memory=False,
        na_values="\\N",
        # The export is plain tab-separated text and titles may contain literal
        # double quotes. With the default quoting the parser would strip them
        # and could merge several rows into one field, so disable quoting.
        quoting=csv.QUOTE_NONE,
    )
    title_basics = (
        title_basics_raw
        .query("((titleType=='movie') | (titleType=='tvMovie')) & (isAdult==0)")
        .drop(columns=["isAdult", "endYear"])
        .rename(columns={"startYear": "year"})
        .convert_dtypes()
        # Nullable integers: both columns can contain missing values.
        .astype({"year": "Int64", "runtimeMinutes": "Int64"})
        .reset_index(drop=True)
    )
    title_basics.to_parquet("imdb_movie_titles.parquet")
    return title_basics

# Reuse the cached parquet file when it exists; otherwise download the titles
# (which also writes the cache file).
movie_basics = (
    pd.read_parquet("imdb_movie_titles.parquet")
    if os.path.isfile("imdb_movie_titles.parquet")
    else download_titles()
)

In [15]:
# Display the movie titles frame (loaded from the cache or freshly downloaded).
movie_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,year,runtimeMinutes,genres
0,tt0000009,movie,Miss Jerry,Miss Jerry,1894,45,Romance
1,tt0000502,movie,Bohemios,Bohemios,1905,100,
2,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography"
3,tt0000591,movie,The Prodigal Son,L'enfant prodigue,1907,90,Drama
4,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,1907,,Drama
...,...,...,...,...,...,...,...
748519,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,2007,100,Documentary
748520,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,2015,66,Drama
748521,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,2013,,Comedy
748522,tt9916730,movie,6 Gunn,6 Gunn,2017,116,


In [17]:
def download_ratings(movie_ids):
    """Download IMDb's ratings export restricted to the given title ids.

    Reads ``title.ratings.tsv.gz`` from datasets.imdbws.com, keeps only the rows
    whose ``tconst`` is contained in ``movie_ids``, converts the columns to
    pandas' nullable dtypes and writes the result to
    ``imdb_movie_ratings.parquet``.

    Args:
        movie_ids: collection of ``tconst`` identifiers to keep.

    Returns:
        pandas.DataFrame: the matching ratings with a fresh 0..n-1 index.
    """
    source = "https://datasets.imdbws.com/title.ratings.tsv.gz"
    all_ratings = pd.read_csv(source, sep="\t", low_memory=False, na_values="\\N")

    is_wanted = all_ratings["tconst"].isin(movie_ids)
    ratings = all_ratings.loc[is_wanted].convert_dtypes().reset_index(drop=True)

    ratings.to_parquet("imdb_movie_ratings.parquet")
    return ratings

# Only hit the network when no cached ratings file is available.
if not os.path.isfile("imdb_movie_ratings.parquet"):
    # Restrict the ratings to the titles kept in movie_basics.
    movie_ids = movie_basics["tconst"].to_list()
    movie_ratings = download_ratings(movie_ids=movie_ids)
else:
    movie_ratings = pd.read_parquet("imdb_movie_ratings.parquet")

In [18]:
# Display the ratings frame (loaded from the cache or freshly downloaded).
movie_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000009,5.2,199
1,tt0000502,3.7,14
2,tt0000574,6.0,782
3,tt0000591,4.0,19
4,tt0000615,4.1,23
...,...,...,...
326066,tt9916362,6.4,4858
326067,tt9916428,3.8,14
326068,tt9916460,9.4,18
326069,tt9916538,8.3,6
