# Get the data

In [None]:
import os
import pandas as pd
import requests

## Best movies from IMDB

A simple scrape of the IMDb top-rated movies chart. Note that scraping this page is on the edge of what the website's terms of use allow.

In [None]:
# Fetch the page with `requests` instead of letting pandas download it:
# without an explicit Accept-Language header the titles came back in Czech.
url = "https://www.imdb.com/chart/top/"
response = requests.get(
    url,
    headers={"Accept-Language": "en"},
    timeout=30,  # fail instead of blocking the kernel if the server never answers
)
# Stop on a 4xx/5xx reply rather than handing an error page to the HTML parser.
response.raise_for_status()
# The chart is the first <table> on the page.
raw_df = pd.read_html(response.content)[0]
raw_df

Unnamed: 0.1,Unnamed: 0,Rank & Title,IMDb Rating,Your Rating,Unnamed: 4
0,,1. The Shawshank Redemption (1994),9.2,12345678910 NOT YET RELEASED Seen,
1,,2. The Godfather (1972),9.2,12345678910 NOT YET RELEASED Seen,
2,,3. The Dark Knight (2008),9.0,12345678910 NOT YET RELEASED Seen,
3,,4. The Godfather Part II (1974),9.0,12345678910 NOT YET RELEASED Seen,
4,,5. 12 Angry Men (1957),8.9,12345678910 NOT YET RELEASED Seen,
...,...,...,...,...,...
245,,246. Dersu Uzala (1975),8.0,12345678910 NOT YET RELEASED Seen,
246,,247. Aladdin (1992),8.0,12345678910 NOT YET RELEASED Seen,
247,,248. Gandhi (1982),8.0,12345678910 NOT YET RELEASED Seen,
248,,249. The Help (2011),8.0,12345678910 NOT YET RELEASED Seen,


In [None]:
# Each "Rank & Title" cell looks like "<rank>. <title> (<year>)".
# Split it with vectorised string methods and store rank/year as integers so
# they sort and compare numerically instead of as text.
df = (
    raw_df
    .assign(
        # Everything before the first "." is the rank.
        rank=lambda d: (
            d["Rank & Title"].str.split(".", n=1).str[0].str.strip().astype(int)
        ),
        # Helper column: the remainder, "<title> (<year>)".
        title_and_year=lambda d: d["Rank & Title"].str.split(".", n=1).str[1],
        # Text after the last "(" is "<year>)"; drop the closing parenthesis.
        year=lambda d: (
            d["title_and_year"]
            .str.rsplit("(", n=1).str[1]
            .str.strip()
            .str.rstrip(")")
            .astype(int)
        ),
        # Text before the last "(" is the title; it may itself contain "." or "(".
        title=lambda d: d["title_and_year"].str.rsplit("(", n=1).str[0].str.strip(),
    )
    .rename(columns={"IMDb Rating": "rating"})
    # Keeping only these four columns also discards the helper and the
    # unused columns of the scraped table.
    .loc[:, ["rank", "title", "year", "rating"]]
)
df

Unnamed: 0,rank,title,year,rating
0,1,The Shawshank Redemption,1994,9.2
1,2,The Godfather,1972,9.2
2,3,The Dark Knight,2008,9.0
3,4,The Godfather Part II,1974,9.0
4,5,12 Angry Men,1957,8.9
...,...,...,...,...
245,246,Dersu Uzala,1975,8.0
246,247,Aladdin,1992,8.0
247,248,Gandhi,1982,8.0
248,249,The Help,2011,8.0


In [None]:
# Write the cleaned chart to disk, leaving out the positional index.
df.to_csv(path_or_buf="top_movies.csv", index=False)

## All movie data from IMDB exports

The datasets used below are described at https://www.imdb.com/interfaces/

In [None]:
def download_titles(
    url="https://datasets.imdbws.com/title.basics.tsv.gz",
    cache_path="imdb_movie_titles.parquet",
):
    """Load the IMDb title basics table and keep only non-adult movies.

    Parameters
    ----------
    url : str
        Location of the tab-separated title basics file. Anything that
        ``pd.read_csv`` can open works (URL or local path). Defaults to the
        IMDb export.
    cache_path : str or None
        File to which the filtered table is written as parquet. Pass ``None``
        to skip writing.

    Returns
    -------
    pandas.DataFrame
        Rows with ``titleType == 'movie'`` and ``isAdult == 0``, without the
        ``isAdult``, ``endYear`` and ``titleType`` columns. ``startYear`` and
        ``runtimeMinutes`` are nullable ``Int64``.
    """
    # "\N" is the missing-value marker in the export; low_memory=False makes
    # pandas infer each column's dtype from the whole file at once.
    title_basics_raw = pd.read_csv(url, sep="\t", low_memory=False, na_values="\\N")
    title_basics = (
        title_basics_raw
        .query("(titleType=='movie') & (isAdult==0)")
        .drop(columns=["isAdult", "endYear", "titleType"])
        .convert_dtypes()
        # Force nullable integers even if the inferred dtype came out differently.
        .astype({"startYear": "Int64", "runtimeMinutes": "Int64"})
    )
    if cache_path is not None:
        title_basics.to_parquet(cache_path)
    return title_basics

# Reuse the parquet cache when it is already on disk; otherwise download the
# export and build it.
movie_basics = (
    pd.read_parquet("imdb_movie_titles.parquet")
    if os.path.isfile("imdb_movie_titles.parquet")
    else download_titles()
)



In [None]:
# Display the filtered movie table as this cell's output.
movie_basics

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
8,tt0000009,Miss Jerry,Miss Jerry,1894,45,Romance
498,tt0000502,Bohemios,Bohemios,1905,100,
570,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography"
587,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90,Drama
610,tt0000615,Robbery Under Arms,Robbery Under Arms,1907,,Drama
...,...,...,...,...,...,...
9185668,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,57,Documentary
9185695,tt9916680,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,2007,100,Documentary
9185707,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,,Comedy
9185718,tt9916730,6 Gunn,6 Gunn,2017,116,


In [None]:
def download_ratings(
    movie_ids,
    url="https://datasets.imdbws.com/title.ratings.tsv.gz",
    cache_path="imdb_movie_ratings.parquet",
):
    """Load the IMDb ratings table restricted to the given title ids.

    Parameters
    ----------
    movie_ids : list-like
        ``tconst`` identifiers to keep; ids with no rating row are ignored.
    url : str
        Location of the tab-separated ratings file. Anything that
        ``pd.read_csv`` can open works (URL or local path). Defaults to the
        IMDb export.
    cache_path : str or None
        File to which the filtered table is written as parquet. Pass ``None``
        to skip writing.

    Returns
    -------
    pandas.DataFrame
        The rating rows whose ``tconst`` is in ``movie_ids``, with dtypes
        converted by ``DataFrame.convert_dtypes``.
    """
    # "\N" is the missing-value marker in the export.
    ratings_raw = pd.read_csv(url, sep="\t", low_memory=False, na_values="\\N")
    ratings = ratings_raw[ratings_raw["tconst"].isin(movie_ids)].convert_dtypes()
    if cache_path is not None:
        ratings.to_parquet(cache_path)
    return ratings

# Only hit the network when no cached ratings file is on disk yet.
if not os.path.isfile("imdb_movie_ratings.parquet"):
    # Restrict the ratings to the movies kept in `movie_basics`.
    movie_ids = movie_basics["tconst"].to_list()
    movie_ratings = download_ratings(movie_ids=movie_ids)
else:
    movie_ratings = pd.read_parquet("imdb_movie_ratings.parquet")

In [None]:
# Display the movie ratings table as this cell's output.
movie_ratings

Unnamed: 0,tconst,averageRating,numVotes
8,tt0000009,5.2,199
334,tt0000502,3.7,14
366,tt0000574,6.0,782
374,tt0000591,4.0,19
388,tt0000615,4.1,23
...,...,...,...
1258442,tt9916270,5.8,1354
1258446,tt9916362,6.4,4855
1258450,tt9916428,3.8,14
1258454,tt9916538,8.3,6


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=234de414-c5f7-4e4d-a314-25100ac19112' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>