In [1]:
from bs4 import BeautifulSoup
import requests
import re

In [2]:
# Request headers passed to every requests.get() call in this notebook.
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Scheme and host that the relative "/wiki/..." paths are appended to.
BASE_URL = "https://en.wikipedia.org"
# Relative path of the list page that is downloaded and parsed.
HIGHEST_GROSSING_FILMS_URL = "/wiki/List_of_highest-grossing_films"

In [3]:
# Download the list page.
# - timeout: without one, requests waits indefinitely on a stalled connection.
# - raise_for_status(): turn an HTTP error response into an exception right
#   here instead of letting an error page be parsed by the following cells.
page = requests.get(BASE_URL + HIGHEST_GROSSING_FILMS_URL, headers=HEADERS, timeout=30)
page.raise_for_status()
page.status_code

200

In [4]:
# Build the parse tree for the downloaded page using the lxml parser.
soup = BeautifulSoup(markup=page.content, features="lxml")

In [5]:
# Locate the ranking table by its CSS classes.
# A class selector requires every listed class to be present but does not care
# about their order or about additional classes, whereas passing the whole
# multi-word class string to find() only matches that exact string.
highest_grossing_films_table = soup.select_one(
    "table.wikitable.sortable.plainrowheaders.sticky-header.col4right.col5center.col6center"
)
if highest_grossing_films_table is None:
    # Fail here with a clear message rather than with an AttributeError on
    # None in one of the cells that read from the table.
    raise LookupError("Could not find the highest-grossing films table on the page")

In [6]:
# Film titles: the link text inside the first <th> cell of each table row.
# Row 0 is skipped (presumably the header row). The rows are selected once and
# sliced, instead of re-running select("tr") for every index.
film_titles = [
    row.select("th")[0].a.text
    for row in highest_grossing_films_table.select("tr")[1:]
]

In [7]:
# Release years: stripped text of the fourth <td> cell (index 3) of each row.
# Row 0 is skipped (presumably the header row). The rows are selected once and
# sliced, instead of re-running select("tr") for every index.
film_release_years = [
    row.select("td")[3].text.strip()
    for row in highest_grossing_films_table.select("tr")[1:]
]

In [8]:
# Absolute URL of each film's own page: BASE_URL plus the href of the link in
# the row's first <th> cell.
# Row 0 is skipped (presumably the header row). The rows are selected once and
# sliced, instead of re-running select("tr") for every index.
film_links = [
    BASE_URL + row.select("th")[0].a.get("href")
    for row in highest_grossing_films_table.select("tr")[1:]
]

In [10]:
# Director names for every film: one list per entry of film_links, in the same
# order, so the result stays aligned with the other per-film lists.
directors = []

for link in film_links:
    # timeout: never hang on a stalled connection.
    # raise_for_status(): stop on an HTTP error instead of parsing an error page.
    film_page = requests.get(link, headers=HEADERS, timeout=30)
    film_page.raise_for_status()
    film_soup = BeautifulSoup(film_page.content, "lxml")

    director_th = film_soup.find("th", string="Directed by")
    director_td = director_th.find_next_sibling("td") if director_th is not None else None
    if director_td is None:
        # No "Directed by" row on this page: append an empty placeholder so
        # the list keeps one entry per film.
        directors.append([])
        continue

    # Drop footnote markers (<sup> elements) before reading the text so that
    # reference numbers never end up inside, or as, a name.
    for footnote in director_td.find_all("sup"):
        footnote.decompose()

    # Take the names from the markup structure (list items, or separate text
    # nodes) rather than splitting on lower-case/upper-case boundaries, which
    # would cut names such as "McKay" or "DeBlois" in two.
    if director_td.find("li"):
        raw_names = [li.get_text(" ", strip=True) for li in director_td.find_all("li")]
    else:
        raw_names = director_td.get_text("\n", strip=True).split("\n")

    # Remove any bracketed reference that was not inside a <sup>, then discard
    # entries that are empty after cleaning.
    cleaned_names = [re.sub(r'\[\d+\]', '', name).strip() for name in raw_names]
    current_directors = [name for name in cleaned_names if name]
    directors.append(current_directors)

In [11]:
# Box-office revenue: stripped text of the third <td> cell (index 2) of each
# row, with everything up to and including the last "$" replaced by a single
# "$" so any prefix in front of the amount is dropped.
# Row 0 is skipped (presumably the header row). The rows are selected once and
# sliced, instead of re-running select("tr") for every index.
films_box_office_revenues = [
    re.sub(r'^.*\$', '$', row.select("td")[2].text.strip())
    for row in highest_grossing_films_table.select("tr")[1:]
]

In [12]:
# Production countries for every film: one list per entry of film_links, in
# the same order, so the result stays aligned with the other per-film lists.
film_countries = []

for link in film_links:
    # timeout: never hang on a stalled connection.
    # raise_for_status(): stop on an HTTP error instead of parsing an error page.
    film_page = requests.get(link, headers=HEADERS, timeout=30)
    film_page.raise_for_status()
    film_soup = BeautifulSoup(film_page.content, "lxml")

    country_th = film_soup.find("th", string=re.compile(r"Country|Countries"))
    country_td = country_th.find_next_sibling("td") if country_th is not None else None
    if country_td is None:
        # No country row on this page: append an empty placeholder so the
        # list keeps one entry per film.
        film_countries.append([])
        continue

    # Drop footnote markers (<sup> elements) before reading the text so that
    # reference numbers never show up as separate "countries".
    for footnote in country_td.find_all("sup"):
        footnote.decompose()

    if country_td.find("li"):
        # Countries given as list items: one entry per <li>.
        raw_countries = [li.get_text(" ", strip=True) for li in country_td.find_all("li")]
    else:
        # Otherwise (e.g. <br>-separated): one entry per text node.
        raw_countries = country_td.get_text("\n", strip=True).split("\n")

    # Remove any bracketed reference that was not inside a <sup>, then discard
    # entries that are empty after cleaning. No camel-case splitting is done:
    # the entries are already separated by the markup, and inserting a comma
    # into an element would leave two countries glued together in one string.
    cleaned_countries = [re.sub(r'\[\d+\]', '', country).strip() for country in raw_countries]
    current_countries = [country for country in cleaned_countries if country]

    film_countries.append(current_countries)

In [13]:
import sqlite3
import json

In [14]:
# Open the SQLite database stored in films.db (path relative to the working
# directory) and get a cursor for running statements on that connection.
connection = sqlite3.connect(database="films.db")
cursor = connection.cursor()

In [15]:
# DDL for the films table. IF NOT EXISTS makes the statement safe to run again
# when the table is already present.
# Columns: auto-incrementing integer id, required title, release_year declared
# as INTEGER, and director / box_office_revenue / country stored as TEXT.
create_films_table = """CREATE TABLE IF NOT EXISTS 
films(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    release_year INTEGER,
    director TEXT,
    box_office_revenue TEXT,
    country TEXT
)
"""
# Being the cell's last expression, the cursor returned by execute() is echoed
# as the cell output.
cursor.execute(create_films_table)

<sqlite3.Cursor at 0x10deb39c0>

In [16]:
# Store one row per scraped film in the films table.
scraped_columns = (film_titles, film_release_years, directors, films_box_office_revenues, film_countries)

# zip() silently stops at the shortest input, so a misaligned column would
# drop films without any error. Check the lengths up front instead.
if len({len(column) for column in scraped_columns}) != 1:
    raise ValueError("Scraped columns have different lengths; re-run the scraping cells before inserting")

# Build the rows in a comprehension so its variables stay local: looping with
# a variable named `directors` would overwrite the global `directors` list and
# break any re-run of this cell.
film_rows = [
    (title, release_year, ", ".join(film_directors), revenue, ", ".join(countries))
    for title, release_year, film_directors, revenue, countries in zip(*scraped_columns)
]

# Using the connection as a context manager commits when the block succeeds
# and rolls back if it raises, so the rows are actually persisted to films.db.
with connection:
    # Replace the previous snapshot so that re-running the notebook does not
    # accumulate duplicate rows.
    cursor.execute("DELETE FROM films")
    cursor.executemany(
        "INSERT INTO films (title, release_year, director, box_office_revenue, country) VALUES (?, ?, ?, ?, ?)",
        film_rows,
    )

In [17]:
# Read every row of the films table back from the database.
# The query runs on the shared cursor, so cursor.description afterwards
# describes the columns of this result set.
films_data = cursor.execute("SELECT * FROM films").fetchall()

In [18]:
# The column name is the first field of each entry in cursor.description.
column_names = [name for name, *_ in cursor.description]

# Pair each row's values with the column names: one dictionary per film.
films_list = [
    {name: value for name, value in zip(column_names, row)}
    for row in films_data
]

# Save the list as JSON with 4-space indentation in films.json.
with open("films.json", "w") as output_file:
    json.dump(films_list, output_file, indent=4)