In [3]:
!py -m pip install BeautifulSoup4
!py -m pip install requests



In [2]:
from bs4 import BeautifulSoup
import requests

# Scraper configuration: site origin, path of the list article, and the
# User-Agent header sent with every request.
BASE_URL = 'https://en.wikipedia.org'
URL = '/wiki/List_of_highest-grossing_films'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Fetch the list page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() surfaces HTTP errors here instead of
# letting an error page be parsed as if it were the article.
response = requests.get(BASE_URL + URL, headers=HEADERS, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# select_one() takes a CSS selector; a dict passed as its second argument is
# treated as a namespace mapping and does NOT filter by class, so the class
# requirement has to be part of the selector itself. Only the stable
# `wikitable` class is used: `jquery-tablesorter` is added by JavaScript in
# the browser and is not present in the HTML returned to requests.
table = soup.select_one('table.wikitable')
if table is None:
    raise RuntimeError('No table.wikitable found on ' + BASE_URL + URL)


In [27]:
import time 

def get_more_data(link):
    """Scrape director, box office, country and language from a film page.

    Args:
        link: Site-relative URL of the film article; it is appended to
            ``BASE_URL`` to build the request URL.

    Returns:
        A ``(director, revenue, country, language)`` tuple holding the raw
        text of the matching infobox cells. A field is ``None`` when the page
        has no infobox or the infobox has no row with the expected label.
    """
    # Infobox row label -> field name. Both singular and plural spellings of
    # the country/language labels are accepted so neither variant is missed.
    labels = {
        'Directed by': 'director',
        'Box office': 'revenue',
        'Country': 'country',
        'Countries': 'country',
        'Language': 'language',
        'Languages': 'language',
    }
    details = {'director': None, 'revenue': None, 'country': None, 'language': None}

    time.sleep(1)  # throttle: one request per second
    response = requests.get(BASE_URL + link, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # The class must be part of the CSS selector; select_one('table', {...})
    # ignores the dict and returns the first <table> on the page, which is
    # not necessarily the infobox.
    infobox = soup.select_one('table.infobox')
    if infobox is not None:
        # Rows are matched by label, so every row is inspected rather than
        # skipping a fixed number of leading rows.
        for row in infobox.select('tr'):
            header = row.find('th')
            cell = row.find('td')
            if header is None or cell is None:
                continue
            field = labels.get(header.get_text(strip=True))
            if field is not None:
                details[field] = cell.get_text(strip=True)

    return details['director'], details['revenue'], details['country'], details['language']

In [28]:
# Build one record per film: title, year and article link come from the list
# table; the remaining fields come from the film's own page.
movies = []
for row in table.select("tr")[1:]:  # [1:] skips the header row
    header_cell = row.find('th')
    anchor = header_cell.find('a') if header_cell is not None else None
    columns = row.find_all('td')
    # Skip rows that do not look like a film entry instead of aborting the
    # whole scrape with an AttributeError / IndexError.
    if anchor is None or not anchor.get('href') or len(columns) < 4:
        continue

    # Title and link are read from the same anchor so they always refer to
    # the same article.
    title = anchor.get_text(strip=True)
    link = anchor['href']
    year = columns[3].get_text(strip=True)

    director, revenue, country, language = get_more_data(link)

    movie = {'title': title, 'year': year, 'director': director, 'revenue': revenue, 'country': country, 'language': language}
    movies.append(movie)

In [34]:
import re

def clean_wikipedia_text(text):
    """Strip footnote markers and currency symbols from scraped cell text.

    Three passes are applied in order:

    1. Remove every bracketed marker such as ``[1]``, ``[a]``, ``[nb 1]`` or
       ``[# 3]`` (anything between ``[`` and the next ``]``).
    2. Insert ``", "`` wherever a lowercase ASCII letter is immediately
       followed by an uppercase one, which separates values that were
       concatenated without a delimiter. Note that this also splits
       legitimate mixed-case words.
    3. Remove ``$``, ``€`` and ``£``.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Markers are removed first so that values separated only by a marker
    # become adjacent and are split by the case-boundary rule below.
    cleaned = re.sub(r'\[[^\[\]]*\]', '', text)
    cleaned = re.sub(r'([a-z])([A-Z])', r'\1, \2', cleaned)
    cleaned = re.sub(r'[\$€£]', '', cleaned)

    return cleaned.strip()

def clean_movie_data(movie):
    for key, value in movie.items():
        if isinstance(value, str):
            movie[key] = clean_wikipedia_text(value)
        elif isinstance(value, list):
            movie[key] = [clean_wikipedia_text(v) for v in value]
    return movie

# Clean each scraped record in place; the return value is the same dict
# object, so it does not need to be stored.
for record in movies:
    clean_movie_data(record)


In [38]:
import sqlite3

# Persist the cleaned records to films.db. try/finally guarantees the
# connection is closed even if an insert fails part-way through.
conn = sqlite3.connect('films.db')
try:
    cursor = conn.cursor()

    cursor.execute('''
    CREATE TABLE IF NOT EXISTS films (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        release_year INTEGER,
        director TEXT,
        box_office TEXT,
        country TEXT,
        language TEXT
    )
    ''')
    conn.commit()

    for movie in movies:
        # Store the year as an integer when it is purely numeric so the
        # duplicate check below compares like with like.
        year = movie['year']
        if isinstance(year, str) and year.isdigit():
            year = int(year)

        # The keys always exist (possibly with value None), so
        # dict.get(key, default) would never apply the default; `or` does.
        # The NOT EXISTS guard makes the cell safe to re-run: a film already
        # stored with the same title and year is not inserted again.
        cursor.execute('''
        INSERT INTO films (title, release_year, director, box_office, country, language)
        SELECT ?, ?, ?, ?, ?, ?
        WHERE NOT EXISTS (
            SELECT 1 FROM films WHERE title = ? AND release_year IS ?
        )
        ''', (
            movie['title'],
            year,
            movie.get('director') or 'Unknown',
            movie.get('revenue') or 'N/A',
            movie.get('country') or 'Unknown',
            movie.get('language') or 'Unknown',
            movie['title'],
            year,
        ))

    conn.commit()
finally:
    conn.close()


In [None]:
import json

# Export the films table to films.json.
# Columns are listed explicitly (instead of SELECT * plus positional indexes)
# so the JSON keys cannot drift from the query, and `language` is included.
export_columns = ('id', 'title', 'release_year', 'director', 'box_office', 'country', 'language')

# `with sqlite3.connect(...)` only manages the transaction and leaves the
# connection open, so the connection is closed explicitly here.
conn = sqlite3.connect('films.db')
try:
    cursor = conn.cursor()
    cursor.execute(
        'SELECT id, title, release_year, director, box_office, country, language '
        'FROM films ORDER BY id'
    )
    data = cursor.fetchall()
finally:
    conn.close()

json_data = [dict(zip(export_columns, row)) for row in data]

with open('films.json', 'w', encoding='utf-8') as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4)



Exported to films.json
