In [2]:
import requests
%pip install selenium webdriver-manager beautifulsoup4 pandas
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os, re, time


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Chart page that scrape_movies() loads in the Selenium-driven browser.
TOP250_URL = "https://www.imdb.com/chart/top/"
# Request headers passed to requests.get() in get_credits_from_movie().
HEADERS = {"User-Agent": "Mozilla/5.0"}

def get_driver():
    """Create a headless Chrome WebDriver and return it.

    The chromedriver executable path comes from
    ``ChromeDriverManager().install()``. The driver is returned open; this
    function never quits it, so the caller has to.
    """
    chrome_flags = (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "user-agent=Mozilla/5.0",
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )


def scrape_movies():
    """Scrape the chart at ``TOP250_URL`` and return one dict per movie.

    The page is loaded through the driver from ``get_driver()`` and its
    source is parsed with BeautifulSoup.

    Returns:
        A list of at most 250 dicts with the keys ``title``, ``rating``,
        ``rank``, ``url`` and ``movie_id``. A value that cannot be extracted
        is ``None``. List items without a title link are skipped; ``rank``
        is the item's 1-based position among all list items on the page, so
        a skipped item still uses up its rank.

    The driver is quit in a ``finally`` block, so it is released even when
    loading or parsing raises.
    """
    tconst_pattern = re.compile(r"/title/(tt\d+)/")

    def text_or_none(element):
        # Stripped text of a found tag, or None when the lookup found nothing.
        return element.get_text(strip=True) if element else None

    browser = get_driver()
    try:
        browser.get(TOP250_URL)
        time.sleep(2)  # fixed pause before the page source is read

        page = BeautifulSoup(browser.page_source, "html.parser")
        entries = page.select("li.ipc-metadata-list-summary-item")

        records = []
        for position, entry in enumerate(entries, start=1):
            link = entry.select_one('a.ipc-title-link-wrapper')
            if not link:
                continue

            # The title id (tt + digits) is taken from the link target and
            # used to build the title URL.
            id_match = tconst_pattern.search(link.get('href', ''))
            if id_match:
                tconst = id_match.group(1)
                title_url = f"https://www.imdb.com/title/{tconst}/"
            else:
                tconst = None
                title_url = None

            records.append({
                "title": text_or_none(link.select_one('h3')),
                "rating": text_or_none(entry.select_one("span.ipc-rating-star--rating")),
                "rank": position,
                "url": title_url,
                "movie_id": tconst,
            })

        return records[:250]
    finally:
        browser.quit()

def get_credits_from_movie(url, top_n=3):
    """Download a title page and extract its principal directors and stars.

    Args:
        url: Address of the title page to fetch.
        top_n: Maximum number of star names to keep. Directors are not capped.

    Returns:
        A dict ``{"directors": [...], "stars": [...]}``. Names are in page
        order with duplicates removed. Both lists are empty when the request
        fails or when the page has no matching credit block, so the result
        can always be indexed by key.

    A failed request is reported with ``print`` and is not re-raised.
    """
    # Initialised up front so every return path yields the same shape and
    # neither name can be unbound when no credit block matches.
    directors, stars = [], []

    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"Fetch error for {url}: {e}")
        return {"directors": directors, "stars": stars}

    soup = BeautifulSoup(r.text, "html.parser")

    for block in soup.select('li[data-testid="title-pc-principal-credit"]'):
        label = block.select_one(".ipc-metadata-list-item__label")
        if label is None:
            # A credit block without a label cannot be classified; skip it.
            continue
        label_text = label.get_text(strip=True)

        # dict.fromkeys drops repeated names while keeping first-seen order.
        names = list(dict.fromkeys(
            a.get_text(strip=True) for a in block.select('a[href^="/name/"]')
        ))

        # If several blocks carry the same label, the last one wins.
        if "Director" in label_text:
            directors = names
        elif "Stars" in label_text:
            stars = names[:top_n]

    return {"directors": directors, "stars": stars}

def main():
    """Scrape the chart, attach credits to each movie, and write a CSV.

    Steps:
        1. Collect the movie rows with ``scrape_movies()``.
        2. For every row that has a URL, look up credits with
           ``get_credits_from_movie()`` and store them as comma-separated
           strings under ``stars`` and ``directors``.
        3. Write all rows to ``imdb_top250.csv`` in the current working
           directory and print the resolved path.
        4. Print the first five rows as a preview.
    """
    movies = scrape_movies()

    for movie in movies:
        # Defaults first, so every row has both keys even when the lookup
        # is skipped or yields nothing.
        movie["stars"] = ""
        movie["directors"] = ""
        if not movie.get("url"):
            continue

        # The helper may hand back an empty (falsy) result when the fetch
        # fails; treat that as "no credits" instead of indexing into it.
        credits = get_credits_from_movie(movie["url"]) or {}
        movie["stars"] = ", ".join(credits.get("stars", []))
        movie["directors"] = ", ".join(credits.get("directors", []))

    df = pd.DataFrame(movies, columns=["rank","title","rating","url","movie_id","stars", "directors"])

    out_path = os.path.abspath("imdb_top250.csv")
    df.to_csv(out_path, index=False)
    print(f"✅ Saved {len(df)} rows to:\n{out_path}")

    for row in movies[:5]:
        print(f"{row['rank']:>3}. {row['title']} → {row['stars']}")

# Entry point: run the whole scrape-and-save pipeline when this code is
# executed as the main module (which includes running this notebook cell).
if __name__ == "__main__":
    main()

✅ Saved 250 rows to:
/Users/Mari.Piiriste/TESTING/Social-Graphs-and-Interactions-1/imdb_top250.csv
  1. The Shawshank Redemption → Tim Robbins, Morgan Freeman, Bob Gunton
  2. The Godfather → Marlon Brando, Al Pacino, James Caan
  3. The Dark Knight → Christian Bale, Heath Ledger, Aaron Eckhart
  4. The Godfather Part II → Al Pacino, Robert Duvall, Diane Keaton
  5. 12 Angry Men → Martin Balsam, John Fiedler, Lee J. Cobb
