In [None]:
import requests
import json
import csv
import re
import time
from bs4 import BeautifulSoup

# Default request headers installed on the shared session below.
HEADERS = {"User-Agent": "Mozilla/5.0", "Accept-Language": "en-US,en;q=0.9"}
# Chart page fetched by extract_top_list().
CHART_URL = "https://www.imdb.com/chart/top/"
# Pause (seconds) passed to time.sleep() after each movie in main_save_csv().
DELAY_SECONDS = 0.5

# One shared HTTP session so HEADERS are configured once and reused by every request.
session = requests.Session()
session.headers.update(HEADERS)


def _get_text_or(elem):
    return elem.get_text(strip=True) if elem else "N/A"


def _iso8601_to_human(iso):
    """Convert ISO 8601 duration like PT2H22M to '2h 22m' (fallback to raw if parsing fails)."""
    if not iso:
        return "N/A"
    m = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?", iso)
    if not m:
        return iso
    hours = m.group(1)
    mins = m.group(2)
    parts = []
    if hours:
        parts.append(f"{hours}h")
    if mins:
        parts.append(f"{mins}m")
    return " ".join(parts) if parts else "N/A"


def extract_top_list(timeout=15):
    """Fetch the Top chart page and return its titles from the ``__NEXT_DATA__`` JSON.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the scrape indefinitely.

    Returns:
        A list of dicts with keys ``id``, ``title``, ``year``, ``rating`` and
        ``votes``. Fields absent (or null) in the JSON are ``"N/A"``; a missing
        ``id`` is ``None``.

    Raises:
        requests.RequestException: The request failed or returned an error status.
        RuntimeError: The embedded JSON could not be located or does not have
            the expected structure.
    """
    r = session.get(CHART_URL, timeout=timeout)
    r.raise_for_status()
    html = r.text

    m = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', html, re.S)
    if not m:
        raise RuntimeError("Could not locate __NEXT_DATA__ JSON on chart page.")
    data = json.loads(m.group(1))

    try:
        edges = data["props"]["pageProps"]["pageData"]["chartTitles"]["edges"]
    except (KeyError, TypeError) as e:
        raise RuntimeError("Unexpected JSON structure for Top list.") from e

    items = []
    for edge in edges:
        node = edge.get("node") or {}
        # `or {}` rather than a .get() default: a key that is present with a
        # JSON null would otherwise yield None and break the chained .get().
        title_text = node.get("titleText") or {}
        release_year = node.get("releaseYear") or {}
        ratings = node.get("ratingsSummary") or {}
        items.append({
            "id": node.get("id"),
            "title": title_text.get("text", "N/A"),
            "year": release_year.get("year", "N/A"),
            "rating": ratings.get("aggregateRating", "N/A"),
            "votes": ratings.get("voteCount", "N/A"),
        })
    return items


def scrape_movie_page(url, timeout=15):
    """Visit one movie page and extract runtime, metascore, certificate, description and votes.

    Values are taken from the page's JSON-LD metadata first; any field still
    missing afterwards is looked up through HTML selectors.

    Args:
        url: Address of the title page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with keys ``runtime``, ``metascore``, ``certificate``,
        ``description`` and ``votes``. Any value that could not be determined
        is ``"N/A"``; if the request itself fails, every value is ``"N/A"``.
    """
    missing = (None, "", "N/A")

    try:
        r = session.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort scrape: one unreachable page must not abort the whole run.
        return {"runtime": "N/A", "metascore": "N/A", "certificate": "N/A", "description": "N/A", "votes": "N/A"}

    soup = BeautifulSoup(r.text, "html.parser")

    runtime = "N/A"
    metascore = "N/A"
    certificate = "N/A"
    description = "N/A"
    votes = "N/A"

    # --- Primary source: JSON-LD metadata -------------------------------
    for script in soup.find_all("script", type="application/ld+json"):
        if not script.string:
            continue
        try:
            jd = json.loads(script.string)
        except ValueError:
            # Malformed JSON in this script tag; try the next one.
            continue
        if isinstance(jd, dict) and jd.get("@type") in ("Movie", "TVSeries", "CreativeWork"):
            description = jd.get("description") or description
            dur = jd.get("duration")
            if dur:
                runtime = _iso8601_to_human(dur)
            cert = jd.get("contentRating")
            if cert:
                certificate = cert
            ar = jd.get("aggregateRating")
            # `votes` starts out as the truthy placeholder "N/A", so it must be
            # compared against the placeholder set, not tested for truthiness.
            if isinstance(ar, dict) and votes in missing:
                votes = ar.get("ratingCount") or ar.get("reviewCount") or votes
            break

    # --- Fallbacks: HTML selectors for anything still missing -------------
    if description in missing:
        d = soup.select_one("span[data-testid='plot-xl'], span[data-testid='plot-l'], span[data-testid='plot-xs_to_m']")
        if d:
            description = d.get_text(strip=True)

    if runtime in missing:
        rt_tag = soup.find("li", {"data-testid": "title-techspec_runtime"})
        if rt_tag:
            span = rt_tag.find("div") or rt_tag.find("span")
            if span:
                runtime = span.get_text(strip=True)
        if runtime in missing:
            t = soup.find("time")
            if t:
                runtime = t.get_text(strip=True)

    if certificate in missing:
        cert_tag = soup.find("li", {"data-testid": "title-details-certificate"})
        if cert_tag:
            a = cert_tag.find("a")
            if a:
                certificate = a.get_text(strip=True)

        if certificate in missing:
            meta_cert = soup.find("div", class_=re.compile(r"TitleBlockMetaData.*", re.I))
            if meta_cert:
                abbr = meta_cert.find("a")
                if abbr:
                    txt = abbr.get_text(strip=True)
                    # Accept only short code-like text (letters, digits, hyphen).
                    if re.match(r"^[A-Za-z0-9-]+$", txt):
                        certificate = txt

    if metascore in missing:
        mtag = soup.find("span", class_="score-meta")
        if mtag:
            metascore = mtag.get_text(strip=True)
        else:
            mtag2 = soup.select_one("div.metacriticScore span")
            if mtag2:
                metascore = mtag2.get_text(strip=True)

    if votes in missing:
        vtag = soup.select_one("div[data-testid='hero-rating-bar__aggregate-rating__score'] ~ div[data-testid='hero-rating-bar__vote-count']")
        if vtag:
            votes = vtag.get_text(strip=True)
        else:
            a_tag = soup.select_one("div.imdbRating span[name='nv']")
            if a_tag:
                votes = a_tag.get_text(strip=True)
            else:
                vtag2 = soup.select_one("span[itemprop='ratingCount']")
                if vtag2:
                    votes = vtag2.get_text(strip=True)

    # Normalise any remaining empty value to the "N/A" placeholder.
    return {
        "runtime": runtime if runtime else "N/A",
        "metascore": metascore if metascore else "N/A",
        "certificate": certificate if certificate else "N/A",
        "description": description if description else "N/A",
        "votes": votes if votes else "N/A",
    }


def main_save_csv(filename):
    """Scrape the Top list plus each movie page and write the result to ``<filename>.csv``.

    For every chart entry the per-movie page is scraped for extra details,
    with a pause of DELAY_SECONDS after each entry. Votes found on the movie
    page take precedence over the value from the chart.
    """
    print("Starting IMDb Top list scrape... (this may take a few minutes)")

    records = []
    for rank, movie in enumerate(extract_top_list(), start=1):
        imdb_id = movie.get("id")
        if imdb_id:
            link = f"https://www.imdb.com/title/{imdb_id}/"
            details = scrape_movie_page(link)
        else:
            # No id means no page to visit; keep only what the chart provided.
            link = "N/A"
            details = {
                "runtime": "N/A", "metascore": "N/A", "certificate": "N/A", "description": "N/A", "votes": movie.get("votes", "N/A")
            }

        page_votes = details.get("votes")
        if page_votes and page_votes != "N/A":
            votes = page_votes
        else:
            votes = movie.get("votes", "N/A")

        records.append([
            rank,
            movie.get("title", "N/A"),
            movie.get("year", "N/A"),
            movie.get("rating", "N/A"),
            votes,
            details.get("runtime", "N/A"),
            details.get("metascore", "N/A"),
            details.get("certificate", "N/A"),
            details.get("description", "N/A"),
            link
        ])

        time.sleep(DELAY_SECONDS)

    with open(f"{filename}.csv", "w", newline="", encoding="utf-8") as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(["Rank", "Title", "Year", "IMDb Rating", "Votes", "Runtime", "Metascore", "Certificate", "Description", "Link"])
        csv_out.writerows(records)

    print(f"Done! Scraped {len(records)} movies and saved to {filename}.csv")


if __name__ == "__main__":
    # Ask for the output name; blank input falls back to the default name.
    entered = input("Please enter a filename (no extension): ").strip()
    main_save_csv(entered or "imdb_top250")


Starting IMDb Top list scrape... (this may take a few minutes)
✅ Done! Scraped 250 movies and saved to one.csv


In [None]:
import pandas as pd
import numpy
import re
from collections import Counter

In [None]:
# Load the scraped CSV and discard the Metascore column in one step.
data = pd.read_csv("/content/one.csv").drop(columns=["Metascore"])

In [None]:
data.shape

(250, 9)

In [None]:
data

Unnamed: 0,Rank,Title,Year,IMDb Rating,Votes,Runtime,Certificate,Description,Link
0,1,The Shawshank Redemption,1994,9.3,3081689,2h 22m,R,A banker convicted of uxoricide forms a friend...,https://www.imdb.com/title/tt0111161/
1,2,The Godfather,1972,9.2,2149202,2h 55m,R,The aging patriarch of an organized crime dyna...,https://www.imdb.com/title/tt0068646/
2,3,The Dark Knight,2008,9.1,3056543,2h 32m,PG-13,When a menace known as the Joker wreaks havoc ...,https://www.imdb.com/title/tt0468569/
3,4,The Godfather Part II,1974,9.0,1445413,3h 22m,R,The early life and career of Vito Corleone in ...,https://www.imdb.com/title/tt0071562/
4,5,12 Angry Men,1957,9.0,941911,1h 36m,Approved,The jury in a New York City murder trial is fr...,https://www.imdb.com/title/tt0050083/
...,...,...,...,...,...,...,...,...,...
245,246,Groundhog Day,1993,8.0,722733,1h 41m,PG,"A narcissistic, self-centered weatherman finds...",https://www.imdb.com/title/tt0107048/
246,247,The Help,2011,8.1,517221,2h 26m,PG-13,An aspiring author during the civil rights mov...,https://www.imdb.com/title/tt1454029/
247,248,Drishyam,2015,8.2,103138,2h 43m,Not Rated,Desperate measures are taken by a man who trie...,https://www.imdb.com/title/tt4430212/
248,249,Gangs of Wasseypur,2012,8.2,109337,5h 21m,Not Rated,A clash between Sultan and Shahid Khan leads t...,https://www.imdb.com/title/tt1954470/


In [None]:
# Print mean, median and variance of the numeric columns, one section each.
for heading, statistic in (
    ("Mean:", data.mean),
    ("\nMedian:", data.median),
    ("\nVariance:", data.var),
):
    print(heading)
    print(statistic(numeric_only=True))

Mean:
Rank              125.5000
Year             1988.5320
IMDb Rating         8.3092
Votes          740394.6600
dtype: float64

Median:
Rank              125.5
Year             1995.0
IMDb Rating         8.2
Votes          620145.5
dtype: float64

Variance:
Rank           5.229167e+03
Year           6.439207e+02
IMDb Rating    5.393108e-02
Votes          3.589800e+11
dtype: float64


In [None]:
def parse_min_age(cert):
    """
    Return a numeric minimum age if inferrable, otherwise None.

    Resolution order:
      1. exact match against the known-certificate table;
      2. a known certificate followed by a separator (space, '/' or '-'),
         trying longer certificates first so that 'PG-13 ...' is never
         captured by the shorter 'PG';
      3. the first one- or two-digit number in the text;
      4. adult markers ('ADULT' anywhere, or 'X' / 'XXX' as a whole word).

    Examples:
      'PG-13' -> 13
      'R' -> 17
      '12A' -> 12
      '18+' -> 18
      'Not Rated' -> None
    """
    if pd.isna(cert):
        return None
    s = str(cert).strip().upper()

    mapping = {
        "G": 0, "U": 0, "PG": 10, "PG-13": 13, "PG13": 13,
        "R": 17, "NC-17": 17, "NC17": 17, "NOT RATED": None,
        "APPROVED": None, "TV-MA": 17, "TV-14": 14
    }

    # Exact hit first: "PG-13" must not fall through to the "PG" + "-" prefix rule.
    if s in mapping:
        return mapping[s]

    # Longest key first, so the most specific certificate wins on prefix matches.
    for key in sorted(mapping, key=len, reverse=True):
        if s.startswith((key + " ", key + "/", key + "-")):
            return mapping[key]

    m = re.search(r"\d{1,2}", s)
    if m:
        return int(m.group())

    # Match X / XXX only as whole words; a plain substring test would flag any
    # certificate that merely contains the letter X (e.g. "EXEMPT").
    words = re.findall(r"[A-Z]+", s)
    if "ADULT" in s or any(word in ("X", "XXX") for word in words):
        return 18

    return None


In [None]:
cert_col = "Certificate"
data["min_age"] = data[cert_col].apply(parse_min_age)
# A title counts as 16+ only when a minimum age is known and is at least 16.
data["is_16_plus"] = data["min_age"].apply(
    lambda age: bool(age is not None and age >= 16)
)

# Quick checks
preview_columns = ["Certificate", "min_age", "is_16_plus"]
print(data[preview_columns].head(8))
print("\nCounts:\n", data["is_16_plus"].value_counts(dropna=False))


  Certificate  min_age  is_16_plus
0           R     17.0        True
1           R     17.0        True
2       PG-13     10.0       False
3           R     17.0        True
4    Approved      NaN       False
5       PG-13     10.0       False
6           R     17.0        True
7           R     17.0        True

Counts:
 is_16_plus
False    147
True     103
Name: count, dtype: int64


In [None]:
data

Unnamed: 0,Rank,Title,Year,IMDb Rating,Votes,Runtime,Certificate,Description,Link,min_age,is_16_plus
0,1,The Shawshank Redemption,1994,9.3,3081689,2h 22m,R,A banker convicted of uxoricide forms a friend...,https://www.imdb.com/title/tt0111161/,17.0,True
1,2,The Godfather,1972,9.2,2149202,2h 55m,R,The aging patriarch of an organized crime dyna...,https://www.imdb.com/title/tt0068646/,17.0,True
2,3,The Dark Knight,2008,9.1,3056543,2h 32m,PG-13,When a menace known as the Joker wreaks havoc ...,https://www.imdb.com/title/tt0468569/,10.0,False
3,4,The Godfather Part II,1974,9.0,1445413,3h 22m,R,The early life and career of Vito Corleone in ...,https://www.imdb.com/title/tt0071562/,17.0,True
4,5,12 Angry Men,1957,9.0,941911,1h 36m,Approved,The jury in a New York City murder trial is fr...,https://www.imdb.com/title/tt0050083/,,False
...,...,...,...,...,...,...,...,...,...,...,...
245,246,Groundhog Day,1993,8.0,722733,1h 41m,PG,"A narcissistic, self-centered weatherman finds...",https://www.imdb.com/title/tt0107048/,10.0,False
246,247,The Help,2011,8.1,517221,2h 26m,PG-13,An aspiring author during the civil rights mov...,https://www.imdb.com/title/tt1454029/,10.0,False
247,248,Drishyam,2015,8.2,103138,2h 43m,Not Rated,Desperate measures are taken by a man who trie...,https://www.imdb.com/title/tt4430212/,,False
248,249,Gangs of Wasseypur,2012,8.2,109337,5h 21m,Not Rated,A clash between Sultan and Shahid Khan leads t...,https://www.imdb.com/title/tt1954470/,,False


In [None]:
# Keyword table used to guess genres from a description. The order of the
# genres matters: it determines the order of the inferred genre list.
genre_keywords = {
    "Action": [
        "action", "fight", "battle", "explosion", "chase", "soldier",
        "warrior", "assassin", "combat", "gun", "gunfight",
    ],
    "Drama": [
        "drama", "family", "relationship", "journey", "life", "struggle",
        "emotional", "emotions", "courtroom", "biographical",
    ],
    "Crime": [
        "crime", "criminal", "mafia", "gang", "gangster", "detective",
        "police", "heist", "murder", "investigation", "corrupt",
    ],
    "Comedy": [
        "comedy", "funny", "humor", "hilarious", "satire", "comic",
        "parody", "laugh",
    ],
    "Romance": [
        "romance", "romantic", "love", "affair", "lover", "heart",
        "relationship",
    ],
    "Sci-Fi": [
        "science fiction", "sci-fi", "space", "alien", "robot", "future",
        "time travel", "android", "cyborg",
    ],
    "Fantasy": [
        "fantasy", "magic", "wizard", "dragon", "myth", "sorcerer",
        "kingdom", "fairy",
    ],
    "Horror": [
        "horror", "ghost", "haunt", "terrifying", "demon", "monster",
        "killer", "serial killer", "psychopath",
    ],
    "Animation": [
        "animated", "animation", "anime", "cartoon", "pixar",
        "studio ghibli",
    ],
    "Thriller": [
        "thriller", "suspense", "psychological", "twist", "conspiracy",
        "tension",
    ],
    "Biography": [
        "biography", "biographical", "based on a true story", "biopic",
        "life of",
    ],
    "History": [
        "history", "historical", "period", "period piece", "empire",
        "revolution",
    ],
    "War": ["war", "battle", "soldier", "military", "veteran"],
    "Western": ["western", "cowboy", "outlaw", "sheriff", "frontier"],
    "Mystery": [
        "mystery", "mysterious", "clues", "investigation", "detective",
    ],
    "Musical": [
        "musical", "song", "singer", "dance", "dancing", "orchestra",
    ],
    "Documentary": [
        "documentary", "documentaries", "true story", "real life",
        "interview",
    ],
}

# Lower-cased copy of the table, so matching can be done on lower-cased text.
gen_kw_lower = {
    genre: [keyword.lower() for keyword in keywords]
    for genre, keywords in genre_keywords.items()
}


def infer_genres_from_text(text, top_n=3):
    """
    Return the genres whose keywords occur in *text*, at most top_n of them.

    Matching is a plain, case-insensitive substring test, so a keyword also
    matches inside a longer word. Genres come back in keyword-table order.
    Missing or blank text yields an empty list.
    """
    if pd.isna(text) or not str(text).strip():
        return []
    lowered = str(text).lower()
    hits = [
        genre
        for genre, keywords in gen_kw_lower.items()
        if any(keyword in lowered for keyword in keywords)
    ]
    return hits[:top_n]


In [None]:
desc_col = "Description"

# Guess genres from each description; the first guess becomes the primary genre.
data["inferred_genres"] = data[desc_col].apply(infer_genres_from_text)
data["primary_genre"] = data["inferred_genres"].apply(
    lambda genres: genres[0] if genres else "Unknown"
)

preview_columns = ["Title", desc_col, "inferred_genres", "primary_genre"]
print(data[preview_columns].head(10))
print("\nPrimary genre counts:\n", data["primary_genre"].value_counts().head(20))

                                               Title  \
0                           The Shawshank Redemption   
1                                      The Godfather   
2                                    The Dark Knight   
3                              The Godfather Part II   
4                                       12 Angry Men   
5      The Lord of the Rings: The Return of the King   
6                                   Schindler's List   
7                                       Pulp Fiction   
8  The Lord of the Rings: The Fellowship of the Ring   
9                     The Good, the Bad and the Ugly   

                                         Description   inferred_genres  \
0  A banker convicted of uxoricide forms a friend...                []   
1  The aging patriarch of an organized crime dyna...  [Crime, History]   
2  When a menace known as the Joker wreaks havoc ...                []   
3  The early life and career of Vito Corleone in ...    [Drama, Crime]   
4  The jury i

In [None]:
data

Unnamed: 0,Rank,Title,Year,IMDb Rating,Votes,Runtime,Certificate,Description,Link,min_age,is_16_plus,inferred_genres,primary_genre
0,1,The Shawshank Redemption,1994,9.3,3081689,2h 22m,R,A banker convicted of uxoricide forms a friend...,https://www.imdb.com/title/tt0111161/,17.0,True,[],Unknown
1,2,The Godfather,1972,9.2,2149202,2h 55m,R,The aging patriarch of an organized crime dyna...,https://www.imdb.com/title/tt0068646/,17.0,True,"[Crime, History]",Crime
2,3,The Dark Knight,2008,9.1,3056543,2h 32m,PG-13,When a menace known as the Joker wreaks havoc ...,https://www.imdb.com/title/tt0468569/,10.0,False,[],Unknown
3,4,The Godfather Part II,1974,9.0,1445413,3h 22m,R,The early life and career of Vito Corleone in ...,https://www.imdb.com/title/tt0071562/,17.0,True,"[Drama, Crime]",Drama
4,5,12 Angry Men,1957,9.0,941911,1h 36m,Approved,The jury in a New York City murder trial is fr...,https://www.imdb.com/title/tt0050083/,,False,[Crime],Crime
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,246,Groundhog Day,1993,8.0,722733,1h 41m,PG,"A narcissistic, self-centered weatherman finds...",https://www.imdb.com/title/tt0107048/,10.0,False,[],Unknown
246,247,The Help,2011,8.1,517221,2h 26m,PG-13,An aspiring author during the civil rights mov...,https://www.imdb.com/title/tt1454029/,10.0,False,[],Unknown
247,248,Drishyam,2015,8.2,103138,2h 43m,Not Rated,Desperate measures are taken by a man who trie...,https://www.imdb.com/title/tt4430212/,,False,"[Drama, Crime]",Drama
248,249,Gangs of Wasseypur,2012,8.2,109337,5h 21m,Not Rated,A clash between Sultan and Shahid Khan leads t...,https://www.imdb.com/title/tt1954470/,,False,[],Unknown


In [None]:
import re
def infer_genres_word_boundary(text, top_n=3):
    """
    Return the genres whose keywords occur in *text* as whole words, at most top_n.

    Works like substring-based inference, except that each keyword must sit
    between regex word boundaries, so it no longer matches inside a longer
    word. Missing or blank text yields an empty list.
    """
    if pd.isna(text) or not str(text).strip():
        return []
    lowered = str(text).lower()

    def _has_whole_word(kw):
        return re.search(rf'\b{re.escape(kw)}\b', lowered) is not None

    hits = [
        genre
        for genre, keywords in gen_kw_lower.items()
        if any(_has_whole_word(keyword) for keyword in keywords)
    ]
    return hits[:top_n]



In [None]:
# Write the DataFrame, including the columns derived above, to a new CSV.
# index=False keeps the row index out of the file.
data.to_csv("/content/imdb_top250_enriched.csv", index=False)
print("Saved to /content/imdb_top250_enriched.csv")

Saved to /content/imdb_top250_enriched.csv
