In [27]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
import nltk
import re

In [28]:
# Download the NLTK tokenizer data packages used by this notebook.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)

# Root of every Wikipedia URL built below, and the inclusive range of
# release years to scrape.
BASE_WIKI_URL = "https://en.wikipedia.org"
START_YEAR = 2010
END_YEAR = 2024

# Collected output rows; the scraping loop appends one dict per cast entry.
data_rows = []

# HTTP headers sent with every request issued by get_soup().
headers = {"User-Agent": "Mozilla/5.0"}


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jerar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jerar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Fetch HTML

In [29]:
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole
            scrape indefinitely.

    Returns:
        BeautifulSoup: The response body parsed with ``"html.parser"``.

    Raises:
        requests.RequestException: On connection failure, timeout, or when
            the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as if it
    # were the requested article.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

Extract movies for a year

In [30]:
def clean_english_title(text):
    """Reduce a raw table-cell title to its plain ASCII form.

    Args:
        text: Raw title text, possibly containing parenthesised notes and
            non-ASCII characters.

    Returns:
        str: The title with every ``(...)`` span and every non-ASCII
        character removed, runs of whitespace collapsed to a single space,
        and leading/trailing whitespace stripped. May be the empty string.
    """
    # Remove anything inside parentheses
    text = re.sub(r"\(.*?\)", "", text)
    # Keep only ASCII (English) characters
    text = re.sub(r"[^\x00-\x7F]+", "", text)
    # Removing a span from the middle of the title leaves two adjacent
    # spaces behind; collapse them so titles compare and display cleanly.
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def get_movies_for_year(year):
    """Scrape the Wikipedia list of Malayalam films for one year.

    Every row (after the first) of every ``wikitable`` on the list page is
    inspected; the first ``<td>`` of the row supplies both the title text
    and the article link.
    NOTE(review): this assumes the first ``<td>`` is the title column --
    confirm against the actual table layout of the list pages.

    Args:
        year: Release year used to build the list-page URL.

    Returns:
        list[tuple[str, str]]: ``(title, absolute_article_url)`` pairs, in
        page order. Rows without a non-empty cleaned title or without a
        link to an existing article are skipped.
    """
    url = f"{BASE_WIKI_URL}/wiki/List_of_Malayalam_films_of_{year}"
    soup = get_soup(url)

    movies = []

    for table in soup.find_all("table", class_="wikitable"):
        # The first row of each table is skipped as the header row.
        for row in table.find_all("tr")[1:]:
            cols = row.find_all("td")
            if not cols:
                continue

            title_cell = cols[0]
            title = clean_english_title(title_cell.get_text(strip=True))
            if not title:  # avoid empty titles
                continue

            # Only accept links to real articles. Citation anchors
            # ("#cite_note-...") and red links ("/w/index.php?...") would
            # otherwise be glued onto the base URL and scraped as movies.
            link_tag = title_cell.find("a", href=re.compile(r"^/wiki/"))
            if link_tag is None:
                continue

            movies.append((title, BASE_WIKI_URL + link_tag["href"]))

    return movies


Extract plot

In [31]:
def extract_plot(soup):
    """Return the text of the article's "Plot" section.

    Supports both heading layouts emitted by MediaWiki:

    * legacy:  ``<h2><span id="Plot">Plot</span></h2>``
    * current: ``<div class="mw-heading mw-heading2"><h2 id="Plot">Plot</h2></div>``

    Args:
        soup: Parsed article page.

    Returns:
        str: The text of every ``<p>`` between the Plot heading and the
        next level-2 heading, joined with single spaces; ``""`` when the
        page has no element with ``id="Plot"``.
    """
    anchor = soup.find(id="Plot")
    if anchor is None:
        return ""

    # Find the element whose following siblings form the section body:
    # the <h2> itself, or its "mw-heading" wrapper <div> when present.
    heading = anchor if anchor.name == "h2" else anchor.parent
    wrapper = heading.parent
    if wrapper is not None and "mw-heading" in (wrapper.get("class") or []):
        heading = wrapper

    paragraphs = []
    for sib in heading.find_next_siblings():
        if sib.name == "p":
            paragraphs.append(sib.get_text())
        elif sib.name == "h2" or "mw-heading2" in (sib.get("class") or []):
            # Next top-level section (bare or wrapped heading): stop here.
            break
    return " ".join(paragraphs)


Extract cast

In [32]:
def extract_cast(soup):
    """Collect ``(actor, character)`` pairs from "X as Y" list items.

    Only list items inside the article content (``id="mw-content-text"``)
    are considered, so site chrome such as menu entries containing " as "
    is not mistaken for cast. When that container is absent the whole
    document is searched. Each ``<li>`` is visited exactly once, so items
    in nested lists are not reported twice.

    Args:
        soup: Parsed article page.

    Returns:
        list[tuple[str, str]]: For every ``<li>`` that sits inside a
        ``<ul>`` and whose text contains ``" as "``, the stripped text
        before and after the first ``" as "``.
    """
    content = soup.find(id="mw-content-text")
    if content is None:
        content = soup

    cast_pairs = []
    for li in content.find_all("li"):
        # Keep only items of unordered lists; this skips items of plain
        # <ol> lists that have no <ul> ancestor.
        if li.find_parent("ul") is None:
            continue

        text = li.get_text()
        if " as " not in text:
            continue

        actor, character = text.split(" as ", 1)
        cast_pairs.append((actor.strip(), character.strip()))

    return cast_pairs

Generate character description

In [33]:
def generate_character_description(plot, character_name):
    """Build a short description of a character from the plot text.

    Args:
        plot: Plot text of the movie (may be empty).
        character_name: Character name; only its first whitespace-separated
            word is used for matching.

    Returns:
        str: Up to the first three plot sentences that contain the first
        word of ``character_name`` (case-insensitive substring match),
        joined with spaces. ``""`` when the plot is empty, the name has no
        words, or no sentence matches.
    """
    name_tokens = character_name.split()
    # An empty or whitespace-only name has no first word; return early
    # instead of raising IndexError (which would abort the whole movie in
    # the caller's try block).
    if not plot or not name_tokens:
        return ""

    # Compute the lowered first name once rather than once per sentence.
    first_name = name_tokens[0].lower()
    relevant = [s for s in sent_tokenize(plot) if first_name in s.lower()]
    return " ".join(relevant[:3])

Main scraping loop

In [34]:
# Start from an empty list so that re-running this cell rebuilds the
# dataset instead of appending a second copy of every row.
data_rows = []

for year in range(START_YEAR, END_YEAR + 1):
    print(f"Scraping movies for year {year}...")
    movies = get_movies_for_year(year)

    for title, link in movies:
        try:
            soup = get_soup(link)
            plot = extract_plot(soup)
            cast = extract_cast(soup)

            # One output row per (actor, character) pair found on the page.
            for actor, character in cast:
                char_desc = generate_character_description(plot, character)

                data_rows.append({
                    "movie_name": title,
                    "year": year,
                    "plot": plot,
                    "actor_name": actor,
                    "character_name": character,
                    "character_description": char_desc
                })

        except Exception as e:
            # Best effort: report the failing movie and keep scraping.
            print(f"Failed for {title}: {e}")

        finally:
            # Pause after every request, including failed ones, so errors
            # do not turn into back-to-back requests against the server.
            time.sleep(1)

Scraping movies for year 2010...
Scraping movies for year 2011...
Scraping movies for year 2012...
Scraping movies for year 2013...
Scraping movies for year 2014...
Scraping movies for year 2015...
Scraping movies for year 2016...
Scraping movies for year 2017...
Scraping movies for year 2018...
Scraping movies for year 2019...
Scraping movies for year 2020...
Scraping movies for year 2021...
Scraping movies for year 2022...
Scraping movies for year 2023...
Scraping movies for year 2024...


Save dataset

In [36]:
# Explicit columns keep the frame well-formed when nothing was scraped;
# without them an empty `data_rows` yields a column-less frame and the
# "actor_name" lookup below raises KeyError.
df = pd.DataFrame(
    data_rows,
    columns=[
        "movie_name",
        "year",
        "plot",
        "actor_name",
        "character_name",
        "character_description",
    ],
)
# Drop rows whose "actor" is the literal text "Download" -- presumably a
# non-cast list item matched by the " as " rule (TODO confirm its origin).
invalid_mask = df["actor_name"] == "Download"
df = df[~invalid_mask].reset_index(drop=True)
df.to_csv("malayalam_movie_cast_dataset.csv", index=False)


In [38]:
# Number of distinct movie titles in the saved dataset.
print(df["movie_name"].nunique(dropna=False))

667
