In [1]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
import nltk
import wikipedia
from lxml import html
import os
import re
import getpass
from google import genai
# Module-wide HTTP settings: the article URL prefix for English Wikipedia and
# the headers dict passed along with outgoing Wikipedia API requests.
BASE_URL = "https://en.wikipedia.org/wiki/"
headers = {"User-Agent": "Mozilla/5.0"}

In [2]:
def search_wikipedia(movie_name,year):
    """Return the page id of the top English-Wikipedia search hit for a movie.

    The search text is the movie name followed by the year, sent to the
    MediaWiki ``action=query&list=search`` endpoint.

    Args:
        movie_name: Text to search for (converted with ``str``).
        year: Value appended to the search text. It is left out when it is
            None or NaN so the literal words "None"/"nan" are never searched.

    Returns:
        The ``pageid`` of the first search result, or None when the response
        contains no results (including error payloads without a ``query``
        section).

    Raises:
        requests.HTTPError: if the API replies with a 4xx/5xx status.
        requests.RequestException: on connection failure or after the
            20-second timeout.
    """
    search_url = "https://en.wikipedia.org/w/api.php"

    query_terms = [str(movie_name).strip()]
    # `year == year` is False only for NaN, which is how a missing CSV value arrives.
    if year is not None and year == year:
        query_terms.append(str(year).strip())

    params = {
        "action": "query",
        "list": "search",
        "srsearch": " ".join(query_terms),
        "format": "json"
    }

    r = requests.get(search_url, params=params, headers=headers,timeout=20)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    r.raise_for_status()
    data = r.json()

    # Error responses carry an "error" object and no "query" key; treat them
    # the same as an empty result list rather than raising KeyError.
    hits = data.get("query", {}).get("search") or []
    if hits:
        return hits[0]["pageid"]
    return None


# --------------------------
# Get plot from page
# --------------------------
def get_plot_from_pageid(pageid):
    """Return the text of the "Plot" section of a Wikipedia page.

    Args:
        pageid: Wikipedia page id, or None when no page was found.

    Returns:
        The "Plot" section text. An empty string is returned when ``pageid``
        is None, when the page has no "Plot" section, or when the id resolves
        to a disambiguation page, so callers always receive a ``str``.

    Raises:
        Any other exception raised by the ``wikipedia`` package while loading
        the page is propagated unchanged.
    """
    if pageid is None:
        print("No page exists for pageid",pageid)
        return ""

    try:
        page = wikipedia.WikipediaPage(pageid=pageid)
        text = page.section(section_title="Plot")
    except wikipedia.DisambiguationError as e:
        print("error occured",e)
        return ""

    # section() yields None when the page has no such heading; normalise that
    # to "" so it matches every other "no plot" path of this function.
    return text or ""









In [3]:
import pandas as pd
import time
import os

# Fetch a Wikipedia plot for every unique (movie_name, year) pair in the cast
# dataset. Each result is appended to the output CSV immediately, so an
# interrupted run can be restarted and will continue where it stopped.
df_cast = pd.read_csv("malayalam_movie_cast_dataset.csv")
df_movies = df_cast[["movie_name", "year"]].drop_duplicates()

output_file = "movies_with_plot.csv"

# Create the output file with just a header row on the first run.
if not os.path.exists(output_file):
    pd.DataFrame(columns=["movie_name", "year", "plot"]).to_csv(output_file, index=False)

# Resume key is (movie_name, year) - the same key df_movies is de-duplicated
# on - so a same-titled film from a different year is not skipped by mistake.
done_df = pd.read_csv(output_file)
done = set(zip(done_df["movie_name"], done_df["year"]))

for _, row in df_movies.iterrows():
    movie = row["movie_name"]
    year = row["year"]

    if (movie, year) in done:
        continue

    print(f"Processing: {movie}")

    try:
        # Both helpers hit the network; any failure is logged below and the
        # movie is recorded with an empty plot instead of aborting the run.
        page_id = search_wikipedia(movie, year)
        plot = get_plot_from_pageid(page_id)

    except Exception as e:
        print("Skipping", movie, "due to", e)
        plot = ""

    # Append a single row (no header) so progress survives a crash.
    pd.DataFrame([{
        "movie_name": movie,
        "year": year,
        "plot": plot
    }]).to_csv(output_file, mode="a", header=False, index=False)
    done.add((movie, year))

    # Pause between movies to keep the request rate low.
    time.sleep(1)

print("✅ Done")


✅ Done


In [4]:
import pandas as pd

# Report how many saved movies have an effectively empty plot.
df = pd.read_csv("movies_with_plot.csv")

# Work on the three relevant columns only, without exact duplicate rows.
df = df[["movie_name", "year", "plot"]].drop_duplicates()

# A plot counts as empty when it is missing or has fewer than 5 characters
# once surrounding whitespace is removed.
plot_lengths = df["plot"].fillna("").astype(str).str.strip().str.len()

empty_count = int((plot_lengths < 5).sum())
total = len(df)

print("Total movies with empty plots:", empty_count, "among", total)


Total movies with empty plots: 0 among 597


In [5]:
# searching for remaining plots via OMDb API
# The API key is read from the environment so it never appears in the notebook.
OMDB_KEY = os.getenv("OMDB_KEY")
def get_plot_from_OMDb(movie,year):
    """Look up the full plot of a movie on OMDb by exact title and year.

    Args:
        movie: Title sent as OMDb's ``t`` (title lookup) parameter.
        year: Release year sent as the ``y`` parameter.

    Returns:
        The plot text, or an empty string when OMDb reports no match or has
        no plot for the title (OMDb marks missing fields with "N/A").

    Raises:
        requests.RequestException: on connection failure or after the
            20-second timeout.
        ValueError: if the response body is not valid JSON.
    """
    params = {
        "apikey": OMDB_KEY,
        "t": movie,      # use title, not search
        "y": year,
        "plot": "full"
    }
    # HTTPS keeps the API key in the query string from travelling in clear text.
    r = requests.get("https://www.omdbapi.com/", params=params, timeout=20)
    data = r.json()

    if data.get("Response") != "True":
        print("Not found:", movie, data.get("Error"))
        return ""

    plot = data.get("Plot") or ""
    # "N/A" is OMDb's placeholder for an absent value, not a real plot.
    if plot.strip() == "N/A":
        return ""
    return plot





# Back-fill plots that are still blank by asking OMDb, one movie at a time.
df = pd.read_csv("movies_with_plot.csv")
df["plot"] = df["plot"].fillna("")

# True for rows whose plot is empty or whitespace-only; only those are queried.
needs_plot = df["plot"].map(lambda text: text.strip() == "")

for i, row in df[needs_plot].iterrows():
    movie, year = row["movie_name"], row["year"]

    print(f"Fetching plot for: {movie} ({year})")

    try:
        plot = get_plot_from_OMDb(movie, year)
    except Exception as e:
        # A failed lookup is logged and leaves the plot blank.
        print("Error for", movie, ":", e)
        plot = ""

    # Store the result and rewrite the whole CSV right away so progress is
    # kept even if the loop is interrupted.
    df.loc[i, "plot"] = plot
    df.to_csv("movies_with_plot.csv", index=False)

    time.sleep(1)

print("✅ CSV updated with missing plots")


✅ CSV updated with missing plots


In [6]:
import pandas as pd

# Rewrite the plots CSV with its rows ordered by ascending year.
df = pd.read_csv("movies_with_plot.csv").sort_values(by="year")
df.to_csv("movies_with_plot.csv", index=False)

print("✅ CSV sorted by year")


✅ CSV sorted by year


In [7]:

# Keep only the first row for each (movie_name, year) pair and save the result
# back to the same CSV.
df = pd.read_csv("movies_with_plot.csv")

is_repeat = df.duplicated(subset=["movie_name", "year"])
df = df[~is_repeat]

df.to_csv("movies_with_plot.csv", index=False)

print("✅ Kept only unique movie-year combinations")


✅ Kept only unique movie-year combinations


In [8]:
#cleaning data by removing empty movies
import pandas as pd

# Load both files
movies_df = pd.read_csv("movies_with_plot.csv")                 # movie_name, year, plot
roles_df = pd.read_csv("malayalam_movie_cast_dataset.csv")      # cast rows; includes movie_name and year

# Plots of this many characters or fewer (after trimming) are treated as missing.
MIN_VALID_PLOT_CHARS = 50

# 1. Keep only movies that have valid plot
movies_df["plot"] = movies_df["plot"].fillna("").str.strip()
movies_with_plot = movies_df[movies_df["plot"].str.len() > MIN_VALID_PLOT_CHARS]

# 2. Collect the (movie_name, year) pairs that survived. Movies are identified
#    by name AND year, so matching on the name alone would keep the cast of a
#    same-titled film whose own plot is missing.
valid_keys = set(zip(movies_with_plot["movie_name"], movies_with_plot["year"]))
# Name-only set, retained in case a later cell still refers to it.
valid_movies = set(movies_with_plot["movie_name"])

# 3. Filter roles file using the valid (movie_name, year) pairs.
#    An explicit boolean Series is used so an empty roles file still filters
#    rows rather than being read as an empty column selection.
keep_role = pd.Series(
    [key in valid_keys for key in zip(roles_df["movie_name"], roles_df["year"])],
    index=roles_df.index,
    dtype=bool,
)
roles_clean = roles_df[keep_role]

# 4. Save cleaned files
# NOTE: both input files are overwritten in place, so this step is destructive;
# keep a copy of the original cast dataset if it must be re-processed later.
movies_with_plot.to_csv("movies_with_plot.csv", index=False)
roles_clean.to_csv("malayalam_movie_cast_dataset.csv", index=False)

print("Cleaning done:")
print("Movies with plot:", len(movies_with_plot))
print("Roles after filtering:", len(roles_clean))


Cleaning done:
Movies with plot: 597
Roles after filtering: 9297
