In [2]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
import nltk
import wikipedia
from lxml import html
import os
import re
import getpass
from google import genai
# Default HTTP headers passed to requests.get() by search_wikipedia.
# NOTE(review): a bare "Mozilla/5.0" User-Agent is generic; Wikimedia's
# User-Agent policy reportedly asks for a descriptive identifier with contact
# details — confirm and consider replacing.
headers = {
    "User-Agent": "Mozilla/5.0"
}
# Base URL of English Wikipedia article pages (not referenced by the cells
# visible in this notebook).
BASE_URL = "https://en.wikipedia.org/wiki/"

In [None]:
def search_wikipedia(movie_name,year):
    """Return the page id of the top Wikipedia search hit for a movie.

    Sends a ``list=search`` query to the English Wikipedia API using the
    movie name followed by the year as the search string.

    Args:
        movie_name: Title of the movie to look up.
        year: Release year; appended to the search string to disambiguate.

    Returns:
        The ``pageid`` of the first search result, or ``None`` when the
        response contains no search results (including payloads that carry
        no ``query`` key at all, such as API error responses).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failure or timeout.
    """
    search_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": f"{movie_name}  {year}",
        "format": "json"
    }

    r = requests.get(search_url, params=params, headers=headers, timeout=20)
    # Surface HTTP failures (rate limiting, server errors) as exceptions
    # instead of trying to parse an error page as a search result.
    r.raise_for_status()
    data = r.json()

    # Do not assume the "query" -> "search" path exists: indexing it directly
    # raised KeyError whenever the payload had a different shape.
    results = (data.get("query") or {}).get("search") or []
    if not results:
        return None
    return results[0].get("pageid")


# --------------------------
# Get plot from page
# --------------------------
def get_plot_from_pageid(pageid):
    """Return the text of the "Plot" section of a Wikipedia page.

    Args:
        pageid: Wikipedia page id as returned by ``search_wikipedia``, or
            ``None`` when the search found nothing.

    Returns:
        The plot text as a string. An empty string is returned when
        ``pageid`` is ``None``, when the page cannot be loaded
        (disambiguation page or missing page), or when the page has no
        section titled "Plot".
    """
    if pageid is None:
        print("No page exists for pageid",pageid)
        return ""

    try:
        page = wikipedia.WikipediaPage(pageid=pageid)
        text = page.section(section_title="Plot")
    except (wikipedia.DisambiguationError, wikipedia.PageError) as e:
        print("error occured",e)
        return ""

    # section() yields None when the section is absent; normalise that to ""
    # so every "no plot" outcome has the same string type for the caller.
    return text or ""









In [8]:
import pandas as pd
import time
import os

# Build the list of distinct (movie, year) pairs from the cast dataset.
df_cast = pd.read_csv("malayalam_movie_cast_dataset.csv")
df_movies = df_cast[["movie_name", "year"]].drop_duplicates()

output_file = "movies_with_plot.csv"

# On the first run, create the output file with only a header row so that
# each result can be appended as soon as it is fetched.
if not os.path.exists(output_file):
    pd.DataFrame(columns=["movie_name", "year", "plot"]).to_csv(output_file, index=False)

# Resume support: skip everything already written by a previous run.
# The key is (movie_name, year), matching how df_movies is de-duplicated;
# keying on the title alone would silently skip a different film that shares
# its title with one from another year.
already_saved = pd.read_csv(output_file)
done = set(zip(already_saved["movie_name"], already_saved["year"]))

for _, row in df_movies.iterrows():
    movie = row["movie_name"]
    year = row["year"]

    if (movie, year) in done:
        continue

    print(f"Processing: {movie}")

    try:
        # search_wikipedia passes timeout=20 to requests; get_plot_from_pageid
        # goes through the wikipedia package and sets no explicit timeout.
        page_id = search_wikipedia(movie, year)
        plot = get_plot_from_pageid(page_id)

    except Exception as e:
        # Best effort: record an empty plot and keep going so one failure
        # does not abort the whole crawl.
        print("Skipping", movie, "due to", e)
        plot = ""

    # Append immediately (checkpoint) so an interrupted run loses no work.
    pd.DataFrame([{
        "movie_name": movie,
        "year": year,
        "plot": plot
    }]).to_csv(output_file, mode="a", header=False, index=False)
    done.add((movie, year))

    # Throttle requests to roughly one movie per second.
    time.sleep(1)

print("✅ Done")


Processing: Marupadi
Processing: January
Processing: April
Processing: Fukri
Processing: Oru Mexican Aparatha
Processing: C/O Saira Banu
Processing: Pareeth Pandari
Processing: Take Off
Processing: Kambhoji
Processing: Rakshadhikari Baiju Oppu
Processing: Vedham
Processing: Adventures of Omanakuttan
Processing: Godha
Processing: Gold Coins
Processing: Ma Chu Ka
Processing: Avarude Raavukal
Processing: Viswasapoorvam Mansoor
Processing: Thondimuthalum Driksakshiyum
Processing: Tiyaan
Processing: Sunday Holiday
Processing: Minnaminungu
Processing: Team 5
Processing: Theeram
Processing: Sarvopari Palakkaran
Processing: Thrissivaperoor Kliptham
Processing: E
Processing: Honey Bee 2.5
Processing: Naval Enna Jewel
Processing: Njandukalude Nattil Oridavela
Processing: Pullikkaran Staraa
Processing: Matchbox
Processing: Udaharanam Sujatha
Processing: Tharangam
Processing: Kaattu
Processing: Vishwa Vikhyatharaya Payyanmar
Processing: Zacharia Pothen Jeevichirippundu
Processing: Paathi
Processin

In [7]:
import pandas as pd

# Load the saved plots, keep the three relevant columns and collapse exact
# duplicate rows before counting.
df = (
    pd.read_csv("movies_with_plot.csv")
    .loc[:, ["movie_name", "year", "plot"]]
    .drop_duplicates()
)

total = len(df)

# A plot counts as empty when its stripped string form is shorter than five
# characters. Missing values stringify to "nan" (three characters), so they
# are counted as empty as well.
empty_count = sum(1 for plot_text in df["plot"] if len(str(plot_text).strip()) < 5)

print("Total movies with empty plots:", empty_count, "among", total)


Total movies with empty plots: 72 among 689


In [3]:
# searching for remaining plots via OMDb API
# The API key is read from the environment so it never appears in the notebook.
OMDB_KEY = os.getenv("OMDB_KEY")
def get_plot_from_OMDb(movie,year):
    """Look up the full plot of a movie on OMDb by exact title and year.

    Args:
        movie: Movie title, sent as the ``t`` (title) parameter.
        year: Release year, sent as the ``y`` parameter.

    Returns:
        The plot text, or an empty string when OMDb reports no match or has
        no plot for the title (OMDb's "N/A" placeholder is treated as empty).

    Raises:
        requests.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    params = {
        "apikey": OMDB_KEY,
        "t": movie,      # use title, not search
        "y": year,
        "plot": "full"
    }
    # HTTPS, because the API key travels in the query string and must not be
    # sent in clear text.
    r = requests.get("https://www.omdbapi.com/", params=params, timeout=20)
    data = r.json()

    if data.get("Response") != "True":
        print("Not found:", movie, data.get("Error"))
        return ""

    plot = data.get("Plot") or ""
    # OMDb fills unknown fields with the literal "N/A"; storing that would
    # make the row look like it already has a plot.
    if plot.strip().upper() == "N/A":
        return ""
    return plot





# Reload the saved plots; missing values become "" so .strip() is safe below.
df = pd.read_csv("movies_with_plot.csv")
df["plot"] = df["plot"].fillna("")

for record in df.itertuples():

    # Rows that already carry some plot text are left alone.
    if record.plot.strip():
        continue

    title, release_year = record.movie_name, record.year

    print(f"Fetching plot for: {title} ({release_year})")

    try:
        fetched = get_plot_from_OMDb(title, release_year)
    except Exception as err:
        # Best effort: log the failure and leave this row's plot empty.
        print("Error for", title, ":", err)
        fetched = ""

    # Store the result and rewrite the whole CSV right away, so progress
    # survives an interrupted run.
    df.at[record.Index, "plot"] = fetched
    df.to_csv("movies_with_plot.csv", index=False)

    # Throttle to roughly one OMDb request per second.
    time.sleep(1)

print("✅ CSV updated with missing plots")


Fetching plot for: Avan (2010)
Fetching plot for: Taskkara Lahala (2010)
Not found: Taskkara Lahala Movie not found!
Fetching plot for: Raama Raavanan (2010)
Fetching plot for: Nirakazhcha (2010)
Not found: Nirakazhcha Movie not found!
Fetching plot for: Chithrakuzhal (2010)
Not found: Chithrakuzhal Movie not found!
Fetching plot for: Kanyakumari Express (2010)
Fetching plot for: Tournament (2010)
Fetching plot for: Puthumukhangal (2010)
Fetching plot for: August (2010)
Fetching plot for: The Metro (2011)
Fetching plot for: Doubles (2011)
Fetching plot for: Lucky Jokers (2011)
Fetching plot for: Melvilasom (2011)
Not found: Melvilasom Movie not found!
Fetching plot for: Kanakompathu (2011)
Not found: Kanakompathu Movie not found!
Fetching plot for: Mullassery Madhavan Kutty Nemom P. O. (2012)
Fetching plot for: Unnam (2012)
Fetching plot for: Father's Day (2012)
Fetching plot for: Thalsamayam Oru Penkutty (2012)
Not found: Thalsamayam Oru Penkutty Movie not found!
Fetching plot for: Ou

In [4]:
import pandas as pd

# Re-order the saved plot table by release year and overwrite the file.
df = pd.read_csv("movies_with_plot.csv").sort_values(by="year")
df.to_csv("movies_with_plot.csv", index=False)

print("✅ CSV sorted by year")


✅ CSV sorted by year


In [8]:

# Keep the first row for every (movie_name, year) pair and overwrite the file.
df = pd.read_csv("movies_with_plot.csv")
df = df[~df.duplicated(subset=["movie_name", "year"], keep="first")]
df.to_csv("movies_with_plot.csv", index=False)

print("✅ Kept only unique movie-year combinations")


✅ Kept only unique movie-year combinations
