In [1]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

import random

# Root of the site being scraped; site-relative links are appended to it.
BASE_URL = "https://www.filmyquotes.com"

# The three desktop User-Agent strings differ only in their platform token,
# so the list is generated from those tokens.
_UA_PLATFORMS = (
    "Windows NT 10.0; Win64; x64",
    "Macintosh; Intel Mac OS X 10_15_7",
    "X11; Linux x86_64",
)

USER_AGENTS = [
    f"Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
    for platform in _UA_PLATFORMS
]

# A single User-Agent is drawn when this cell runs and is then reused as-is
# by every request that sends HEADERS.
HEADERS = {"User-Agent": random.choice(USER_AGENTS)}


In [2]:
import requests
from requests.exceptions import RequestException

def safe_request(url, max_retries=3, retry_delay=2, timeout=10):
    """
    Fetch ``url`` with a GET request, retrying when an attempt fails.

    Every attempt sends the shared HEADERS and is limited to ``timeout``
    seconds. A response carrying an HTTP error status counts as a failure,
    because ``raise_for_status()`` raises for it.

    Args:
        url: Address to fetch.
        max_retries: Total number of attempts to make.
        retry_delay: Seconds to wait between a failed attempt and the next one.
        timeout: Per-attempt timeout in seconds passed to ``requests.get``.

    Returns:
        The response of the first successful attempt, or ``None`` when every
        attempt fails (or when ``max_retries`` is not positive).
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            return response
        except RequestException as e:
            if attempt == max_retries:
                # Last attempt: there is nothing left to retry, so report the
                # failure and skip the wait that would only delay the caller.
                print(f"Attempt {attempt}: Request failed ({e}). Giving up.")
                break
            print(f"Attempt {attempt}: Request failed ({e}). Retrying...")
            time.sleep(retry_delay)
    return None  # Every attempt failed


In [3]:
def get_categories():
    """
    Scrape the category index and return the categories from 'Action' through
    'World & Universe' (both inclusive), in the order the page lists them.

    The index page is fetched with safe_request, so it gets the same timeout,
    retry and HTTP-status handling as the other requests in this notebook.

    Returns:
        dict mapping category name to category URL. Links without an href are
        left out. An empty dict is returned, after printing a message, when
        the index page cannot be fetched.
    """
    index_url = BASE_URL + '/categories'
    response = safe_request(index_url)

    if response is None:
        # Parsing a missing/error page would silently yield nothing useful,
        # so say why no categories are coming back.
        print(f"Failed to fetch {index_url}; no categories found.")
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')

    categories = {}
    start_scraping = False

    for link in soup.find_all("a", class_="list-group-item"):
        category_name = link.text.strip()
        href = link.get("href")

        if category_name == "Action":
            start_scraping = True  # Start collecting from "Action"

        if not start_scraping:
            continue  # Skip every entry listed before "Action"

        if href:
            # Site-relative category links are made absolute; any other href
            # is stored exactly as found.
            category_url = BASE_URL + href if href.startswith("/categories/") else href
            categories[category_name] = category_url

        if category_name == "World & Universe":
            break  # "World & Universe" is the last category to collect

    return categories


In [8]:
import re
from bs4 import BeautifulSoup

def get_total_pages(category_url):
    """
    Return how many pages a category has.

    The category page is fetched through safe_request, and the count is read
    from the 'Page X of Y' text held in the <small> tag inside
    <div class="ms-4">. The function falls back to 1 whenever the page cannot
    be fetched or that text is not present.
    """
    page_response = safe_request(category_url)

    # Bail out early on a missing or non-200 response.
    if not page_response or page_response.status_code != 200:
        status = page_response.status_code if page_response else 'No Response'
        print(f"Failed to fetch {category_url}, Status Code: {status}")
        return 1

    soup = BeautifulSoup(page_response.text, 'html.parser')

    # The pagination summary lives in <div class="ms-4"><small>...</small></div>.
    pager_div = soup.find("div", class_="ms-4")
    if not pager_div:
        return 1

    summary_tag = pager_div.find("small")
    if not summary_tag:
        return 1

    # Capture Y from "Page X of Y".
    found = re.search(r'Page \d+ of (\d+)', summary_tag.text.strip())
    return int(found.group(1)) if found else 1


In [9]:
def _page_url(category_url, page):
    """Build the URL of page ``page`` for the category at ``category_url``."""
    trimmed = category_url.rstrip('/')
    base, sep, last_segment = trimmed.rpartition('/')
    # The category link is expected to end in a page segment such as "/1".
    # Drop that segment only when it is really there, instead of blindly
    # slicing off the last two characters (which mangles any other URL shape).
    if sep and last_segment.isdigit():
        trimmed = base
    return f"{trimmed}/{page}"


def scrape_category_dialogues(category_name, category_url):
    """
    Scrape all dialogues of one category, page by page.

    The number of pages comes from get_total_pages. Pages that cannot be
    fetched are reported and skipped. After each successfully scraped page
    the function pauses for a random 2-5 seconds.

    Args:
        category_name: Label stored in the first column of every row.
        category_url: URL of the category; a trailing numeric page segment
            (e.g. ".../1") and/or trailing slash is tolerated.

    Returns:
        list of ``[category_name, hinglish_text, english_text]`` rows, one
        per card that contains at least two ``d-flex`` divs.
    """
    dialogues_data = []
    total_pages = get_total_pages(category_url)

    for page in range(1, total_pages + 1):
        url = _page_url(category_url, page)
        response = safe_request(url)

        if not response or response.status_code != 200:
            print(f"Failed to fetch {url}, Status Code: {response.status_code if response else 'No Response'}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        card_bodies = soup.find_all("div", class_="card-body")

        for card_body in card_bodies:
            d_flex_divs = card_body.find_all("div", class_="d-flex")

            # The first two d-flex divs are taken as the dialogue and its
            # translation; cards with fewer than two are ignored.
            if len(d_flex_divs) > 1:
                hinglish_text = d_flex_divs[0].get_text(strip=True)
                english_text = d_flex_divs[1].get_text(strip=True)
                dialogues_data.append([category_name, hinglish_text, english_text])

        print(f"Scraped Page {page}/{total_pages} for {category_name}")
        # Randomised pause between page requests.
        time.sleep(random.uniform(2, 5))

    return dialogues_data

In [10]:
def scrape_all_categories():
    """
    Scrape every category returned by get_categories and return all of the
    resulting dialogue rows as one flat list, in category order.
    """
    collected_rows = []

    for name, url in get_categories().items():
        print(f"Scraping Category: {name}...")
        # Append this category's rows to the running list.
        collected_rows += scrape_category_dialogues(name, url)

    return collected_rows

In [11]:
# Crawl every category and gather the dialogue rows.
data = scrape_all_categories()

# Write the rows out as a three-column CSV file.
OUTPUT_COLUMNS = ["Category", "Dialogue (Hinglish)", "English Translation"]
df = pd.DataFrame(data=data, columns=OUTPUT_COLUMNS)
df.to_csv(path_or_buf="bollywood_dialogues.csv", index=False, encoding="utf-8")

print("Scraping completed! Data saved to bollywood_dialogues.csv")

Scraping Category: Action...
Scraped Page 1/50 for Action
Scraped Page 2/50 for Action
Scraped Page 3/50 for Action
Scraped Page 4/50 for Action
Scraped Page 5/50 for Action
Scraped Page 6/50 for Action
Scraped Page 7/50 for Action
Scraped Page 8/50 for Action
Scraped Page 9/50 for Action
Scraped Page 10/50 for Action
Scraped Page 11/50 for Action
Scraped Page 12/50 for Action
Scraped Page 13/50 for Action
Scraped Page 14/50 for Action
Scraped Page 15/50 for Action
Scraped Page 16/50 for Action
Scraped Page 17/50 for Action
Scraped Page 18/50 for Action
Scraped Page 19/50 for Action
Scraped Page 20/50 for Action
Scraped Page 21/50 for Action
Scraped Page 22/50 for Action
Scraped Page 23/50 for Action
Scraped Page 24/50 for Action
Scraped Page 25/50 for Action
Scraped Page 26/50 for Action
Scraped Page 27/50 for Action
Scraped Page 28/50 for Action
Scraped Page 29/50 for Action
Scraped Page 30/50 for Action
Scraped Page 31/50 for Action
Scraped Page 32/50 for Action
Scraped Page 33/50 f