## Data Gathering & Extraction (Most Popular Movie in Romance Genre)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# AJAX endpoint that returns successive batches of reviews for title tt8740790
base_url = "https://www.imdb.com/title/tt8740790/reviews/_ajax"

# Sent with every request so that it carries a browser-style User-Agent
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    ),
}

# Accumulators filled while scraping: three separate lists for the review
# titles, the review bodies and the rating scores
review_titles, review_contents, ratings = [], [], []

# Function to extract reviews from the current page
def extract_reviews(soup):
    """Append the title, text and rating of every review found in *soup*.

    For each ``div`` with class ``review-container`` exactly one entry is
    appended to each of the module-level lists ``review_titles``,
    ``review_contents`` and ``ratings``. A field whose element cannot be
    found is recorded as ``None``, so the three lists always stay the same
    length and index ``i`` in each list refers to the same review.

    Args:
        soup: Parsed page (e.g. a ``BeautifulSoup`` object) exposing
            ``find_all``; each returned row must expose ``find``.

    Returns:
        None. The results are accumulated in the module-level lists.
    """
    for row in soup.find_all('div', {"class": "review-container"}):
        # Review title: stripped link text, or None when the link is absent
        review_title_element = row.find("a", {"class": "title"})
        review_titles.append(
            review_title_element.text.strip()
            if review_title_element is not None
            else None
        )

        # Review body: stripped text, or None when the element is absent
        review_content_element = row.find("div", {"class": "text show-more__control"})
        review_contents.append(
            review_content_element.text.strip()
            if review_content_element is not None
            else None
        )

        # Rating score: the all-digit inner span of the rating element.
        # None covers both a missing rating element and a missing score span.
        rating_element = row.find("span", class_="rating-other-user-rating")
        score_element = None
        if rating_element is not None:
            score_element = rating_element.find(
                "span", string=lambda text: text and text.isdigit()
            )
        ratings.append(
            score_element.text.strip() if score_element is not None else None
        )

# Fetch every page of reviews, then save the collected data to a CSV file.
#
# The first page is requested without a pagination key; each later page is
# requested with the key taken from the previous page's "load-more-data"
# element. Scraping stops when a page carries no further key, or when one
# page has failed `max_retries` times in a row.

# Consecutive failed attempts allowed for a single page before giving up,
# so a persistent network/HTTP failure cannot loop forever.
max_retries = 5

pagination_key = None
first_page = True
failed_attempts = 0

while first_page or pagination_key:
    # The first request carries no query parameters
    params = {"paginationKey": pagination_key} if pagination_key else None
    try:
        response = requests.get(base_url, headers=headers, params=params, timeout=10)
        # Turn 4xx/5xx answers into an exception instead of parsing an error
        # page as if it contained reviews
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        failed_attempts += 1
        print(f"An error occurred: {e}")
        if failed_attempts >= max_retries:
            print(f"Giving up after {failed_attempts} failed attempts; saving the reviews collected so far.")
            break
        time.sleep(5)  # Wait before retrying the same page
        continue

    failed_attempts = 0
    first_page = False

    soup = BeautifulSoup(response.text, "html.parser")
    extract_reviews(soup)

    # Find the pagination key for the next page (None when there is none)
    load_more_data = soup.find("div", {"class": "load-more-data"})
    pagination_key = load_more_data.get("data-key") if load_more_data is not None else None

    if pagination_key:
        time.sleep(3)  # Adding a delay to avoid overwhelming the server

# Create the DataFrame
df = pd.DataFrame({'Title': review_titles, 'Review': review_contents, 'Rating Score': ratings})

# Specify the filename
file_name = 'romance_imdb_reviews.csv'

# Save the DataFrame to a CSV file
df.to_csv(file_name, index=False)  # Avoid including the index column

print("DataFrame saved to", file_name)


DataFrame saved to romance_imdb_reviews.csv
