## **Script and Subtitle Scraper for Movies**

### Main Goals:
1. **Scrape Movie Scripts**: Retrieve scripts from IMSDB and Springfield Springfield for given movie titles (a MovieScripts.com scraper is planned but not implemented in the code below).
2. **Scrape Subtitles**: Download and process subtitles from OpenSubtitles (a YTS Subtitles scraper is planned but not implemented in the code below).
3. **Process Movie Titles in Chunks**: Handle large lists of movie titles (e.g., from CSV files) in manageable chunks.
4. **Save Results**: Store the results of each chunk in its own CSV file (`chunk_<n>.csv`) for easy reference and analysis; downloaded subtitles are written to `<movie title>.srt` files.

In [None]:
import requests
from bs4 import BeautifulSoup
import zipfile
import io
import pandas as pd
import time


# 1. IMSDB Scraper: Scrapes movie scripts from IMSDB.
def scrape_imsdb_script(movie_title, timeout=30):
    """Scrape the script from IMSDB for a given movie.

    Args:
        movie_title: Movie title; spaces are replaced with hyphens to build
            the IMSDB script URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first ``<pre>`` element on the page, or a
        human-readable "not found" / "error" message. Exceptions raised
        while fetching or parsing are caught and reported in the returned
        string.
    """
    formatted_title = movie_title.replace(" ", "-")
    url = f"https://imsdb.com/scripts/{formatted_title}.html"
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # Without a timeout a single stalled connection would block the
        # whole batch run indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            script_block = soup.find('pre')
            if script_block:
                # NOTE(review): strip=True trims every text node before
                # joining them, which may glue words together across nested
                # tags — confirm this is the desired output format.
                script_text = script_block.get_text(strip=True)
                if script_text:
                    return script_text
        return f"Script not found on IMSDB for '{movie_title}'."
    except Exception as e:
        return f"Error fetching script from IMSDB: {e}"


# 2. Springfield Scraper: Scrapes movie scripts from Springfield! website.
def scrape_springfields_script(movie_title, timeout=30):
    """Scrape the script from Springfield! website.

    Args:
        movie_title: Movie title; spaces are replaced with hyphens to build
            the ``movie`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the ``scrolling-script-container`` div, or a
        human-readable "not found" / "error" message. Exceptions raised
        while fetching or parsing are caught and reported in the returned
        string.
    """
    formatted_title = movie_title.replace(" ", "-")
    url = f"https://www.springfieldspringfield.co.uk/movie_script.php?movie={formatted_title}"
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # Without a timeout a single stalled connection would block the
        # whole batch run indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            script_block = soup.find('div', class_='scrolling-script-container')
            if script_block:
                script_text = script_block.get_text(strip=True)
                if script_text:
                    return script_text
        return f"Script not found on Springfield for '{movie_title}'."
    except Exception as e:
        return f"Error fetching script from Springfield: {e}"


# 3. OpenSubtitles Scraper: Scrapes subtitles from OpenSubtitles.org.
def scrape_opensubtitles(movie_title, timeout=30):
    """Search OpenSubtitles for a movie and download its first English result.

    Args:
        movie_title: Movie title; spaces are replaced with ``+`` in the
            search URL.
        timeout: Seconds to wait for the search request before giving up.

    Returns:
        A status message (not the subtitle text): the message produced by
        ``download_and_extract_subtitle`` when a result link is found,
        otherwise a "not found" or "error" message.
    """
    from urllib.parse import urljoin

    search_url = f"https://www.opensubtitles.org/en/search2/sublanguageid-eng/moviename-{movie_title.replace(' ', '+')}"
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # Without a timeout a single stalled connection would block the
        # whole batch run indefinitely.
        response = requests.get(search_url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            first_result = soup.find('a', class_='bnone')
            if first_result:
                # urljoin copes with absolute hrefs and hrefs lacking a
                # leading slash, where plain concatenation builds a bad URL.
                subtitle_page_link = urljoin("https://www.opensubtitles.org", first_result['href'])
                return download_and_extract_subtitle(subtitle_page_link, movie_title)
        return f"Subtitle not found on OpenSubtitles for '{movie_title}'."
    except Exception as e:
        return f"Error fetching subtitle from OpenSubtitles: {e}"


def download_and_extract_subtitle(file_url, movie_title, timeout=30):
    """Download a subtitle file and save it as ``<movie_title>.srt``.

    If the response is a ZIP archive, the first ``.srt`` member is extracted;
    otherwise the raw response body is written as-is.

    Args:
        file_url: URL to fetch.
        movie_title: Title used for the output file name and in the returned
            message.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A status message describing success or failure. Exceptions are
        caught and reported in the returned string.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Path separators inside a title (e.g. "Face/Off") would otherwise
        # make the output path point into another directory.
        safe_title = movie_title.replace('/', '_').replace('\\', '_')
        subtitle_file_name = f"{safe_title}.srt"

        # Without a timeout a single stalled connection would block the
        # whole batch run indefinitely.
        response = requests.get(file_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return "Failed to download subtitle file."

        payload = response.content
        content_type = response.headers.get('Content-Type', '')
        if 'application/zip' in content_type or zipfile.is_zipfile(io.BytesIO(payload)):
            with zipfile.ZipFile(io.BytesIO(payload)) as archive:
                # Match the extension case-insensitively (".SRT" is common).
                srt_files = [name for name in archive.namelist() if name.lower().endswith('.srt')]
                if not srt_files:
                    return "Failed to download subtitle file."
                subtitle_bytes = archive.read(srt_files[0])
            with open(subtitle_file_name, 'wb') as f:
                f.write(subtitle_bytes)
            return f"Subtitle extracted for '{movie_title}'."

        # NOTE(review): any non-ZIP body is saved verbatim. If file_url is a
        # subtitle *page* rather than a direct file link, this stores HTML
        # under an .srt name — confirm what the caller passes in.
        with open(subtitle_file_name, 'wb') as f:
            f.write(payload)
        return f"Subtitle downloaded for '{movie_title}'."
    except Exception as e:
        return f"Error downloading subtitle: {e}"


# 4. Chunk Processing: Processes a list of movies and scrapes data in chunks.
def process_movies_in_chunks(input_file, chunk_size=10000):
    """Process movies in chunks, scraping scripts and subtitles.

    Reads ``input_file`` as CSV, scrapes every title in its 'Movie Name'
    column, and writes one ``chunk_<n>.csv`` per chunk to the current
    working directory.

    Args:
        input_file: Path to a CSV file with a 'Movie Name' column.
        chunk_size: Number of titles per output file (must be >= 1).

    Returns:
        ``None`` on success, or an error message string if the input is
        invalid or an exception occurs.
    """
    try:
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        df = pd.read_csv(input_file)
        if 'Movie Name' not in df.columns:
            return "Error: 'Movie Name' column not found in the CSV file."

        # Blank cells are read as NaN (a float); passing one to a scraper
        # would raise and abort the entire run, so skip them up front.
        titles = df['Movie Name'].dropna().astype(str).tolist()

        for chunk_num, start in enumerate(range(0, len(titles), chunk_size), start=1):
            results = []
            for movie_title in titles[start:start + chunk_size]:
                results.append({
                    'Movie Name': movie_title,
                    'Script_IMSDB': scrape_imsdb_script(movie_title),
                    'Script_Springfield': scrape_springfields_script(movie_title),
                    'Subtitle': scrape_opensubtitles(movie_title),
                })

            # Persist each chunk as soon as it is done so that a later
            # failure does not lose already-scraped data.
            pd.DataFrame(results).to_csv(f"chunk_{chunk_num}.csv", index=False)
    except Exception as e:
        return f"Error processing movies: {e}"


# Run the scraper over the movie list. process_movies_in_chunks reports
# failures by returning a message instead of raising, so surface that
# message rather than silently discarding it.
input_file = "movies_names.csv"
status = process_movies_in_chunks(input_file)
if status:
    print(status)


Batch 2 (from 10,001 to 20,000) completed and saved to 'top_movies_scripts_and_subtitles.csv'.


## **Script and Subtitle Availability Analysis and Filtering**

### Main Goals:
1. **Categorize Script Availability**: The function `categorize_script_availability` checks if a movie's script or subtitle is available or not, based on certain text markers like "not available" or "Failed to download".
2. **Count Availability**: It calculates and prints the count of available and unavailable scripts/subtitles.
3. **Identify Movies Without Scripts**: It determines how many movies don't have a script or subtitle available.
4. **Handle Null Values**: It checks for and prints any null values in the dataset.
5. **Filter and Save Data**: It filters out rows with unavailable scripts/subtitles and saves the cleaned data into new CSV files (`filtered_movies.csv` and `filtered_data.csv`).
6. **Load and Process Data**: Reads the movie script and subtitle data, assigns appropriate column names, filters, and saves results for further use.

### Key Sections of the Code:
- **`categorize_script_availability`** function: Categorizes whether scripts/subtitles are available.
- **DataFrame cleaning and filtering**: Filters out rows with specific unavailability markers and saves the filtered data.

In [None]:
# Step 1: Load the data
# NOTE(review): the file is parsed as semicolon-separated and must contain a
# 'Script/Subtitle Text' column, which the following steps read — confirm
# against the actual file.
scripts = pd.read_csv('/content/top_movies_scripts_and_subtitles.csv', sep=';')

# Step 2: Define the categorization function for script availability
def categorize_script_availability(text):
    """Classify a script/subtitle cell as "Available" or "Not Available".

    Args:
        text: Cell value from the script/subtitle column. Usually a string,
            but pandas yields NaN (a float) for empty cells.

    Returns:
        "Not Available" if the value is not a string or contains one of the
        failure markers ("not available", "Failed to download"); otherwise
        "Available".
    """
    # Empty cells arrive as NaN/None; the `in` checks below would raise
    # TypeError on them, and a missing text is by definition unavailable.
    if not isinstance(text, str):
        return "Not Available"
    if "not available" in text or "Failed to download" in text:
        return "Not Available"
    return "Available"

# Step 3: Apply the function to categorize the availability of scripts/subtitles
scripts['Availability'] = scripts['Script/Subtitle Text'].apply(categorize_script_availability)

# Step 4: Count and display the availability of scripts
availability_count = scripts['Availability'].value_counts()
print("\nScript Availability Count:")
print(availability_count)

# Step 5: Count the number of movies without scripts/subtitles available
num_movies_without_scripts = scripts[scripts['Availability'] == 'Not Available'].shape[0]
print(f"\nNumber of movies without scripts: {num_movies_without_scripts}")

# Step 6: Check for NULL values in the dataset
null_values = scripts.isnull().sum()
print("\nNULL Values in Each Column:")
print(null_values)

# Step 7: Filter out rows where 'Script/Subtitle Text' contains
# 'Script not available on Springfield'. A substring match is used so that
# messages with extra text around the marker are removed as well.
springfield_missing = scripts['Script/Subtitle Text'].astype(str).str.contains(
    'Script not available on Springfield', regex=False
)
filtered_df = scripts[~springfield_missing]

# Step 8: Display the filtered DataFrame and save it to a new CSV file
print(filtered_df)
filtered_df.to_csv('filtered_movies.csv', index=False)
print("Filtered data saved to 'filtered_movies.csv'")

# Step 9: Keep a fresh, unmodified copy of the raw data.
# NOTE(review): `dataa` is not used in this cell; it is retained in case
# later cells reference it.
dataa = pd.read_csv('/content/top_movies_scripts_and_subtitles.csv', sep=';')

# Step 10: Build the working DataFrame from the already-categorized data.
# Re-reading the file with header=None would turn the header row into a data
# row and drop the 'Availability' column, so assigning three column names
# would fail with a length mismatch.
df = scripts.copy()

# Step 11: Assign the canonical column names.
# assumes the file has exactly two columns (movie name, then text), as the
# original two-name assignment did — TODO confirm
df.columns = ['Movie Name', 'Script/Subtitle Text', 'Availability']

# Step 12: Display the DataFrame
print(df)

# Step 13: Drop rows where 'Availability' is 'Not Available'
df_filtered = df[df['Availability'] != 'Not Available']

# Step 14: Display the filtered DataFrame and save it to a new CSV file
print(df_filtered)
df_filtered.to_csv('filtered_data.csv', index=False)
print("Filtered data saved to 'filtered_data.csv'")