## Step 1: Setup Environment
Install required packages and set up PySpark for Google Colab.

In [None]:
# Install required packages
# PySpark is the main engine for data processing
# tenacity provides retry logic for API calls
!pip install pyspark tenacity -q

print("‚úÖ Packages installed successfully!")

In [None]:
# Import all required libraries
import os
import json
import logging
from pathlib import Path
from typing import List, Dict, Optional, Any
from datetime import datetime

import requests

# Retry logic for API calls
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

# PySpark imports for data processing
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql.functions import (
    col, when, lit, array_join, transform, expr,
    to_date, filter as array_filter, element_at,
    size, concat_ws, slice as array_slice,
    desc, asc, year, count, sum, mean, avg,
    lower, split
)
from pyspark.sql.types import DoubleType, IntegerType

# Visualization
import matplotlib.pyplot as plt
import pandas as pd

print("‚úÖ All libraries imported successfully!")

In [None]:
# Setup logging for better visibility into what's happening
def setup_logger(name: str) -> logging.Logger:
    """
    Create (or fetch) a named logger that writes INFO and above to the console.

    The logger gets exactly one StreamHandler, even if the cell is re-run,
    and does not propagate to the root logger. Without that, every record
    would be printed a second time whenever the root logger already has a
    handler (for example after logging.basicConfig, or in hosted notebook
    runtimes that pre-configure logging).

    Args:
        name: Name for the logger (usually __name__)

    Returns:
        Configured logger instance
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # This logger owns its console output; don't hand records to root as well
    logger.propagate = False

    # Avoid duplicate handlers if cell is re-run
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setLevel(logging.INFO)
        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        logger.addHandler(handler)

    return logger

logger = setup_logger(__name__)
print("‚úÖ Logger configured!")

In [None]:
# ‚ö†Ô∏è IMPORTANT: Enter your TMDB API key here!
# Get your free API key at: https://www.themoviedb.org/settings/api

TMDB_API_KEY = ""  # <-- Paste your API key between the quotes

# Preferred: leave the quotes above empty and provide the key through the
# TMDB_API_KEY environment variable, so the secret is never saved (or shared)
# inside the notebook file. A pasted key, if present, takes precedence.
if not TMDB_API_KEY:
    TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "").strip()

# Validate the API key is set
if not TMDB_API_KEY:
    print("‚ùå ERROR: Please set your TMDB_API_KEY in the cell above!")
    print("   Get a free key at: https://www.themoviedb.org/settings/api")
else:
    print(f"‚úÖ API Key configured (starts with: {TMDB_API_KEY[:4]}...)")

In [None]:
# Create data directories for storing raw and processed data
DATA_DIR = Path("data")
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"

# Create directories if they don't exist
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print(f"‚úÖ Data directories created:")
print(f"   Raw data: {RAW_DIR}")
print(f"   Processed data: {PROCESSED_DIR}")

In [None]:
# Initialize PySpark Session
# local[*] means use all available CPU cores
spark = SparkSession.builder \
    .appName("TMDB_Movie_Analysis") \
    .master("local[*]") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

# Reduce Spark's verbose logging
spark.sparkContext.setLogLevel("WARN")

print(f"‚úÖ Spark Session Created!")
print(f"   App Name: {spark.sparkContext.appName}")
print(f"   Spark Version: {spark.version}")

---
## Step 2: Fetch Data from TMDB API

We'll fetch movie details including credits (cast & crew) from the TMDB API. The fetcher includes:
- **Retry logic** - Automatically retries failed requests
- **Rate limiting** - Backs off and retries when the API responds with HTTP 429 (Too Many Requests)
- **Error handling** - Gracefully handles missing movies

In [None]:
class _TransientTMDBError(requests.exceptions.HTTPError):
    """HTTP failure that is worth retrying (rate limiting or a server-side error)."""


class TMDBFetcher:
    """
    A robust fetcher for TMDB API with retry logic and logging.
    Fetches movie details including cast and crew information.

    Only transient failures are retried: connection problems, timeouts,
    HTTP 429 and HTTP 5xx. Permanent client errors (for example 401 for an
    invalid API key) fail immediately instead of burning five attempts with
    exponential backoff for every movie.
    """

    BASE_URL = "https://api.themoviedb.org/3"

    def __init__(self, api_key: str):
        """
        Initialize the fetcher with an API key.

        Args:
            api_key: Your TMDB API key

        Raises:
            ValueError: If api_key is empty
        """
        self.api_key = api_key
        self.logger = setup_logger("TMDBFetcher")

        if not self.api_key:
            raise ValueError("TMDB_API_KEY is required!")

        # Use a session for connection pooling (faster)
        self.session = requests.Session()

    def _redact(self, error: Exception) -> str:
        """Render an exception as text with the API key masked.

        requests includes the full request URL (query string and all) in its
        error messages, so logging them verbatim would print the key.
        """
        return str(error).replace(self.api_key, "***")

    @retry(
        stop=stop_after_attempt(5),           # Try up to 5 times
        wait=wait_exponential(multiplier=1, min=2, max=10),  # Exponential backoff
        retry=retry_if_exception_type((
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            requests.exceptions.ChunkedEncodingError,
            _TransientTMDBError,
        )),
        reraise=True
    )
    def fetch_movie_details(self, movie_id: int) -> Optional[Dict[str, Any]]:
        """
        Fetch detailed information for a single movie, including credits.

        Args:
            movie_id: The TMDB movie ID

        Returns:
            Dictionary with movie details, or None if not found

        Raises:
            requests.exceptions.RequestException: On a non-retryable HTTP
                error, or once all retry attempts for a transient failure
                are exhausted.
        """
        url = f"{self.BASE_URL}/movie/{movie_id}"
        # Append credits to get cast/crew in one request
        params = {"api_key": self.api_key, "append_to_response": "credits"}

        response = self.session.get(url, params=params, timeout=10)
        status = response.status_code

        if status == 200:
            return response.json()
        if status == 404:
            self.logger.warning(f"Movie ID {movie_id} not found.")
            return None
        if status == 429:
            self.logger.warning("Rate limited! Waiting before retry...")
            raise _TransientTMDBError("Rate limited", response=response)

        self.logger.error(f"Error {response.status_code} for movie {movie_id}")
        if status >= 500:
            # Server-side problems are usually temporary, so let tenacity retry
            raise _TransientTMDBError(
                f"Server error {status} for movie {movie_id}", response=response
            )

        # Remaining 4xx codes are permanent: raises HTTPError, which is not retried
        response.raise_for_status()
        return None

    def fetch_multiple_movies(self, movie_ids: List[int]) -> List[Dict[str, Any]]:
        """
        Fetch details for multiple movies.

        A failure for one movie is logged (with the API key masked) and
        skipped, so the remaining IDs are still fetched.

        Args:
            movie_ids: List of TMDB movie IDs

        Returns:
            List of movie detail dictionaries
        """
        movies = []
        total = len(movie_ids)

        for i, movie_id in enumerate(movie_ids):
            try:
                # Skip invalid IDs (like 0)
                if movie_id <= 0:
                    continue

                self.logger.info(f"Fetching movie ID: {movie_id}")
                movie = self.fetch_movie_details(movie_id)

                if movie:
                    movies.append(movie)

                self.logger.info(f"Progress: {i+1}/{total} completed.")

            except Exception as e:
                self.logger.error(f"Failed to fetch movie {movie_id}: {self._redact(e)}")

        return movies

    @staticmethod
    def save_to_json(data: List[Dict], filepath: Path) -> None:
        """
        Save movie data to a JSON file.

        Parent directories are created if needed. The file is written as
        UTF-8 with non-ASCII characters kept as-is.

        Args:
            data: List of movie dictionaries
            filepath: Path to save the JSON file
        """
        filepath.parent.mkdir(parents=True, exist_ok=True)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"‚úÖ Saved {len(data)} movies to {filepath}")

print("‚úÖ TMDBFetcher class defined!")

In [None]:
# Define which movies to fetch
# These are popular/high-grossing movie IDs from TMDB
MOVIE_IDS = [
    299534,  # Avengers: Endgame
    19995,   # Avatar
    140607,  # Star Wars: The Force Awakens
    299536,  # Avengers: Infinity War
    597,     # Titanic
    135397,  # Jurassic World
    420818,  # The Lion King (2019)
    24428,   # The Avengers
    168259,  # Furious 7
    99861,   # Avengers: Age of Ultron
    284054,  # Black Panther
    12445,   # Harry Potter and the Deathly Hallows Part 2
    181808,  # Star Wars: The Last Jedi
    330457,  # Frozen II
    351286,  # Jurassic World: Fallen Kingdom
    109445,  # Frozen
    321612,  # Beauty and the Beast (2017)
    260513,  # Incredibles 2
]

print(f"üìã Will fetch {len(MOVIE_IDS)} movies")

In [None]:
# Fetch movie data from TMDB API
# This cell makes API calls - run only when needed!

RAW_FILE = RAW_DIR / "movies.json"

# Check if we already have the data
if RAW_FILE.exists():
    print(f"‚ÑπÔ∏è Raw data already exists at {RAW_FILE}")
    print("   Delete the file and re-run to fetch fresh data.")
else:
    # Initialize fetcher and download data
    fetcher = TMDBFetcher(api_key=TMDB_API_KEY)
    movies_data = fetcher.fetch_multiple_movies(MOVIE_IDS)

    # Never cache an empty result: the exists() check above would then skip
    # fetching on every later run and the rest of the notebook would work on
    # an empty dataset.
    if not movies_data:
        raise RuntimeError(
            "No movies could be fetched from TMDB. "
            "Check TMDB_API_KEY and your network connection, then re-run this cell."
        )

    # Save to JSON file
    TMDBFetcher.save_to_json(movies_data, RAW_FILE)

    print(f"\nüé¨ Fetched {len(movies_data)} movies successfully!")

In [None]:
# Quick peek at the raw data structure
# The raw file is written as UTF-8 with non-ASCII characters kept verbatim,
# so read it back with the same encoding instead of the platform default.
with open(RAW_FILE, 'r', encoding='utf-8') as f:
    sample = json.load(f)

print(f"üìä Raw Data Summary:")
print(f"   Total movies: {len(sample)}")
if sample:
    print(f"   Sample movie: {sample[0]['title']}")
    print(f"   Available fields: {list(sample[0].keys())[:10]}...")
else:
    # An empty file would otherwise fail with an IndexError on sample[0]
    print(f"   No movies found in {RAW_FILE}. Delete the file and re-run the fetch cell.")

---
## Step 3: Process Data with PySpark

Transform the raw JSON into a clean, analytics-ready Parquet format:
- **Flatten nested structures** (genres, credits, production companies)
- **Clean data types** (dates, numbers)
- **Calculate metrics** (ROI, profit in millions)
- **Extract features** (director, top cast members)

In [None]:
def process_movie_data(input_path: str, output_path: str, spark: SparkSession) -> DataFrame:
    """
    Process raw movie JSON data into clean Parquet format.

    This function:
    1. Loads the raw JSON data
    2. Flattens nested structures (genres, credits, etc.) into "|"-joined strings
    3. Cleans and converts data types (0 budget/revenue becomes NULL)
    4. Calculates derived metrics (ROI, profit)
    5. Extracts director and cast information
    6. Saves as Parquet (single part file, overwriting any previous output)

    ROI is NULL whenever the budget is unknown, so averages over ROI are not
    dragged down by placeholder zeros.

    Args:
        input_path: Path to raw JSON file
        output_path: Path to save Parquet output
        spark: Active SparkSession

    Returns:
        Processed DataFrame
    """
    logger.info(f"Loading data from {input_path}")

    # Load JSON (multiline=True because it's a JSON array, not JSON Lines)
    df = spark.read.option("multiline", "true").json(input_path)
    logger.info(f"Loaded {df.count()} records")

    # --- STEP 1: Drop irrelevant columns ---
    drop_cols = ['adult', 'imdb_id', 'original_title', 'video', 'homepage']
    df = df.drop(*drop_cols)

    # --- STEP 2: Flatten nested structures ---

    # Genres: Array<Struct<id, name>> -> "Action|Adventure|Sci-Fi"
    if "genres" in df.columns:
        df = df.withColumn("genres",
            array_join(transform("genres", lambda x: x["name"]), "|"))

    # Collection: Struct<name> -> "Avengers Collection"
    if "belongs_to_collection" in df.columns:
        df = df.withColumn("belongs_to_collection",
            col("belongs_to_collection.name"))

    # Production info: Array<Struct<name>> -> "Company1|Company2"
    for c in ["production_countries", "production_companies", "spoken_languages"]:
        if c in df.columns:
            df = df.withColumn(c,
                array_join(transform(c, lambda x: x["name"]), "|"))

    # --- STEP 3: Clean numeric columns ---

    # Convert budget/revenue to Double, replace 0 with NULL (0 means "unknown")
    for c in ["budget", "revenue"]:
        if c in df.columns:
            df = df.withColumn(c, col(c).cast(DoubleType()))
            df = df.withColumn(c, when(col(c) == 0, None).otherwise(col(c)))

    # Convert popularity to Double
    if "popularity" in df.columns:
        df = df.withColumn("popularity", col("popularity").cast(DoubleType()))

    # Parse release date
    if "release_date" in df.columns:
        df = df.withColumn("release_date", to_date(col("release_date")))

    # --- STEP 4: Calculate financial metrics ---

    # Budget and Revenue in millions USD (easier to read)
    if "budget" in df.columns:
        df = df.withColumn("budget_musd", col("budget") / 1_000_000)
    if "revenue" in df.columns:
        df = df.withColumn("revenue_musd", col("revenue") / 1_000_000)

    # Profit = Revenue - Budget (in millions)
    # ROI = Revenue / Budget (return on investment)
    if "budget_musd" in df.columns and "revenue_musd" in df.columns:
        df = df.withColumn("profit_musd",
            col("revenue_musd") - col("budget_musd"))
        # No otherwise(): a missing budget yields NULL rather than a
        # misleading ROI of 0 that would skew every avg("roi") downstream.
        df = df.withColumn("roi",
            when(col("budget_musd") > 0, col("revenue_musd") / col("budget_musd")))

    # Drop original budget/revenue (keep the _musd versions)
    df = df.drop("budget", "revenue")

    # --- STEP 5: Filter data ---

    # Keep only released movies
    if "status" in df.columns:
        df = df.filter(col("status") == "Released").drop("status")

    # Remove duplicates and nulls
    df = df.dropDuplicates(["id"])
    df = df.na.drop(subset=["id", "title"])

    # --- STEP 6: Extract credits information ---

    if "credits" in df.columns:
        # Count cast and crew size
        df = df.withColumn("cast_size", size(col("credits.cast")))
        df = df.withColumn("crew_size", size(col("credits.crew")))

        # Extract director (first crew member with job='Director')
        directors = transform(
            array_filter(col("credits.crew"), lambda member: member["job"] == "Director"),
            lambda member: member["name"]
        )
        df = df.withColumn("director", element_at(directors, 1))  # 1-based index

        # Extract top 5 cast members (first five entries of the cast array)
        top_cast = transform(
            array_slice(col("credits.cast"), 1, 5),
            lambda member: member["name"]
        )
        df = df.withColumn("cast", array_join(top_cast, "|"))

        # Drop the large credits struct
        df = df.drop("credits")

    # --- STEP 7: Clean text fields ---

    # Treat empty strings and the "No Data" placeholder as missing
    for c in ["overview", "tagline"]:
        if c in df.columns:
            df = df.withColumn(c,
                when((col(c) == "") | (col(c) == "No Data"), None)
                .otherwise(col(c)))

    # --- STEP 8: Save as Parquet ---

    logger.info(f"Writing processed data to {output_path}")
    df.coalesce(1).write.mode("overwrite").parquet(output_path)
    logger.info("‚úÖ Processing complete!")

    return df

print("‚úÖ process_movie_data function defined!")

In [None]:
# Run the data processing pipeline
RAW_PATH = str(RAW_DIR / "movies.json")
PROCESSED_PATH = str(PROCESSED_DIR / "movies.parquet")

# Process the data
processed_df = process_movie_data(RAW_PATH, PROCESSED_PATH, spark)

print(f"\nüìä Processed Data Summary:")
print(f"   Records: {processed_df.count()}")
print(f"   Columns: {len(processed_df.columns)}")

In [None]:
# View the schema of processed data
print("üìã Processed Data Schema:")
processed_df.printSchema()

In [None]:
# Preview the processed data
print("üé¨ Sample Processed Movies:")
processed_df.select("title", "genres", "director", "budget_musd", "revenue_musd", "roi").show(5, truncate=False)

---
## Step 4: Analyze Data

Now let's run analytical queries on our processed data to extract insights.

In [None]:
class MovieAnalyzer:
    """
    Analyzes movie data using PySpark SQL operations.
    Returns results as Pandas DataFrames for easy visualization.

    Call load_data() before any of the query methods.
    """

    def __init__(self, spark: SparkSession):
        """
        Initialize analyzer with a Spark session.

        Args:
            spark: Active SparkSession
        """
        self.spark = spark
        self.df: Optional[DataFrame] = None

    def _loaded_df(self) -> DataFrame:
        """Return the loaded DataFrame, failing clearly if load_data() was skipped."""
        if self.df is None:
            raise RuntimeError("No data loaded - call load_data() first.")
        return self.df

    def load_data(self, path: str) -> None:
        """
        Load processed Parquet data.

        Adds a release_year column when release_date is present and caches
        the DataFrame for the repeated queries that follow.

        Args:
            path: Path to Parquet file/directory
        """
        logger.info(f"Loading data from {path}")
        self.df = self.spark.read.parquet(path)

        # Add release year for time-based analysis
        if "release_date" in self.df.columns:
            self.df = self.df.withColumn("release_year", year(col("release_date")))

        # Cache for faster repeated queries
        self.df.cache()
        logger.info(f"Loaded {self.df.count()} movies")

    def get_top_by_revenue(self, n: int = 10) -> pd.DataFrame:
        """
        Get top N movies by revenue.

        Args:
            n: Number of movies to return

        Returns:
            Pandas DataFrame with top movies
        """
        return self._loaded_df() \
            .select("title", "revenue_musd", "budget_musd", "profit_musd", "release_year") \
            .orderBy(desc("revenue_musd")) \
            .limit(n) \
            .toPandas()

    def get_top_by_roi(self, n: int = 10) -> pd.DataFrame:
        """
        Get top N movies by return on investment.

        Movies without a positive ROI (including unknown ROI) are excluded.

        Args:
            n: Number of movies to return

        Returns:
            Pandas DataFrame with top ROI movies
        """
        return self._loaded_df() \
            .filter(col("roi") > 0) \
            .select("title", "roi", "budget_musd", "revenue_musd") \
            .orderBy(desc("roi")) \
            .limit(n) \
            .toPandas()

    def get_genre_stats(self) -> pd.DataFrame:
        """
        Get average metrics by primary genre.

        The genres column is a "|"-joined string (e.g. "Action|Adventure");
        the primary genre is its first entry. Movies with no genres get a
        NULL primary genre.

        Returns:
            Pandas DataFrame with genre statistics
        """
        # genres is a string, not an array, so split it before taking the
        # first element (1-based). Empty strings map to NULL via when().
        primary_genre = when(
            col("genres") != "",
            element_at(split(col("genres"), r"\|"), 1)
        )
        df_with_genre = self._loaded_df().withColumn("primary_genre", primary_genre)

        return df_with_genre \
            .groupBy("primary_genre") \
            .agg(
                count("*").alias("movie_count"),
                avg("revenue_musd").alias("avg_revenue_musd"),
                avg("budget_musd").alias("avg_budget_musd"),
                avg("roi").alias("avg_roi")
            ) \
            .orderBy(desc("avg_revenue_musd")) \
            .toPandas()

    def get_director_stats(self) -> pd.DataFrame:
        """
        Get statistics by director.

        Returns:
            Pandas DataFrame with director statistics
        """
        return self._loaded_df() \
            .filter(col("director").isNotNull()) \
            .groupBy("director") \
            .agg(
                count("*").alias("movie_count"),
                sum("revenue_musd").alias("total_revenue_musd"),
                avg("roi").alias("avg_roi")
            ) \
            .orderBy(desc("total_revenue_musd")) \
            .toPandas()

    def get_franchise_stats(self) -> pd.DataFrame:
        """
        Analyze movie franchises/collections.

        Returns:
            Pandas DataFrame with franchise statistics
        """
        return self._loaded_df() \
            .filter(col("belongs_to_collection").isNotNull()) \
            .groupBy("belongs_to_collection") \
            .agg(
                count("*").alias("movie_count"),
                sum("revenue_musd").alias("total_revenue_musd"),
                avg("roi").alias("avg_roi")
            ) \
            .orderBy(desc("total_revenue_musd")) \
            .toPandas()

    def search_by_actor(self, actor_name: str) -> pd.DataFrame:
        """
        Find movies featuring a specific actor.

        Only the stored top-billed cast names are searched.

        Args:
            actor_name: Name to search for (case-insensitive)

        Returns:
            Pandas DataFrame with matching movies
        """
        # Lower-case both sides so the match really is case-insensitive
        return self._loaded_df() \
            .filter(lower(col("cast")).contains(actor_name.lower())) \
            .select("title", "cast", "genres", "revenue_musd", "release_year") \
            .orderBy(desc("revenue_musd")) \
            .toPandas()

print("‚úÖ MovieAnalyzer class defined!")

In [None]:
# Initialize analyzer and load processed data
analyzer = MovieAnalyzer(spark)
analyzer.load_data(PROCESSED_PATH)

print("‚úÖ Analyzer ready!")

In [None]:
# Analysis 1: Top Movies by Revenue
print("üí∞ TOP 10 MOVIES BY REVENUE")
print("=" * 60)
top_revenue = analyzer.get_top_by_revenue(10)
display(top_revenue)

In [None]:
# Analysis 2: Top Movies by ROI (Return on Investment)
print("üìà TOP 10 MOVIES BY ROI")
print("=" * 60)
top_roi = analyzer.get_top_by_roi(10)
display(top_roi)

In [None]:
# Analysis 3: Director Statistics
print("üé¨ DIRECTOR STATISTICS")
print("=" * 60)
director_stats = analyzer.get_director_stats()
display(director_stats)

In [None]:
# Analysis 4: Franchise/Collection Analysis
print("üé≠ FRANCHISE ANALYSIS")
print("=" * 60)
franchise_stats = analyzer.get_franchise_stats()
display(franchise_stats)

In [None]:
# Analysis 5: Search for movies by actor
actor_name = "Robert Downey Jr."  # Try different actors!
print(f"üîç MOVIES FEATURING: {actor_name}")
print("=" * 60)
actor_movies = analyzer.search_by_actor(actor_name)
display(actor_movies)

---
## Step 5: Visualizations

Create charts to visualize our findings.

In [None]:
# Set up matplotlib style
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11

In [None]:
# Chart 1: Top 10 Movies by Revenue
fig, ax = plt.subplots(figsize=(12, 6))

# Ascending order puts the highest-grossing title at the top of the chart
data = top_revenue.sort_values('revenue_musd', ascending=True)
revenues = data['revenue_musd']

# Horizontal bars, one per movie
bars = ax.barh(data['title'], revenues, color='steelblue', edgecolor='navy')

# Write each revenue figure just right of its bar, vertically centred
for bar, value in zip(bars, revenues):
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(value + 20, label_y, f'${value:,.0f}M', va='center', fontsize=10)

ax.set_title('üé¨ Top 10 Highest-Grossing Movies', fontsize=14, fontweight='bold')
ax.set_xlabel('Revenue (Million USD)', fontsize=12)
# Extra room on the right so the value labels are not clipped
ax.set_xlim(0, revenues.max() * 1.15)

plt.tight_layout()
plt.show()

In [None]:
# Chart 2: Budget vs Revenue Scatter Plot
fig, ax = plt.subplots(figsize=(10, 8))

# Pull the plotted columns into pandas and keep only complete rows
full_df = (
    analyzer.df
    .select("title", "budget_musd", "revenue_musd", "roi")
    .toPandas()
    .dropna()
)
budgets = full_df['budget_musd']
revenues = full_df['revenue_musd']

# One point per movie, coloured by its ROI
scatter = ax.scatter(
    budgets,
    revenues,
    c=full_df['roi'],
    cmap='RdYlGn',
    s=100,
    alpha=0.7,
    edgecolors='black'
)

# Colour scale legend for ROI
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('ROI (Revenue/Budget)', fontsize=11)

# Diagonal where revenue equals budget; points above it earned back their cost
axis_max = max(budgets.max(), revenues.max())
ax.plot([0, axis_max], [0, axis_max], 'r--', alpha=0.5, label='Break-even line')

# Name the three highest-grossing movies on the plot
for _, row in full_df.nlargest(3, 'revenue_musd').iterrows():
    ax.annotate(row['title'], xy=(row['budget_musd'], row['revenue_musd']),
                xytext=(5, 5), textcoords='offset points', fontsize=9)

ax.set_title('üí∞ Budget vs Revenue (Color = ROI)', fontsize=14, fontweight='bold')
ax.set_xlabel('Budget (Million USD)', fontsize=12)
ax.set_ylabel('Revenue (Million USD)', fontsize=12)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Chart 3: Franchise Revenue Comparison
if len(franchise_stats) > 0:
    fig, ax = plt.subplots(figsize=(12, 6))

    data = franchise_stats.sort_values('total_revenue_musd', ascending=True)

    # One evenly spaced viridis colour per bar. Sampling the colormap with
    # floats in [0, 1] works for any number of franchises; an integer step of
    # 256 // len(data) becomes 0 (and fails) once there are more than 256 rows.
    n_bars = len(data)
    colors = plt.cm.viridis([i / max(n_bars - 1, 1) for i in range(n_bars)])
    bars = ax.barh(data['belongs_to_collection'], data['total_revenue_musd'], color=colors)

    # Write each total just right of its bar, vertically centred
    for bar, value in zip(bars, data['total_revenue_musd']):
        ax.text(value + 20, bar.get_y() + bar.get_height()/2,
                f'${value:,.0f}M', va='center', fontsize=10)

    ax.set_xlabel('Total Revenue (Million USD)', fontsize=12)
    ax.set_title('üé≠ Movie Franchise Revenue', fontsize=14, fontweight='bold')

    # Leave room on the right so the value labels are not clipped
    # (skipped when every total is missing, since NaN is not a valid limit)
    max_revenue = data['total_revenue_musd'].max()
    if pd.notna(max_revenue):
        ax.set_xlim(0, max_revenue * 1.15)

    plt.tight_layout()
    plt.show()
else:
    print("No franchise data available")

In [None]:
# Chart 4: ROI Distribution
fig, ax = plt.subplots(figsize=(10, 6))

# ROI values come from full_df, which the Budget vs Revenue chart cell builds
roi_data = full_df['roi'].dropna()
roi_mean = roi_data.mean()
roi_median = roi_data.median()

ax.hist(roi_data, bins=15, color='teal', edgecolor='black', alpha=0.7)

# Mark the centre of the distribution with dashed reference lines
ax.axvline(roi_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {roi_mean:.2f}x')
ax.axvline(roi_median, color='orange', linestyle='--', linewidth=2, label=f'Median: {roi_median:.2f}x')

ax.set_title('üìä Distribution of Return on Investment', fontsize=14, fontweight='bold')
ax.set_xlabel('ROI (Revenue / Budget)', fontsize=12)
ax.set_ylabel('Number of Movies', fontsize=12)
ax.legend()

plt.tight_layout()
plt.show()

---
## Summary

This notebook demonstrated a complete data pipeline using PySpark:

1. ✅ **Fetched** movie data from TMDB API with retry logic
2. ✅ **Processed** raw JSON into clean Parquet format
3. ✅ **Analyzed** data using PySpark SQL operations
4. ✅ **Visualized** key insights with Matplotlib

### Key Findings:
- Top grossing movies are primarily franchises (Avengers, Star Wars)
- ROI varies significantly - some lower-budget films outperform blockbusters
- Animation and superhero genres dominate box office returns

In [None]:
# Clean up: Stop Spark session when done
# Uncomment to run:
# spark.stop()
# print("‚úÖ Spark session stopped.")