In [None]:
import sys
import os
# Put the parent of the current working directory on sys.path. Note that it is
# the parent directory that gets added, not a `src` folder itself — presumably
# so a package living next to the notebook folder can be imported.
src_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if not any(entry == src_path for entry in sys.path):
    sys.path.append(src_path)

In [None]:

from pyspark.sql import SparkSession
from src import helper_functions as hf
from pyspark.sql.types import *
from pyspark.sql.functions import * 


### Fetching the data

In [None]:
# Initialize Spark (getOrCreate returns the active session if one already exists)
spark = SparkSession.builder.appName("TMDB Movies").getOrCreate()

# Location of the raw movies JSON. The original machine-specific path is kept
# as the default so existing runs behave the same; set the TMDB_MOVIES_JSON
# environment variable to point the notebook at the file on another machine.
json_path = os.environ.get(
    "TMDB_MOVIES_JSON",
    "/Users/gyauk/github/labs/Pyspark_IMBD_movie_analysis/data/raw/movies.json",
)

# Load JSON data using the explicit schema supplied by the helper module
df = spark.read.schema(hf.schema_build()).json(json_path)


In [None]:
# Print the first five rows of the loaded DataFrame
df.show(5)

### Data Preparation & Cleaning

### Drop Irrelevant Columns

In [None]:
# Columns treated as irrelevant for this analysis; DataFrame.drop takes them
# as separate positional names, hence the unpacking.
cols_to_drop = [
    'adult',
    'imdb_id',
    'original_title',
    'video',
    'homepage',
]
df = df.drop(*cols_to_drop)

In [None]:
# Preview a single row of the nested columns before they are flattened
df.select(
    "genres",
    "production_companies",
    "spoken_languages",
    "credits.cast",
    "credits.crew",
).show(1, truncate=True)

### Evaluate JSON-like columns, then extract and clean their data points

In [None]:
# Hand the DataFrame to the helper that processes the JSON-like columns and
# continue with whatever it returns (implementation is in helper_functions).
df = hf.extract_and_clean_json_columns(df)

### Handling Missing & Incorrect Data

In [None]:
# Show untruncated value counts for each of the inspected columns, in order
for _inspected_column in ("genre_names", "spoken_languages", "collection_name"):
    hf.value_counts(df, _inspected_column).show(truncate=False)

- Convert column datatypes

In [None]:
# Convert selected columns to their proper data types via the helper
df = hf.convert_column_types(df)

- Replace unrealistic values:

In [None]:
# Delegate replacement of unrealistic values to the helper; the exact rules
# live in helper_functions and are not visible from this notebook.
df = hf.replace_unrealistic_data(df)

- Remove duplicates

In [None]:
# Delegate duplicate / missing-data cleanup to the helper and keep its result
df = hf.clean_duplicates_and_missing_data(df)

- Extract 'cast', 'cast_size', 'director', and 'crew_size' from the credits column

In [None]:
# Derive the credits-based fields through the helper (presumably the cast /
# director / size columns that the reorder list below refers to — confirm).
df = hf.extract_credits_info(df)

### Reorder & Finalize DataFrame

In [None]:
# Preferred column layout for the final DataFrame
new_order = [
    'id', 'title', 'tagline', 'release_date', 'genres', 'belongs_to_collection',
    'original_language', 'budget_musd', 'revenue_musd', 'production_companies',
    'production_countries', 'vote_count', 'vote_average', 'popularity', 'runtime',
    'overview', 'spoken_languages', 'poster_path', 'cast', 'cast_size', 'director', 'crew_size',
]

# Only names that actually exist in df are selected; any name missing from df
# is skipped without an error, so the result can be narrower than new_order.
# The loop variable is called `name` rather than `col` so it does not read like
# the `col` function pulled in by the pyspark.sql.functions star import.
df_reordered = df.select(*[name for name in new_order if name in df.columns])

In [None]:
# Cell output: the column names of the reordered DataFrame
df_reordered.columns

In [None]:
# Preview one row of the reordered DataFrame. The second argument was the
# undefined name `Truncate`, which raises NameError; pass the `truncate`
# keyword instead. True keeps long cell values shortened, matching the earlier
# single-row preview — switch to truncate=False to see full values.
df_reordered.show(1, truncate=True)

### Analysis Using the `hf` Helper Functions

- highest revenue

In [None]:
# Rank by revenue using the title and revenue columns.
# NOTE(review): df is rebound to the helper's return value — confirm the helper
# returns the full DataFrame, because later cells keep reading df.
df = hf.highest_revenue_movie(df, 'title', 'revenue_musd')
   

- Highest Budget

In [None]:
# Rank by budget using the title and budget columns.
# NOTE(review): unlike the neighbouring ranking cells, the return value is not
# assigned back to df here — confirm that the difference is intentional.
hf.highest_budget_movie(df, 'title', 'budget_musd')

- Highest Profit (Revenue - Budget)



In [None]:
# Highest profit, computed by the helper from the revenue and budget columns.
# NOTE(review): df is rebound to the helper's return value — confirm it is the
# full DataFrame (possibly with extra columns) and not just the top rows.
df = hf.highest_profit_movie(df, 'title', 'revenue_musd', 'budget_musd')


- Lowest Profit (Revenue - Budget)


In [None]:
# Lowest profit, computed by the helper from the revenue and budget columns.
# NOTE(review): df is rebound to the helper's return value — confirm it is
# still the full DataFrame.
df = hf.lowest_profit_movie(df, 'title', 'revenue_musd', 'budget_musd')

- Highest ROI (Revenue / Budget) (only movies with Budget ≥ 10M) and Lowest ROI (only movies with Budget ≥ 10M)


In [None]:
# Highest ROI; any budget threshold is applied inside the helper, not here.
# NOTE(review): df is rebound to the helper's return value — confirm it is
# still the full DataFrame.
df = hf.highest_roi(df, 'title', 'revenue_musd', 'budget_musd')

In [None]:
# Lowest ROI; any budget threshold is applied inside the helper, not here.
# NOTE(review): df is rebound to the helper's return value — confirm it is
# still the full DataFrame.
df = hf.lowest_roi(df, 'title', 'revenue_musd', 'budget_musd')


- Most Voted Movie

In [None]:
# Most voted movie, ranked by the helper on the vote_count column.
# NOTE(review): df is rebound to the helper's return value — confirm it is
# still the full DataFrame.
df = hf.most_voted(df, 'title', 'vote_count')

- Highest Rated Movie

In [None]:
# Highest rated movie; the helper receives both vote_count and vote_average
# (presumably to require a minimum number of votes — confirm in the helper).
# NOTE(review): df is rebound to the helper's return value.
df = hf.highest_rated(df, 'title', 'vote_count', 'vote_average')


- Lowest Rated Movie

In [None]:
# Lowest rated movie; the helper receives both vote_count and vote_average
# (presumably to require a minimum number of votes — confirm in the helper).
# NOTE(review): df is rebound to the helper's return value.
df = hf.lowest_rated(df, 'title', 'vote_count', 'vote_average')

- Most Popular

In [None]:
# Most popular movie, ranked by the helper on the popularity column.
# NOTE(review): df is rebound to the helper's return value — confirm it is
# still the full DataFrame.
df = hf.most_popular(df, 'title', 'popularity')


In [None]:
# Cell output: the column names df has after the ranking helpers above have
# each rebound it — useful for checking what those helpers returned.
df.columns

### Advanced Movie Filtering & Search Queries

- Search 1: Find the best-rated Science Fiction Action movies starring Bruce Willis (sorted by Rating - highest to lowest)

In [None]:
# Run the first advanced search; the filter criteria are fixed inside the helper.
# NOTE(review): df is rebound to the search result. If the helper returns only
# the matching rows, every later cell (second search, franchise flag, director
# summary, plots) will operate on that subset — confirm this is intended.
df = hf.advanced_search_rating(df)

- Search 2: Find movies starring Uma Thurman, directed by Quentin Tarantino (sorted by runtime - shortest to longest).

In [None]:
# Run the second advanced search; the filter criteria are fixed inside the helper.
# NOTE(review): df is rebound to the search result. If the helper returns only
# the matching rows, all later cells will operate on that subset — confirm.
df = hf.advanced_search_runtime(df)

### Franchise vs. Standalone Movie Performance

- Creating and populating a "is_franchise" column

In [None]:
# Build a separate DataFrame carrying the is_franchise column (df itself is
# left bound to its previous value), then preview the flag next to the title.
df_with_franchise_flag = hf.add_is_franchise_column(df)
df_with_franchise_flag.select(
    "title",
    "is_franchise",
).show(5)

In [None]:
# Cell output: the column names of the franchise-flagged DataFrame
df_with_franchise_flag.columns

- mean revenue

In [None]:
# Mean revenue aggregate from the helper, computed on the flagged DataFrame,
# then displayed.
mean_revenue_df = hf.mean_revenue_by_franchise(
    df_with_franchise_flag
)
mean_revenue_df.show()

Median ROI

In [None]:
# Median ROI aggregate from the helper, computed on the flagged DataFrame,
# then displayed.
median_roi = hf.median_roi_by_franchise(df_with_franchise_flag)
median_roi.show()

Mean Popularity

In [None]:
# Mean popularity aggregate from the helper, computed on the flagged DataFrame,
# then displayed.
mean_popularity = hf.mean_popularity_by_franchise(df_with_franchise_flag)
mean_popularity.show()

mean budget raised

In [None]:
# NOTE(review): the variable is named mean_budget but the helper called is
# mean_rating_by_franchise — either the name or the call looks wrong. Confirm
# which metric is intended; behaviour is left unchanged here.
mean_budget = hf.mean_rating_by_franchise(df_with_franchise_flag)
mean_budget.show()

### Most Successful Franchises & Directors

In [None]:
# Build the franchise summary table from the flagged DataFrame and display it;
# the sort cells below read columns of this table by name.
franchise_summary = hf.generate_franchise_summary(df_with_franchise_flag)
franchise_summary.show()

In [None]:
# Sort the summary on mean_budget via the helper.
# NOTE(review): franchise_summary is rebound to the helper's return value —
# confirm the helper returns the (sorted) summary rather than None or a subset.
franchise_summary = hf.sort_mean_budget(franchise_summary, 'collection_name', 'mean_budget')

In [None]:
# Cell output: the column names of franchise_summary as it is currently bound
franchise_summary.columns

In [None]:
# Sort the summary on total_budget via the helper.
# NOTE(review): franchise_summary is rebound to the helper's return value —
# confirm it is still the full summary table.
franchise_summary = hf.sort_total_budget(franchise_summary, 'collection_name', 'total_budget')

In [None]:
# Sort the summary on total_revenue via the helper.
# NOTE(review): franchise_summary is rebound to the helper's return value —
# confirm it is still the full summary table.
franchise_summary = hf.sort_total_revenue(franchise_summary, 'collection_name', 'total_revenue')

In [None]:
# Sort the summary on mean_revenue via the helper.
# NOTE(review): franchise_summary is rebound to the helper's return value —
# confirm it is still the full summary table.
franchise_summary = hf.sort_mean_revenue(franchise_summary, 'collection_name', 'mean_revenue')


In [None]:
# Sort the summary on mean_rating via the helper.
# NOTE(review): franchise_summary is rebound to the helper's return value —
# confirm it is still the full summary table.
franchise_summary = hf.sort_mean_rating(franchise_summary, 'collection_name', 'mean_rating')

In [None]:
# Rank franchises on movie_count via the helper.
# NOTE(review): franchise_summary is rebound to the helper's return value —
# confirm it is still the full summary table.
franchise_summary = hf.sort_most_successful_movieinfranchise(
    franchise_summary, 'collection_name', 'movie_count'
)


In [None]:
# Build the per-director table under its own name. This result used to be
# bound to franchise_summary, which discarded the franchise summary and left a
# director table stored under a franchise name.
director_df = hf.generate_director_df(df)

In [None]:
# Build the per-director table from df and display it; the ranking cells that
# follow read its columns by name.
director_df = hf.generate_director_df(df)
director_df.show()

In [None]:
# Rank directors on movie_count; the helper's return value is not kept
hf.most_movies_directed(director_df, 'director', 'movie_count')

In [None]:
# Rank directors on total_revenue; the helper's return value is not kept
hf.most_successful_director_by_revenue(director_df, 'director', 'total_revenue')


In [None]:
# Rank directors on mean_rating; the helper's return value is not kept
hf.successful_director_meanrating(director_df, 'director', 'mean_rating')


### Data Visualization

- Revenue vs. Budget Trends

In [None]:
# Call the revenue-vs-budget helper on the working DataFrame (presumably draws
# the plot as a side effect — implementation is in helper_functions).
hf.revenue_vs_budget(df)

- ROI Distribution by Genre

In [None]:
# Call the ROI-by-genre helper on the working DataFrame (presumably draws the
# plot as a side effect — implementation is in helper_functions).
hf.roi_distribution_by_genre(df)

- Popularity vs. Rating

In [None]:
# Call the popularity-vs-rating helper on the working DataFrame (presumably
# draws the plot as a side effect — implementation is in helper_functions).
hf.popularity_vs_rating(df)

- Yearly Trends in Box Office Performance

In [None]:
# Call the yearly box-office helper. Note this one is given the
# franchise-flagged DataFrame rather than df.
hf.yearly_box_office_performance(df_with_franchise_flag)

- Comparison of Franchise vs. Standalone Success

In [None]:
# Call the franchise-vs-standalone comparison helper on the franchise-flagged
# DataFrame (the is_franchise column is presumably what it groups on — confirm).
hf.franchise_vs_standalone_success(df_with_franchise_flag)