### Fetching the data

In [None]:
import pandas as pd
import numpy as np
import ast
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the raw movies CSV.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where that directory exists.
file_path='/Users/gyauk/github/labs/IMBD_movie_analysis/Project1/data/raw/movies.csv'
# Read the CSV into the working DataFrame that the rest of the notebook mutates.
df=pd.read_csv(file_path)
# Preview the first rows to confirm the load.
df.head()

In [None]:
# List every column name so the irrelevant ones can be picked out below.
df.columns.to_list()


## Data Preparation & Cleaning

### Drop Irrelevant Columns

In [None]:
# Remove columns that are not used in the analysis.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(
    columns=['adult', 'imdb_id', 'original_title', 'video', 'homepage'],
    errors='ignore',
)
df.head()

In [None]:
# Report the Python type of the first value in every column, to spot
# columns that still hold stringified data.
for col in df.columns:
    first_value = df[col].iloc[0]
    print(f"{col}: {type(first_value)}")

### Evaluate JSON-like columns

In [None]:
def evaluate_json_column(column):
    """Convert one cell holding a stringified Python literal into the real object.

    Parameters
    ----------
    column : object
        A single cell value: typically a string such as "{'name': 'x'}" or
        "[{'name': 'x'}]", a missing value, or an already-parsed dict/list.

    Returns
    -------
    object
        The parsed literal; the value itself when it is already a dict or
        list; ``{}`` when the value is missing or cannot be parsed.
    """
    # Already-parsed containers pass through untouched. Without this guard a
    # second run of the cell sends dicts/lists into literal_eval, which raises
    # ValueError and would silently replace the parsed data with {}.
    if isinstance(column, (dict, list)):
        return column
    try:
        # Missing values become an empty dict; everything else is parsed.
        return ast.literal_eval(column) if pd.notna(column) else {}
    except (ValueError, SyntaxError):
        # Malformed or non-literal text falls back to an empty dict.
        return {}

# Columns whose cells are parsed with evaluate_json_column.
json_columns = [
    'belongs_to_collection',
    'genres',
    'production_countries',
    'production_companies',
    'spoken_languages',
    'credits',
]

# Replace each listed column with its parsed version.
for json_col in json_columns:
    parsed_values = df[json_col].apply(evaluate_json_column)
    df[json_col] = parsed_values


df.head()

In [None]:
# Re-print the first-value type of each column to confirm the parsing cell
# turned the listed columns into Python containers.
for col in df.columns:
    sample_value = df[col].iloc[0]
    print(f"{col}: {type(sample_value)}")

### Extract and clean key data points

In [None]:
def extract_collection_name(value):
    """Return the 'name' entry of a collection dict.

    Any value that is not a dict (missing value, list, string, ...) yields
    None, as does a dict without a 'name' key.
    """
    if not isinstance(value, dict):
        return None
    return value.get('name')

# Derive a plain-text 'collection_name' column from the parsed
# 'belongs_to_collection' values (None where no name is available).
df['collection_name'] = df['belongs_to_collection'].apply(extract_collection_name)


In [None]:
# Spot-check the extracted collection name of the row labelled 0.
df['collection_name'][0]

In [None]:
# Inspect the parsed genres value of the row labelled 0 before flattening it.
df['genres'][0]


In [None]:
def break_data_points(df, init_column, new_column):
  """Flatten a column of lists of dicts into ' | '-separated name strings.

  For every cell of ``df[init_column]`` that is a list, the 'name' value of
  each dict is joined with ' | '. Cells that are not lists become None.
  The result is stored in ``df[new_column]`` and that column is returned.
  """
  def _join_names(entries):
    if not isinstance(entries, list):
      return None
    return ' | '.join(entry['name'] for entry in entries)

  df[new_column] = df[init_column].apply(_join_names)
  return df[new_column]
      
    
# Build the flattened 'genre_names' column from the parsed 'genres' lists.
break_data_points(df,'genres','genre_names')


In [None]:
# Flatten the remaining list-of-dict columns into ' | '-separated name strings.
break_data_points(df, 'production_countries', 'cld_production_countries')
break_data_points(df, 'production_companies', 'cld_production_companies')
# NOTE(review): this writes spoken-language names into 'original_language',
# replacing whatever that column held before — confirm this is intended.
break_data_points(df, 'spoken_languages', 'original_language')


In [None]:
# df['credits'][0]

In [None]:
# The credits value is a dict whose 'cast' and 'crew' keys hold lists of dicts.
def extract_cast_names(credits):
    """Return the 'name' of every entry under credits['cast'].

    A credits dict without a 'cast' key yields an empty list.
    """
    names = []
    for member in credits.get('cast', []):
        names.append(member['name'])
    return names

def extract_crew_names(credits):
    """Return the 'name' of every entry under credits['crew'].

    A credits dict without a 'crew' key yields an empty list.
    """
    names = []
    for member in credits.get('crew', []):
        names.append(member['name'])
    return names

def extract_director(credits):
    """Return the name of the first crew member whose job is 'Director'.

    Returns None when there is no 'crew' list, no director in it, or the
    first director entry has no 'name'.
    """
    director_names = (
        member.get('name')
        for member in credits.get('crew', [])
        if member.get('job') == 'Director'
    )
    return next(director_names, None)

# Derive flat cast/crew/director columns and their sizes from 'credits'.
credits_series = df['credits']
df['cast'] = credits_series.apply(lambda entry: ' | '.join(extract_cast_names(entry)))
df['crew'] = credits_series.apply(lambda entry: ' | '.join(extract_crew_names(entry)))
df['director'] = credits_series.apply(extract_director)
df['cast_size'] = credits_series.apply(lambda entry: len(entry.get('cast', [])))
df['crew_size'] = credits_series.apply(lambda entry: len(entry.get('crew', [])))


In [None]:
# Preview all derived credit columns together. A notebook cell only renders
# its last expression, so five separate .head() calls would show just one.
df[['cast', 'crew', 'director', 'cast_size', 'crew_size']].head()

### Identify Anomalies with value_counts()

In [None]:
def get_value_counts(df, column):
    """Return how often each distinct value occurs in ``df[column]``."""
    selected = df[column]
    return selected.value_counts()
    

In [None]:
# Frequency of each flattened genre combination.
get_value_counts(df, 'genre_names')

In [None]:
# Frequency of each flattened production-country combination.
get_value_counts(df, 'cld_production_countries')

In [None]:
# Frequency of each value in 'original_language' (filled from spoken_languages above).
get_value_counts(df, 'original_language')

In [None]:
# 
def normalize_anomalies(genre_string):
    """Put the '|'-separated parts of a string into a canonical order.

    Each part is stripped of surrounding whitespace, the parts are sorted,
    and they are re-joined with ' | ', so that 'B | A' and 'A | B' collapse
    to the same value.

    Parameters
    ----------
    genre_string : str or missing value
        A ' | '-separated string. Non-string values (None, NaN) are returned
        unchanged, because break_data_points stores None for cells that were
        not lists and calling .split on those would raise AttributeError.

    Returns
    -------
    str or the original non-string value
    """
    if not isinstance(genre_string, str):
        return genre_string
    parts = sorted(part.strip() for part in genre_string.split('|'))
    return ' | '.join(parts)

# Canonicalise the part order in every flattened multi-value column.
for multi_value_col in (
    'genre_names',
    'cld_production_countries',
    'cld_production_companies',
    'original_language',
):
    df[multi_value_col] = df[multi_value_col].apply(normalize_anomalies)



### testing anomalies

In [None]:
# Re-count genre combinations after normalisation.
df['genre_names'].value_counts()

In [None]:
# Re-count production-country combinations after normalisation.
df['cld_production_countries'].value_counts()


In [None]:
# Re-count production-company combinations after normalisation.
df['cld_production_companies'].value_counts()


In [None]:
# Re-count language combinations after normalisation.
df['original_language'].value_counts()

### Handling Missing & Incorrect Data

In [None]:
def convert_to_numeric(df,column):
    """Coerce ``df[column]`` to a numeric dtype in place and print its summary.

    Values that cannot be parsed as numbers become NaN. The return value is
    whatever ``Series.info()`` returns (it prints the summary).
    """
    numeric_values = pd.to_numeric(df[column], errors='coerce')
    df[column] = numeric_values
    return df[column].info()

In [None]:
# Coerce the columns below to numbers; unparseable entries become NaN.
for numeric_col in ('id', 'popularity', 'budget'):
    convert_to_numeric(df, numeric_col)

In [None]:
def convert_to_datetime(df,column):
    """Parse ``df[column]`` as datetimes in place and print its summary.

    Uses pandas' default parsing, so an unparseable value raises. The return
    value is whatever ``Series.info()`` returns (it prints the summary).
    """
    parsed_dates = pd.to_datetime(df[column])
    df[column] = parsed_dates
    return df[column].info()

In [None]:
# Parse 'release_date' as datetimes so date accessors work on it later.
convert_to_datetime(df,'release_date')

### Replace unrealistic values:

In [None]:
def check_zero_in_column(df, column):
    """Print whether ``df[column]`` contains at least one value equal to 0."""
    has_zero = df[column].eq(0).any()
    if not has_zero:
        print(f"No zero values found in column '{column}'.")
        return
    print(f"Column '{column}' contains at least one value equal to 0.")
        

# Report which of the numeric columns contain zeros.
for zero_check_col in ('budget', 'revenue', 'runtime'):
    check_zero_in_column(df, zero_check_col)


- Convert Budget and Revenue to Million USD

In [None]:
# Express budget and revenue in millions of USD; revenue is rounded to 2 decimals.
df['budget_musd'] = df['budget'] / 1_000_000
df['revenue_musd'] = (df['revenue'] / 1_000_000).round(2)

# Drop the raw columns by reassignment rather than in place.
df = df.drop(columns=['budget', 'revenue'])

# Show both new columns together: a cell renders only its last expression,
# so two separate .head() calls would hide the first.
df[['budget_musd', 'revenue_musd']].head()



In [None]:
# Spot-check the vote count of the row labelled 0.
df['vote_count'][0]

In [None]:
def vote_count_zero(df, column):
    """Return the rows of ``df`` whose value in ``column`` equals 0."""
    is_zero = df[column] == 0
    return df[is_zero]
        
# List the movies that have a vote count of exactly 0.
vote_count_zero(df, 'vote_count')


In [None]:
# Number of distinct taglines (missing values are not counted by nunique).
df['tagline'].nunique()

In [None]:
# Number of distinct overviews (missing values are not counted by nunique).
df['overview'].nunique()


In [None]:
def check_for_nodata(df, column):
    """Return the rows of ``df`` whose value in ``column`` equals 0.

    NOTE(review): the comparison is against the number 0; if the intent is to
    find a text placeholder in string columns, this will not match it — confirm.
    """
    equals_zero = df[column] == 0
    return df[equals_zero]
        
# Look for rows whose overview / tagline equals 0.
# NOTE(review): only the last call is rendered by the notebook; the 'overview'
# result is computed and discarded.
check_for_nodata(df, 'overview')
check_for_nodata(df, 'tagline')


In [None]:
# dicts and lists are unhashable, so stringify them before asking pandas
# for duplicate rows.
def _stringify_containers(value):
    """Return str(value) for dicts/lists, and the value unchanged otherwise."""
    return str(value) if isinstance(value, (dict, list)) else value

# Column-wise Series.map replaces DataFrame.applymap, which newer pandas
# versions deprecate.
df_str = df.apply(lambda column: column.map(_stringify_containers))
duplicates = df_str.duplicated()
duplicates

In [None]:
def released_movies(df,title_column):
    """Return ``title_column`` for the rows whose 'status' is 'Released'.

    The input frame is left untouched: the 'status' column is removed from a
    new filtered frame rather than dropped in place on a slice, which avoids
    pandas' SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'status' column and ``title_column``.
    title_column : str
        Name of the column to return.

    Returns
    -------
    pandas.Series
        ``title_column`` restricted to released movies, original index kept.
    """
    released = df.loc[df['status'] == 'Released'].drop(columns=['status'])
    return released[title_column]

In [None]:
# Titles of released movies.
# NOTE(review): the result is only displayed; df itself is not filtered by this call.
released_movies(df,'title')

In [None]:
# Final column selection and order for the cleaned dataset.
new_order =['id', 'title', 'tagline', 'release_date', 'genres', 'belongs_to_collection',
            'original_language', 'budget_musd', 'revenue_musd', 'production_companies', 'production_countries',
            'vote_count', 'vote_average', 'popularity', 'runtime', 'overview',
            'spoken_languages', 'poster_path', 'cast', 'cast_size', 'director', 'crew_size']

# Take an explicit copy: later cells add columns to reordered_df, and doing
# that on a plain column selection of df triggers SettingWithCopyWarning.
reordered_df = df[new_order].copy()



In [None]:
# reset_index returns a new frame; assign it back so the reset actually
# takes effect, then preview the result.
reordered_df = reordered_df.reset_index(drop=True)
reordered_df.head()

In [None]:
# Confirm the column order of the reordered frame.
reordered_df.columns

In [None]:
# Write the reordered frame to the processed-data folder without the index.
# NOTE(review): absolute, machine-specific path; the f-string prefix has no placeholders.
reordered_df.to_csv(f"/Users/gyauk/github/labs/IMBD_movie_analysis/Project1/data/processed/reordered_movies.csv", index=False)

In [None]:
# Preview the first rows of the reordered frame.
reordered_df.head()

In [None]:
# Column names available to the KPI functions below.
reordered_df.columns

### KPI Implementation & Analysis

- highest revenue

In [None]:
def highest_revenue_movie(title,revenue_column):
    """Print the movie with the largest value in ``revenue_column``.

    Reads the notebook-level ``reordered_df``. The notebook passes a column
    expressed in millions of USD, so the message states the unit.

    Parameters
    ----------
    title : str
        Name of the column holding movie titles.
    revenue_column : str
        Name of the revenue column (millions of USD).

    Returns
    -------
    None
    """
    max_row = reordered_df.loc[reordered_df[revenue_column].idxmax()]
    print(f'{max_row[title]} generated the most revenue of USD {max_row[revenue_column]} million')

highest_revenue_movie('title','revenue_musd')
   

- Highest Budget

In [None]:
def highest_budget_movie(title,budget_column):
    """Print the movie with the largest value in ``budget_column``.

    Reads the notebook-level ``reordered_df``. The notebook passes a column
    expressed in millions of USD, so the message states the unit.

    Parameters
    ----------
    title : str
        Name of the column holding movie titles.
    budget_column : str
        Name of the budget column (millions of USD).

    Returns
    -------
    None
    """
    # Row with the maximum budget.
    max_row = reordered_df.loc[reordered_df[budget_column].idxmax()]
    print(f'{max_row[title]} had the highest budget of USD {max_row[budget_column]} million')

# Pass the budget column; the revenue column would report the top-revenue movie instead.
highest_budget_movie('title','budget_musd')

- Highest Profit (Revenue - Budget)



In [None]:
def highest_profit_movie(title,revenue_column,budget_column,profit_column):
    """Store profit (revenue - budget) in ``reordered_df`` and print the top movie.

    The profit is written to ``reordered_df[profit_column]`` so the column
    that is written and the column that is read are always the same one.

    Parameters
    ----------
    title : str
        Name of the column holding movie titles.
    revenue_column, budget_column : str
        Names of the revenue and budget columns (millions of USD in this notebook).
    profit_column : str
        Name of the column that receives the computed profit.

    Returns
    -------
    None
    """
    reordered_df[profit_column] = reordered_df[revenue_column] - reordered_df[budget_column]

    # Row with the highest profit.
    highest_profit_row = reordered_df.loc[reordered_df[profit_column].idxmax()]

    print(f'{highest_profit_row[title]} had the highest profit of USD {highest_profit_row[profit_column]} million')

highest_profit_movie('title','revenue_musd','budget_musd','profit')


- Lowest Profit (Revenue - Budget)


In [None]:
def lowest_profit_movie(title,revenue_column,budget_column,profit_column):
    """Print the movie with the lowest profit (revenue - budget).

    Profit is computed here from the revenue and budget columns, so the
    function does not depend on another cell having created
    ``profit_column`` first. ``profit_column`` is kept in the signature for
    compatibility with existing calls.

    Parameters
    ----------
    title : str
        Name of the column holding movie titles.
    revenue_column, budget_column : str
        Names of the revenue and budget columns (millions of USD in this notebook).
    profit_column : str
        Unused; retained so existing calls keep working.

    Returns
    -------
    None
    """
    profit = reordered_df[revenue_column] - reordered_df[budget_column]

    # Row with the lowest profit.
    lowest_index = profit.idxmin()
    lowest_profit_row = reordered_df.loc[lowest_index]
    print(f'{lowest_profit_row[title]} had the lowest profit of USD {profit.loc[lowest_index]} million')

lowest_profit_movie('title','revenue_musd','budget_musd','profit')


- Highest ROI (Revenue / Budget) (only movies with Budget ≥ 10M)
- Lowest ROI (only movies with Budget ≥ 10M)


In [None]:
def highest_roi(title,revenue_column,budget_column,roi_column,min_budget=10):
    """Print the movie with the highest ROI (revenue / budget).

    Only movies whose budget is at least ``min_budget`` are considered. The
    budget column used in this notebook is in millions of USD, so the default
    of 10 matches the "Budget ≥ 10M" rule stated above this cell.

    Parameters
    ----------
    title : str
        Name of the column holding movie titles.
    revenue_column, budget_column : str
        Names of the revenue and budget columns (same unit).
    roi_column : str
        Unused; retained so existing calls keep working.
    min_budget : float, default 10
        Minimum budget, in the unit of ``budget_column``.

    Returns
    -------
    None
    """
    eligible = reordered_df[reordered_df[budget_column] >= min_budget]
    if eligible.empty:
        print(f'No movies with a budget of at least {min_budget}')
        return

    # Compute ROI as a standalone Series; assigning a column on the filtered
    # slice would raise SettingWithCopyWarning.
    roi = eligible[revenue_column] / eligible[budget_column]
    best_index = roi.idxmax()

    print(f'{eligible.loc[best_index, title]} had the highest ROI of {roi.loc[best_index]}')

highest_roi('title','revenue_musd','budget_musd','roi')



- lowest ROI

In [None]:
def lowest_roi(title,revenue_column,budget_column,roi_column,min_budget=10):
    """Print the movie with the lowest ROI (revenue / budget).

    Only movies whose budget is at least ``min_budget`` are considered. The
    budget column used in this notebook is in millions of USD, so the default
    of 10 matches the "Budget ≥ 10M" rule.

    Parameters
    ----------
    title : str
        Name of the column holding movie titles.
    revenue_column, budget_column : str
        Names of the revenue and budget columns (same unit).
    roi_column : str
        Unused; retained so existing calls keep working.
    min_budget : float, default 10
        Minimum budget, in the unit of ``budget_column``.

    Returns
    -------
    None
    """
    eligible = reordered_df[reordered_df[budget_column] >= min_budget]
    if eligible.empty:
        print(f'No movies with a budget of at least {min_budget}')
        return

    # Compute ROI as a standalone Series; assigning a column on the filtered
    # slice would raise SettingWithCopyWarning.
    roi = eligible[revenue_column] / eligible[budget_column]
    worst_index = roi.idxmin()

    print(f'{eligible.loc[worst_index, title]} had the lowest roi of {roi.loc[worst_index]}')

lowest_roi('title','revenue_musd','budget_musd','roi')


- Most Voted Movie

In [None]:
def most_voted(title,vote_column):
    """Print the movie in ``reordered_df`` with the largest ``vote_column`` value."""
    top_index = reordered_df[vote_column].idxmax()
    top_movie = reordered_df.loc[top_index]
    print(f'{top_movie[title]} was the most voted movie with {top_movie[vote_column]} votes.')
    return None

most_voted('title','vote_count')

- Highest Rated Movie

In [None]:
def highest_rated(title,vote_column,vote_average):
    """Print the best-rated movie among those with at least 10 votes.

    Reads the notebook-level ``reordered_df``; ``vote_column`` is the vote
    count used for the filter and ``vote_average`` the rating column.
    """
    enough_votes = reordered_df[vote_column] >= 10
    candidates = reordered_df[enough_votes]
    best_index = candidates[vote_average].idxmax()
    best_movie = candidates.loc[best_index]
    print(f'{best_movie[title]} was the most highest rated movie with {best_movie[vote_average]} ratings.')
    return None

highest_rated('title','vote_count','vote_average')


- Lowest Rated Movie

In [None]:
def lowest_rated(title,vote_column,vote_average):
    """Print the worst-rated movie among those with at least 10 votes.

    Reads the notebook-level ``reordered_df``; ``vote_column`` is the vote
    count used for the filter and ``vote_average`` the rating column.
    """
    enough_votes = reordered_df[vote_column] >= 10
    candidates = reordered_df[enough_votes]
    worst_index = candidates[vote_average].idxmin()
    worst_movie = candidates.loc[worst_index]
    print(f'{worst_movie[title]} was the lowest rated movie with {worst_movie[vote_average]} ratings.')
    return None

lowest_rated('title','vote_count','vote_average')

- Most Popular

In [None]:
def most_popular(title,popular_column):
    """Print the movie in ``reordered_df`` with the largest ``popular_column`` value."""
    top_index = reordered_df[popular_column].idxmax()
    top_movie = reordered_df.loc[top_index]
    print(f'{top_movie[title]} was the most popular movie with popularity of {top_movie[popular_column]}')
    return None

most_popular('title','popularity')


### Advanced Movie Filtering & Search Queries

- Search 1: Find the best-rated Science Fiction Action movies starring Bruce Willis (sorted by Rating - highest to lowest)

In [None]:
# Genres a movie must have (all of them) and the actor it must star.
specific_genres = ['Science Fiction', 'Action']
actor_name = 'Bruce Willis'

def _has_all_genres(genres):
    """Return True when the movie's genre list contains every entry of specific_genres."""
    if not isinstance(genres, list):
        return False
    genre_names = {genre['name'] for genre in genres}
    return set(specific_genres) <= genre_names

# Keep movies that are both Science Fiction and Action.
filtered_genre_df = reordered_df[reordered_df['genres'].apply(_has_all_genres)]

# Keep movies whose ' | '-joined cast string mentions the actor.
filter_actor_df = filtered_genre_df[
    filtered_genre_df['cast'].str.contains(actor_name, regex=False, na=False)
]

# Sort by rating, highest first.
sorted_movies = filter_actor_df.sort_values(by='vote_average', ascending=False)

# Select the columns worth displaying.
best_rated_movies = sorted_movies[['id', 'title', 'vote_average', 'cast', 'genres']]

# Display the best-rated movies.
best_rated_movies

- Search 2: Find movies starring Uma Thurman, directed by Quentin Tarantino (sorted by runtime - shortest to longest). **TODO:** this search has no code cell yet.

### Franchise vs. Standalone Movie Performance

In [None]:
# A movie counts as part of a franchise when it has a collection name.
df['is_franchise'] = df['collection_name'].notna()

# Mean revenue per franchise status, with readable column names.
mean_revenue_comparison = (
    df.groupby('is_franchise', as_index=False)['revenue_musd']
    .mean()
    .rename(columns={'is_franchise': 'Is_Franchise', 'revenue_musd': 'Mean_Revenue_musd'})
)

# Replace the boolean flag with readable labels.
franchise_labels = {True: 'Franchise', False: 'Standalone'}
mean_revenue_comparison['Is_Franchise'] = mean_revenue_comparison['Is_Franchise'].map(franchise_labels)


mean_revenue_comparison

In [None]:
# ROI = revenue / budget. A budget of 0 is treated as missing so the
# division yields NaN instead of inf, which would distort the medians below.
df['roi'] = df['revenue_musd'] / df['budget_musd'].replace(0, np.nan)

# Median ROI per franchise status.
median_roi_comparison = df.groupby('is_franchise')['roi'].median().reset_index()

# Rename columns for clarity.
median_roi_comparison.columns = ['Is_Franchise', 'median_roi']

# Replace the boolean flag with readable labels.
median_roi_comparison['Is_Franchise'] = median_roi_comparison['Is_Franchise'].map({
    True: 'Franchise',
    False: 'Standalone'
})

median_roi_comparison

mean budget raised

In [None]:
# A movie counts as part of a franchise when it has a collection name.
df['is_franchise'] = df['collection_name'].notna()

# Mean budget per franchise status.
mean_budget_comparison = df.groupby('is_franchise')['budget_musd'].mean().reset_index()

# Rename columns for clarity.
mean_budget_comparison.columns = ['Is_Franchise', 'Mean_Budget_musd']

# Map this frame's own boolean flag to labels. Mapping the revenue frame's
# column (already converted to strings) would produce NaN labels.
mean_budget_comparison['Is_Franchise'] = mean_budget_comparison['Is_Franchise'].map({
    True: 'Franchise',
    False: 'Standalone'
})


mean_budget_comparison

Mean Popularity

In [None]:
# A movie counts as part of a franchise when it has a collection name.
df['is_franchise'] = df['collection_name'].notna()

# Mean popularity per franchise status, with readable column names.
mean_popularity_comparison = (
    df.groupby('is_franchise', as_index=False)['popularity']
    .mean()
    .rename(columns={'is_franchise': 'Is_Franchise', 'popularity': 'Mean_popularity'})
)

# Replace the boolean flag with readable labels.
popularity_labels = {True: 'Franchise', False: 'Standalone'}
mean_popularity_comparison['Is_Franchise'] = mean_popularity_comparison['Is_Franchise'].map(popularity_labels)

mean_popularity_comparison

Mean rating

In [None]:
# NOTE(review): this cell sits under a "Mean rating" heading but recomputes
# mean popularity, exactly like the preceding cell.
df['is_franchise'] = df['collection_name'].notna()

# Mean popularity per franchise status, with readable column names.
mean_popularity_comparison = (
    df.groupby('is_franchise', as_index=False)['popularity']
    .mean()
    .rename(columns={'is_franchise': 'Is_Franchise', 'popularity': 'Mean_popularity'})
)

# Replace the boolean flag with readable labels.
status_labels = {True: 'Franchise', False: 'Standalone'}
mean_popularity_comparison['Is_Franchise'] = mean_popularity_comparison['Is_Franchise'].map(status_labels)

# Display result
mean_popularity_comparison

-   mean rating

In [None]:
# A movie counts as part of a franchise when it has a collection name.
df['is_franchise'] = df['collection_name'].notna()

# Mean rating (vote_average) per franchise status, with readable column names.
mean_rating_comparison = (
    df.groupby('is_franchise', as_index=False)['vote_average']
    .mean()
    .rename(columns={'is_franchise': 'Is_Franchise', 'vote_average': 'Mean_vote_average'})
)

# Replace the boolean flag with readable labels.
rating_labels = {True: 'Franchise', False: 'Standalone'}
mean_rating_comparison['Is_Franchise'] = mean_rating_comparison['Is_Franchise'].map(rating_labels)

# Display result
mean_rating_comparison

### Most Successful Franchises & Directors

In [None]:
# Number of movies per collection; the largest count is reported as the
# most successful franchise.
franchise_counts = df['collection_name'].value_counts()
largest_franchise = franchise_counts.idxmax()
largest_franchise_size = franchise_counts.max()
print(f'Most successful franchise is {largest_franchise} with {largest_franchise_size} movies in a franchise')

In [None]:
# Group movies by collection; rows without a collection name are left out
# because groupby drops missing keys by default.
Franchise_df= df.groupby('collection_name')
# On a groupby object, head() returns the first five rows of every group,
# so this output can be long.
Franchise_df.head()

In [None]:
# Per-franchise statistics: output column -> (source column, aggregation).
franchise_aggregations = {
    'movie_count': ('id', 'count'),
    'total_budget': ('budget_musd', 'sum'),
    'mean_budget': ('budget_musd', 'mean'),
    'total_revenue': ('revenue_musd', 'sum'),
    'mean_revenue': ('revenue_musd', 'mean'),
    'mean_rating': ('vote_average', 'mean'),
}
franchise_summary = Franchise_df.agg(**franchise_aggregations).reset_index()


In [None]:
# Display the per-franchise summary table.
franchise_summary

In [None]:
# Write the per-franchise summary to the processed-data folder without the index.
# NOTE(review): absolute, machine-specific path; the f-string prefix has no placeholders.
franchise_summary.to_csv(f"/Users/gyauk/github/labs/IMBD_movie_analysis/Project1/data/processed/franchise.csv", index=False)


In [None]:
def sort_most_successful_movieinfranchise(collection_name,column):
    """Print the franchise that comes last when franchise_summary is sorted ascending by ``column``.

    ``collection_name`` is the name of the column holding franchise names.
    """
    ranked = franchise_summary.sort_values(column)
    # The last position of the ascending sort holds the largest value.
    top_franchise = ranked[collection_name].iloc[-1]
    print(f"{top_franchise} is the most sucessful movie franschise")
    return None

sort_most_successful_movieinfranchise('collection_name','movie_count')

In [None]:
def sort_total_budget(collection_name,column):
    """Print the franchise with the largest value of ``column`` in franchise_summary.

    idxmax skips missing values, whereas an ascending sort places NaN last
    and would report a franchise with no data. The message names the metric
    instead of always claiming "most successful".

    Parameters
    ----------
    collection_name : str
        Name of the column holding franchise names.
    column : str
        Name of the metric column to rank by.

    Returns
    -------
    None
    """
    top_index = franchise_summary[column].idxmax()
    top_franchise = franchise_summary.loc[top_index, collection_name]
    print(f"{top_franchise} ranks highest among franchises by {column}")



sort_total_budget('collection_name','total_budget')



In [None]:
def sort_mean_budget(collection_name,column):
    """Print the franchise with the largest value of ``column`` in franchise_summary.

    idxmax skips missing values, whereas an ascending sort places NaN last
    and would report a franchise with no data. The message names the metric
    instead of always claiming "most successful".

    Parameters
    ----------
    collection_name : str
        Name of the column holding franchise names.
    column : str
        Name of the metric column to rank by.

    Returns
    -------
    None
    """
    top_index = franchise_summary[column].idxmax()
    top_franchise = franchise_summary.loc[top_index, collection_name]
    print(f"{top_franchise} ranks highest among franchises by {column}")


sort_mean_budget('collection_name','mean_budget')



In [None]:
def sort_total_revenue(collection_name,column):
    """Print the franchise with the largest value of ``column`` in franchise_summary.

    idxmax skips missing values, whereas an ascending sort places NaN last
    and would report a franchise with no data. The message names the metric
    instead of always claiming "most successful".

    Parameters
    ----------
    collection_name : str
        Name of the column holding franchise names.
    column : str
        Name of the metric column to rank by.

    Returns
    -------
    None
    """
    top_index = franchise_summary[column].idxmax()
    top_franchise = franchise_summary.loc[top_index, collection_name]
    print(f"{top_franchise} ranks highest among franchises by {column}")


sort_total_revenue('collection_name','total_revenue')



In [None]:
def sort_mean_revenue(collection_name,column):
    """Print the franchise with the largest value of ``column`` in franchise_summary.

    idxmax skips missing values, whereas an ascending sort places NaN last
    and would report a franchise with no data. The message names the metric
    instead of always claiming "most successful".

    Parameters
    ----------
    collection_name : str
        Name of the column holding franchise names.
    column : str
        Name of the metric column to rank by.

    Returns
    -------
    None
    """
    top_index = franchise_summary[column].idxmax()
    top_franchise = franchise_summary.loc[top_index, collection_name]
    print(f"{top_franchise} ranks highest among franchises by {column}")

# Call the function defined in this cell, not a helper from another cell.
sort_mean_revenue('collection_name','mean_revenue')

In [None]:
def sort_mean_rating(collection_name,column):
    """Print the franchise with the largest value of ``column`` in franchise_summary.

    idxmax skips missing values, whereas an ascending sort places NaN last
    and would report a franchise with no data. The message names the metric
    instead of always claiming "most successful".

    Parameters
    ----------
    collection_name : str
        Name of the column holding franchise names.
    column : str
        Name of the metric column to rank by.

    Returns
    -------
    None
    """
    top_index = franchise_summary[column].idxmax()
    top_franchise = franchise_summary.loc[top_index, collection_name]
    print(f"{top_franchise} ranks highest among franchises by {column}")

# Call the function defined in this cell, not a helper from another cell.
sort_mean_rating('collection_name','mean_rating')

In [None]:
# franchise_budgets.head()

In [None]:
# Group movies by director; rows without a director are left out because
# groupby drops missing keys by default.
Franchise_director_df= reordered_df.groupby('director')
# On a groupby object, head() returns the first five rows of every group,
# so this output can be long.
Franchise_director_df.head()


In [None]:
# Per-director statistics: output column -> (source column, aggregation).
director_aggregations = {
    'num_movies_directed': ('id', 'count'),
    'total_revenue': ('revenue_musd', 'sum'),
    'mean_rating': ('vote_average', 'mean'),
}
franchise_director = Franchise_director_df.agg(**director_aggregations).reset_index()


In [None]:
# NOTE(review): a bare expression that is not the last line of a cell is not rendered.
franchise_director
# Write the per-director summary to the processed-data folder without the index.
# NOTE(review): absolute, machine-specific path; the f-string prefix has no placeholders.
franchise_director.to_csv(f"/Users/gyauk/github/labs/IMBD_movie_analysis/Project1/data/processed/franchise_director.csv", index=False)

In [None]:
def most_movies_directed(num_movies_directed,director_name):
    """Print the director who comes last when franchise_director is sorted ascending by movie count.

    ``num_movies_directed`` and ``director_name`` are column names of the
    notebook-level ``franchise_director`` frame.
    """
    ranked = franchise_director.sort_values(num_movies_directed)
    # The last position of the ascending sort holds the largest count.
    top_row = ranked.iloc[-1]
    print(f'{top_row[director_name]} has directed {top_row[num_movies_directed]} movies.')
    return None

most_movies_directed('num_movies_directed','director')

In [None]:
def most_successful_director_by_revenue(total_revenue,director_name):
    """Print the director with the largest total revenue in franchise_director.

    The revenue aggregated in this notebook is in millions of USD, so the
    message states the unit.

    Parameters
    ----------
    total_revenue : str
        Name of the total-revenue column.
    director_name : str
        Name of the column holding director names.

    Returns
    -------
    None
    """
    top_row = franchise_director.loc[franchise_director[total_revenue].idxmax()]
    print(f'{top_row[director_name]} is the most successful by generating an amount of USD {top_row[total_revenue]} million in revenue.')

most_successful_director_by_revenue('total_revenue','director')

In [None]:
def successful_director_meanrating(mean_rating,director_name):
    """Print the director with the highest mean rating in franchise_director.

    idxmax skips missing ratings, whereas an ascending sort places NaN last
    and would report a director with no rating.

    Parameters
    ----------
    mean_rating : str
        Name of the mean-rating column.
    director_name : str
        Name of the column holding director names.

    Returns
    -------
    None
    """
    top_row = franchise_director.loc[franchise_director[mean_rating].idxmax()]
    print(f'{top_row[director_name]} is the most successful by having a mean rating of {top_row[mean_rating]}.')

successful_director_meanrating('mean_rating','director')

In [None]:
# Write reordered_df again to the same processed CSV as the earlier export,
# overwriting that file.
# NOTE(review): absolute, machine-specific path; any columns added to reordered_df since the first export are now included — confirm intended.
reordered_df.to_csv('/Users/gyauk/github/labs/IMBD_movie_analysis/Project1/data/processed/reordered_movies.csv', index=False)


### Data Visualization

- Revenue vs. Budget Trends

In [None]:
# Scatter plot of revenue against budget (both in USD millions).
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter('budget_musd', 'revenue_musd', data=reordered_df)

# Title and axis labels.
ax.set_title('Revenue vs. Budget Trends', fontsize=16)
ax.set_xlabel('Budget  (USD millions)')
ax.set_ylabel('Revenue (USD millions)')

plt.show()

- ROI Distribution by Genre

In [None]:
# One row per (movie, single genre): split the ' | '-joined genre string and
# explode it, so each genre gets its own ROI distribution instead of one
# category per genre combination.
roi_by_genre = (
    df[['genre_names', 'roi']]
    .assign(genre=df['genre_names'].str.split('|'))
    .explode('genre')
)
roi_by_genre['genre'] = roi_by_genre['genre'].str.strip()

# Drop rows that cannot be plotted: infinite/missing ROI and empty genres.
roi_by_genre = roi_by_genre.replace([np.inf, -np.inf], np.nan).dropna(subset=['roi', 'genre'])
roi_by_genre = roi_by_genre[roi_by_genre['genre'] != '']

# A box plot shows a distribution per category; a line plot would connect
# unrelated categories.
fig, ax = plt.subplots(figsize=(30, 15))
sns.boxplot(x='genre', y='roi', data=roi_by_genre, ax=ax)
ax.set_title('ROI Distribution by Genre')
ax.set_xlabel('Genre')
ax.set_ylabel('ROI (revenue / budget)')
ax.tick_params(axis='x', rotation=90)
plt.show()



- Popularity vs. Rating

In [None]:
# Scatter plot of rating (vote_average) against popularity.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter('popularity', 'vote_average', data=df, alpha=0.6, color='blue')

# Title and axis labels.
ax.set_title('Popularity vs Movie Rating ')
ax.set_xlabel('Popularity')
ax.set_ylabel('Rating (vote_average)')

plt.show()


- Yearly Trends in Box Office Performance

In [None]:
# Release year of every movie (kept as a column on reordered_df).
reordered_df['release_year'] = reordered_df['release_date'].dt.year

# Total revenue and budget per release year; rows without a year are
# dropped by groupby.
yearly_df = (
    reordered_df.groupby('release_year')[['revenue_musd', 'budget_musd']]
    .sum()
    .reset_index()
)
# Years come out as floats when some release dates are missing; use integers
# so the axis reads 1995 rather than 1995.0.
yearly_df['release_year'] = yearly_df['release_year'].astype(int)

# Plot both series with labels so the legend has entries to show.
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(yearly_df['release_year'], yearly_df['revenue_musd'], marker='o', label='Total revenue')
ax.plot(yearly_df['release_year'], yearly_df['budget_musd'], marker='o', label='Total budget')
ax.set_title('Yearly Trends in Box Office Performance')
ax.set_xlabel('Release year')
ax.set_ylabel('USD millions')
ax.legend(bbox_to_anchor=(1, 1))
ax.tick_params(axis='x', rotation=90)
plt.show()

- Comparison of Franchise vs. Standalone Success

In [None]:
# Summary metrics per franchise status.
franchise_group = df.groupby('is_franchise').agg({
    'revenue_musd': 'mean',
    'roi': 'median',
    'popularity': 'mean',
    'vote_average': 'mean'
}).reset_index()

# Replace the boolean flag with readable labels.
franchise_group['is_franchise'] = franchise_group['is_franchise'].map({True: 'Franchise', False: 'Standalone'})

# Transpose so metrics sit on the x-axis and movie type forms the legend.
franchise_group.set_index('is_franchise', inplace=True)
franchise_group = franchise_group.T

# DataFrame.plot creates its own figure, so the size is passed to it
# directly; a preceding plt.figure() would only leave an empty figure behind.
ax = franchise_group.plot(kind='bar', figsize=(12, 7))
ax.set_title('Comparison of Franchise vs. Standalone Success')
ax.set_ylabel('Average / Median Values')
ax.tick_params(axis='x', rotation=0)
ax.legend(title='Movie Type')
plt.show()

