### In this notebook, we are going to process the extracted movie data 
### The main objective here is to transform the API data into a clean, analysis-ready dataset

### This notebook performs: 
#### 1. Column selection and dropping
#### 2. JSON field extraction 
#### 3. Data type conversions
#### 4. Missing value handling 
#### 5. Data validation 

In [21]:
import pandas as pd
import numpy as np
import json
from datetime import datetime

# Reload the raw movie/API data from notebook 01.
# The encoding is stated explicitly: without it open() falls back to the platform
# default, which is not UTF-8 everywhere and can garble or reject non-ASCII names.
with open('../data/raw/movies_raw.json', 'r', encoding='utf-8') as f:
    movies_raw = json.load(f)

# One row per movie record; nested JSON fields stay as Python objects for now.
df = pd.DataFrame(movies_raw)
print(f"Starting shape: {df.shape}")
print(f"Initial columns: {len(df.columns)}")

Starting shape: (18, 27)
Initial columns: 27


##### Dropping Irrelevant Columns
###### Rationale: These columns don't contribute to our analysis: 
###### adult: All movies are non-adult in this dataset (every value in the adult column is False)
###### imdb_id: We're using TMDB IDs as the primary key
###### original_title: 'title' is sufficient
###### video: Boolean flag, not useful for analysis
###### homepage: URLs not needed for metrics


In [22]:
# Columns that do not contribute to the analysis.
# 'credits' is intentionally absent from this list: it is needed for the
# cast/crew extraction later on.
columns_to_drop = ['adult', 'imdb_id', 'original_title', 'video', 'homepage']

# errors='ignore' skips any listed name that is not present in the frame.
df = df.drop(columns=columns_to_drop, errors='ignore')

print(f"Shape after dropping columns: {df.shape}")
print(f"Credits column present: {'credits' in df.columns}")

df.columns

Shape after dropping columns: (18, 22)
Credits column present: True


Index(['backdrop_path', 'belongs_to_collection', 'budget', 'genres', 'id',
       'origin_country', 'original_language', 'overview', 'popularity',
       'poster_path', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'tagline', 'title', 'vote_average', 'vote_count', 'credits'],
      dtype='object')

##### Extract JSON Fields

In [23]:
def extract_collection_name(collection_data):
    """Return the collection name from a ``belongs_to_collection`` value.

    Args:
        collection_data: Raw field value. Only a dict is inspected.

    Returns:
        The dict's ``'name'`` entry (``None`` when the key is absent), or
        ``None`` for every non-dict input such as ``None``, ``NaN``, a list
        or a string.
    """
    # Check the type first. Calling pd.isna() on a list-like returns an array,
    # and using that array as a condition raises ValueError instead of
    # falling through to "return None".
    if not isinstance(collection_data, dict):
        return None
    return collection_data.get('name')

def extract_names(json_list, key='name'):
    """Join one field from a list of dicts into a ``'|'``-separated string.

    Args:
        json_list: Raw field value; expected to be a list of dicts. Anything
            that is not a list (``None``, ``NaN``, a string, ...) yields ``None``.
        key: Dictionary key whose value is collected from each item.

    Returns:
        The collected values joined with ``'|'``, or ``None`` when the input
        is not a list or no usable value was found.
    """
    if not isinstance(json_list, list):
        return None

    names = []
    for item in json_list:
        if not isinstance(item, dict):
            continue
        value = item.get(key)
        # Skip absent/empty values: keeping them would produce empty segments
        # such as 'A||B', and a None value would make str.join raise TypeError.
        if value is None or value == '':
            continue
        names.append(str(value))

    return '|'.join(names) if names else None

# Flatten the nested JSON columns into plain text.
# The collection column holds a single dict; the others hold lists of dicts,
# each mapped here to the dictionary key that carries the display name.
list_column_keys = {
    'genres': 'name',
    'spoken_languages': 'english_name',
    'production_countries': 'name',
    'production_companies': 'name',
}

df['belongs_to_collection'] = df['belongs_to_collection'].apply(extract_collection_name)
for list_column, name_key in list_column_keys.items():
    df[list_column] = df[list_column].apply(extract_names, key=name_key)

print("JSON extraction complete")
print(f"\nGenres value counts:\n{df['genres'].value_counts().head()}")

JSON extraction complete

Genres value counts:
genres
Adventure|Action|Science Fiction             3
Action|Adventure|Science Fiction|Thriller    2
Action|Adventure|Science Fiction             2
Action|Adventure|Fantasy|Science Fiction     1
Drama|Romance                                1
Name: count, dtype: int64


#### Extracting Cast and Crew information

In [24]:

def extract_cast(credits, top_n=5):
    """Extract the leading cast names and the total cast size.

    Args:
        credits: Raw ``credits`` value; expected to be a dict with a ``'cast'``
            list of member dicts. Any non-dict input yields ``(None, 0)``.
        top_n: How many entries from the start of the cast list are considered
            for the name string (default 5).

    Returns:
        A ``(names, cast_size)`` tuple. ``names`` is the ``'|'``-joined names of
        the first ``top_n`` cast entries (``None`` if none has a name) and
        ``cast_size`` is the length of the whole cast list.
    """
    # Type-check instead of pd.isna(): pd.isna() on a list-like returns an
    # array, which cannot be used as a condition.
    if not isinstance(credits, dict):
        return None, 0

    cast_list = credits.get('cast')
    if not isinstance(cast_list, list) or not cast_list:
        return None, 0

    # Ignore malformed members and missing names so that str.join never sees
    # None and the result has no empty segments.
    top_cast = [
        str(member['name'])
        for member in cast_list[:top_n]
        if isinstance(member, dict) and member.get('name')
    ]
    return ('|'.join(top_cast) or None), len(cast_list)

def extract_director(credits):
    """Extract the director's name and the crew size.

    Args:
        credits: Raw ``credits`` value; expected to be a dict with a ``'crew'``
            list of member dicts. Any non-dict input yields ``(None, 0)``.

    Returns:
        A ``(director, crew_size)`` tuple. ``director`` is the name of the first
        crew member whose ``'job'`` is ``'Director'`` (``None`` if there is
        none) and ``crew_size`` is the length of the crew list.
    """
    if not isinstance(credits, dict):
        return None, 0

    # 'crew' may be present but null; treat that like an empty crew rather
    # than letting len(None) raise TypeError.
    crew_list = credits.get('crew') or []

    directors = [
        member.get('name')
        for member in crew_list
        if isinstance(member, dict)
        and member.get('job') == 'Director'
        and member.get('name')
    ]
    # NOTE(review): only the first match is kept, so additional directors of a
    # co-directed film are dropped; confirm that is the intended behaviour.
    director = directors[0] if directors else None
    return director, len(crew_list)

# Each helper returns a (text, count) tuple; wrapping it in a Series makes
# apply() expand the tuple into two columns.
cast_frame = df['credits'].apply(lambda entry: pd.Series(extract_cast(entry)))
crew_frame = df['credits'].apply(lambda entry: pd.Series(extract_director(entry)))

df[['cast', 'cast_size']] = cast_frame
df[['director', 'crew_size']] = crew_frame

# The nested source column is no longer needed once it has been flattened.
df = df.drop(columns=['credits'])

print("Cast/Crew extraction complete")
print(f"Directors found: {df['director'].notna().sum()}")

Cast/Crew extraction complete
Directors found: 18


##### Data Type Conversions
###### Numeric: budget, revenue, id, popularity, runtime, vote_count, vote_average
###### Datetime: release_date
###### String: title, tagline, overview

In [25]:
# Columns that should hold numbers.
numeric_cols = ['budget', 'id', 'popularity', 'revenue',
                'runtime', 'vote_count', 'vote_average']

# errors='coerce' turns anything unparseable into NaN instead of raising.
for col in (name for name in numeric_cols if name in df.columns):
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Same policy for dates: unparseable release dates become NaT.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

print("Data type conversions complete")
print(f"\nData types:\n{df.dtypes}")


Data type conversions complete

Data types:
backdrop_path                    object
belongs_to_collection            object
budget                            int64
genres                           object
id                                int64
origin_country                   object
original_language                object
overview                         object
popularity                      float64
poster_path                      object
production_companies             object
production_countries             object
release_date             datetime64[ns]
revenue                           int64
runtime                           int64
spoken_languages                 object
status                           object
tagline                          object
title                            object
vote_average                    float64
vote_count                        int64
cast                             object
cast_size                         int64
director                         obj

In [None]:
# Preview the first five rows of the working frame.
df.head(5)

Unnamed: 0,backdrop_path,belongs_to_collection,budget,genres,id,origin_country,original_language,overview,popularity,poster_path,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,cast_size,director,crew_size
0,/9wXPKruA6bWYk2co5ix6fH59Qr8.jpg,The Avengers Collection,356000000,Adventure|Science Fiction|Action,299534,[US],en,After the devastating events of Avengers: Infi...,12.0878,/bR8ISy1O9XQxqiy0fQFw2BX72RQ.jpg,...,English|Japanese|Xhosa,Released,Avenge the fallen.,Avengers: Endgame,8.237,26978,Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,105,Anthony Russo,607
1,/7JNzw1tSZZEgsBw6lu0VfO2X2Ef.jpg,Avatar Collection,237000000,Action|Adventure|Fantasy|Science Fiction,19995,[US],en,"In the 22nd century, a paraplegic Marine is di...",38.2316,/gKY6q7SjCkAU6FqvqWybDYgUKIF.jpg,...,English|Spanish,Released,Enter the world of Pandora.,Avatar,7.594,32883,Sam Worthington|Zoe Saldaña|Sigourney Weaver|S...,65,James Cameron,990
2,/8BTsTfln4jlQrLXUBquXJ0ASQy9.jpg,Star Wars Collection,245000000,Adventure|Action|Science Fiction,140607,[US],en,Thirty years after defeating the Galactic Empi...,7.5842,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,...,English,Released,Every generation has a story.,Star Wars: The Force Awakens,7.3,20104,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,183,J.J. Abrams,262
3,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,The Avengers Collection,300000000,Adventure|Action|Science Fiction,299536,[US],en,As the Avengers and their allies have continue...,20.7267,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,...,English|Xhosa,Released,Destiny arrives all the same.,Avengers: Infinity War,8.235,31188,Robert Downey Jr.|Chris Evans|Chris Hemsworth|...,69,Joe Russo,733
4,/xnHVX37XZEp33hhCbYlQFq7ux1J.jpg,,200000000,Drama|Romance,597,[US],en,101-year-old Rose DeWitt Bukater tells the sto...,23.4289,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,...,English|French|German|Swedish|Italian|Russian,Released,Nothing on earth could come between them.,Titanic,7.903,26519,Leonardo DiCaprio|Kate Winslet|Billy Zane|Kath...,116,James Cameron,262


#### Handling Invalid and Missing Data

In [26]:
# Replace unrealistic values.
#
# Value cleaning strategy:
# - budget / revenue / runtime == 0  -> NaN (a zero here is an invalid entry)
# - vote_count == 0                  -> vote_average set to NaN
# - known placeholder texts          -> NaN

# Series.mask() returns a new Series that may be upcast to float. Writing NaN
# into an integer column through .loc is an incompatible-dtype assignment,
# which recent pandas versions warn about.
for col in ['budget', 'revenue', 'runtime']:
    df[col] = df[col].mask(df[col] == 0)

# An average computed over zero votes carries no information.
df['vote_average'] = df['vote_average'].mask(df['vote_count'] == 0)

# Replace placeholder text. Values are compared after trimming whitespace and
# lower-casing, so variants such as ' N/A ' or 'NO DATA' are caught as well.
placeholder_texts = ['No Data', 'no data', 'N/A', 'n/a', '']
normalized_placeholders = {text.strip().lower() for text in placeholder_texts}
for col in ['overview', 'tagline']:
    if col in df.columns:
        # astype('string') keeps missing values missing and works for any dtype.
        normalized = df[col].astype('string').str.strip().str.lower()
        df[col] = df[col].mask(normalized.isin(normalized_placeholders))

# Convert to millions USD
df['budget_musd'] = df['budget'] / 1_000_000
df['revenue_musd'] = df['revenue'] / 1_000_000

# Drop the original dollar-denominated columns
df = df.drop(columns=['budget', 'revenue'])

print("Invalid value handling complete")
print(f"Budget null count: {df['budget_musd'].isna().sum()}")
print(f"Revenue null count: {df['revenue_musd'].isna().sum()}")

Invalid value handling complete
Budget null count: 0
Revenue null count: 0


#### Data Quality Checks 

In [29]:
# Removing duplicates and invalid rows.
#
# Quality checks:
# - Remove duplicate IDs
# - Drop rows without id or title
# - Keep only rows with >= 10 non-null columns
# - Filter for 'Released' status only
#
# Every step rebinds `df` to a new frame. Calling an inplace method on the
# result of a boolean filter operates on a slice and triggers
# SettingWithCopyWarning.

# Removing duplicates (the first occurrence of each id is kept)
df = df.drop_duplicates(subset=['id'], keep='first')
print(f"After duplicate removal: {df.shape}")

# Dropping rows without id or title
df = df.dropna(subset=['id', 'title'])
print(f"After removing missing id/title: {df.shape}")

# Keep rows with at least 10 non-null values
min_non_null = 10
df = df.dropna(thresh=min_non_null)
print(f"After {min_non_null}+ column requirement: {df.shape}")

# Filter for 'Released' movies only; the column is constant afterwards, so drop it
if 'status' in df.columns:
    df = df[df['status'] == 'Released'].drop(columns=['status'])
    print(f"After filtering for Released status: {df.shape}")

After duplicate removal: (18, 22)
After removing missing id/title: (18, 22)
After 10+ column requirement: (18, 22)


#### Reordering and Finalization 

In [30]:
# Reorder columns and reset index.
#
# Final structure (logical ordering for analysis):
# - identifiers first
# - key metrics in the middle
# - descriptive fields last
column_order = [
    'id', 'title', 'tagline', 'release_date', 'genres',
    'belongs_to_collection', 'original_language',
    'budget_musd', 'revenue_musd', 'production_companies',
    'production_countries', 'vote_count', 'vote_average',
    'popularity', 'runtime', 'overview', 'spoken_languages',
    'poster_path', 'cast', 'cast_size', 'director', 'crew_size'
]

# Only names that are actually present are selected; columns that are not
# listed in column_order are dropped by this selection.
existing_cols = [name for name in column_order if name in df.columns]
df = df.loc[:, existing_cols].reset_index(drop=True)

print(f"Final cleaned DataFrame shape: {df.shape}")
print(f"\nColumn order:\n{df.columns.tolist()}")

Final cleaned DataFrame shape: (18, 22)

Column order:
['id', 'title', 'tagline', 'release_date', 'genres', 'belongs_to_collection', 'original_language', 'budget_musd', 'revenue_musd', 'production_companies', 'production_countries', 'vote_count', 'vote_average', 'popularity', 'runtime', 'overview', 'spoken_languages', 'poster_path', 'cast', 'cast_size', 'director', 'crew_size']


In [None]:
# Preview the first rows of the reordered frame.
df.head()

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,director,crew_size
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure|Science Fiction|Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,...,8.237,12.0878,181.0,After the devastating events of Avengers: Infi...,English|Japanese|Xhosa,/bR8ISy1O9XQxqiy0fQFw2BX72RQ.jpg,Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,105,Anthony Russo,607
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action|Adventure|Fantasy|Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment|Lightstorm Entertainment|20...,...,7.594,38.2316,162.0,"In the 22nd century, a paraplegic Marine is di...",English|Spanish,/gKY6q7SjCkAU6FqvqWybDYgUKIF.jpg,Sam Worthington|Zoe Saldaña|Sigourney Weaver|S...,65,James Cameron,990
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure|Action|Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd.|Bad Robot,...,7.3,7.5842,136.0,Thirty years after defeating the Galactic Empi...,English,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,183,J.J. Abrams,262
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure|Action|Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,...,8.235,20.7267,149.0,As the Avengers and their allies have continue...,English|Xhosa,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,Robert Downey Jr.|Chris Evans|Chris Hemsworth|...,69,Joe Russo,733
4,597,Titanic,Nothing on earth could come between them.,1997-11-18,Drama|Romance,,en,200.0,2264.162353,Paramount Pictures|20th Century Fox|Lightstorm...,...,7.903,23.4289,194.0,101-year-old Rose DeWitt Bukater tells the sto...,English|French|German|Swedish|Italian|Russian,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,Leonardo DiCaprio|Kate Winslet|Billy Zane|Kath...,116,James Cameron,262


#### Further feature engineering on some of the columns (bonus)

In [31]:
# Extract temporal features
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month
df['release_quarter'] = df['release_date'].dt.quarter
df['days_since_release'] = (pd.Timestamp.now() - df['release_date']).dt.days

# Calculate ROI and profit
df['roi'] = ((df['revenue_musd'] - df['budget_musd']) / df['budget_musd']) * 100
df['profit_musd'] = df['revenue_musd'] - df['budget_musd']

# Genre count (diversity metric)
df['genre_count'] = df['genres'].str.split('|').str.len()

# Success metrics
df['is_profitable'] = (df['profit_musd'] > 0).astype(int)

In [None]:
# Preview the first rows including the engineered feature columns.
df.head()

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,director,crew_size,release_year,release_month,release_quarter,days_since_release,roi,profit_musd,genre_count,is_profitable
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure|Science Fiction|Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,...,Anthony Russo,607,2019,4,2,2421,686.359298,2443.4391,3,1
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action|Adventure|Fantasy|Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment|Lightstorm Entertainment|20...,...,James Cameron,990,2009,12,4,5838,1133.631235,2686.706026,4,1
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure|Action|Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd.|Bad Robot,...,J.J. Abrams,262,2015,12,4,3647,744.172908,1823.223624,3,1
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure|Action|Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,...,Joe Russo,733,2018,4,2,2785,584.138346,1752.415039,3,1
4,597,Titanic,Nothing on earth could come between them.,1997-11-18,Drama|Romance,,en,200.0,2264.162353,Paramount Pictures|20th Century Fox|Lightstorm...,...,James Cameron,262,1997,11,4,10248,1032.081177,2064.162353,2,1


In [32]:
# Outlier detection and handling

# Identify outliers using IQR method
def detect_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. Missing values never compare as outliers.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    too_low = values.lt(q1 - 1.5 * iqr)
    too_high = values.gt(q3 + 1.5 * iqr)
    return df.loc[too_low | too_high]

# Report how many rows fall outside the IQR fences for each key metric
key_metrics = ('budget_musd', 'revenue_musd', 'runtime', 'popularity')
for col in key_metrics:
    outliers = detect_outliers_iqr(df, col)
    print(f"{col}: {outliers.shape[0]} outliers detected")

budget_musd: 0 outliers detected
revenue_musd: 1 outliers detected
runtime: 2 outliers detected
popularity: 2 outliers detected


In [15]:
# We keep the outliers and flag them instead of removing them
def flag_outliers(df, column):
    """Add a 0/1 ``<column>_is_outlier`` column to ``df`` and return ``df``.

    The flag is 1 where the value lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` and 0 otherwise. The frame passed in is modified
    in place; the same object is returned.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    iqr = third_quartile - first_quartile

    below = values < first_quartile - 1.5 * iqr
    above = values > third_quartile + 1.5 * iqr
    df[f'{column}_is_outlier'] = (below | above).astype(int)
    return df

# Add one <metric>_is_outlier flag column per key metric
metrics_to_flag = ('budget_musd', 'revenue_musd', 'runtime', 'popularity')
for col in metrics_to_flag:
    df = flag_outliers(df, col)
    print(f"{col}: Outliers flagged in '{col}_is_outlier' column")

budget_musd: Outliers flagged in 'budget_musd_is_outlier' column
revenue_musd: Outliers flagged in 'revenue_musd_is_outlier' column
runtime: Outliers flagged in 'runtime_is_outlier' column
popularity: Outliers flagged in 'popularity_is_outlier' column


In [33]:
# Data validation and quality checks

# Completeness score per row = share of non-null values across the columns.
# The score column itself is excluded from the calculation: otherwise a re-run
# of this cell would count the previous score as one more (always non-null)
# column and produce different values.
feature_cols = df.columns.drop('completeness_score', errors='ignore')
df['completeness_score'] = df[feature_cols].notna().sum(axis=1) / len(feature_cols)

# Validate logical constraints
print("\nData Quality Checks:")
print(f"Movies with revenue < budget: {(df['revenue_musd'] < df['budget_musd']).sum()}")
print(f"Movies with future release dates: {(df['release_date'] > pd.Timestamp.now()).sum()}")
print(f"Invalid runtimes (<5 or >300 min): {((df['runtime'] < 5) | (df['runtime'] > 300)).sum()}")
print(f"Invalid ratings (not 0-10): {((df['vote_average'] < 0) | (df['vote_average'] > 10)).sum()}")


Data Quality Checks:
Movies with revenue < budget: 0
Movies with future release dates: 0
Invalid runtimes (<5 or >300 min): 0
Invalid ratings (not 0-10): 0


In [17]:
# Missing data report: one row per column, most incomplete columns first
null_counts = df.isna().sum()
missing_report = pd.DataFrame({
    'column': df.columns,
    'missing_count': null_counts,
    'missing_pct': (null_counts / len(df) * 100).round(2),
    'dtype': df.dtypes
}).sort_values('missing_pct', ascending=False)

# Only columns that actually have missing values are printed
print("\nMissing Data Report:")
print(missing_report.loc[missing_report['missing_pct'] > 0])


Missing Data Report:
                                      column  missing_count  missing_pct  \
production_companies    production_companies             18        100.0   
belongs_to_collection  belongs_to_collection             18        100.0   
genres                                genres             18        100.0   
spoken_languages            spoken_languages             18        100.0   
genre_count                      genre_count             18        100.0   
production_countries    production_countries             18        100.0   

                        dtype  
production_companies   object  
belongs_to_collection  object  
genres                 object  
spoken_languages       object  
genre_count            object  
production_countries   object  


In [None]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,days_since_release,roi,profit_musd,genre_count,is_profitable,budget_musd_is_outlier,revenue_musd_is_outlier,runtime_is_outlier,popularity_is_outlier,completeness_score
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure|Science Fiction|Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,...,2421,686.359298,2443.4391,3,1,0,0,1,0,1.0
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action|Adventure|Fantasy|Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment|Lightstorm Entertainment|20...,...,5838,1133.631235,2686.706026,4,1,0,1,0,1,1.0
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure|Action|Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd.|Bad Robot,...,3647,744.172908,1823.223624,3,1,0,0,0,0,1.0
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure|Action|Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,...,2785,584.138346,1752.415039,3,1,0,0,0,0,1.0
4,597,Titanic,Nothing on earth could come between them.,1997-11-18,Drama|Romance,,en,200.0,2264.162353,Paramount Pictures|20th Century Fox|Lightstorm...,...,10248,1032.081177,2064.162353,2,1,0,0,1,0,0.970588


#### A Comprehensive Data Profiling Summary

In [34]:

def create_profile(df):
    """Summarise a frame as a labelled Series of headline statistics.

    Reports row and column counts, deep memory usage in MB, the number of
    fully duplicated rows, how many columns are numeric / object / datetime,
    and the min-to-max span of the ``release_date`` column.
    """
    def column_count(*kinds):
        # Number of columns whose dtype matches any of the given kinds.
        return len(df.select_dtypes(include=list(kinds)).columns)

    release_dates = df['release_date']
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2

    return pd.Series({
        'Total Records': len(df),
        'Total Features': len(df.columns),
        'Memory Usage (MB)': memory_mb,
        'Duplicate Rows': df.duplicated().sum(),
        'Numeric Features': column_count(np.number),
        'Categorical Features': column_count('object'),
        'DateTime Features': column_count('datetime64'),
        'Date Range': f"{release_dates.min()} to {release_dates.max()}"
    })

# Build the profile first, then print it under its heading
dataset_profile = create_profile(df)
print("\nDataset Profile:")
print(dataset_profile)


Dataset Profile:
Total Records                                                   18
Total Features                                                  31
Memory Usage (MB)                                          0.02392
Duplicate Rows                                                   0
Numeric Features                                                18
Categorical Features                                            12
DateTime Features                                                1
Date Range              1997-11-18 00:00:00 to 2019-11-20 00:00:00
dtype: object


#### Saving Analysis Results 

In [35]:

import os

# Output locations
final_csv_path = '../data/final/movies_with_kpis.csv'
report_path = '../reports/kpi_summary.txt'

os.makedirs(os.path.dirname(final_csv_path), exist_ok=True)

# Franchise indicator: 1 when the movie belongs to a collection, else 0
df['is_franchise'] = df['belongs_to_collection'].notna().astype(int)

# Save processed DataFrame with calculated metrics
df.to_csv(final_csv_path, index=False)

# Summary report
os.makedirs(os.path.dirname(report_path), exist_ok=True)

with open(report_path, 'w') as f:
    f.writelines([
        "TMDB MOVIE ANALYSIS - KPI SUMMARY\n",
        "=" * 80 + "\n\n",
        f"Analysis Date: {pd.Timestamp.now()}\n",
        f"Total Movies Analyzed: {len(df)}\n\n",
        f"Franchise Movies: {df['is_franchise'].sum()}\n",
        f"Standalone Movies: {(df['is_franchise'] == 0).sum()}\n\n",
        # Financial metrics
        "FINANCIAL METRICS:\n",
        f"Average Budget: ${df['budget_musd'].mean():.2f}M\n",
        f"Average Revenue: ${df['revenue_musd'].mean():.2f}M\n",
        f"Average ROI: {df['roi'].mean():.2f}%\n",
        f"Profitable Movies: {df['is_profitable'].sum()}\n\n",
        # Quality metrics
        "QUALITY METRICS:\n",
        f"Average Rating: {df['vote_average'].mean():.2f}/10\n",
        f"Average Runtime: {df['runtime'].mean():.0f} minutes\n",
        f"Average Popularity: {df['popularity'].mean():.2f}\n",
    ])

print("Data saved successfully!")
print(f"Location: {final_csv_path}")
print(f"Report: {report_path}")

Data saved successfully!
Location: ../data/final/movies_with_kpis.csv
Report: ../reports/kpi_summary.txt


In [36]:
# Read the saved file back as a sanity check.
# CSV stores dates as plain text, so release_date is parsed explicitly;
# without parse_dates it would come back as strings rather than datetimes.
data = pd.read_csv('../data/final/movies_with_kpis.csv', parse_dates=['release_date'])
data.head()

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,release_year,release_month,release_quarter,days_since_release,roi,profit_musd,genre_count,is_profitable,completeness_score,is_franchise
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure|Science Fiction|Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,...,2019,4,2,2421,686.359298,2443.4391,3,1,1.0,1
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action|Adventure|Fantasy|Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment|Lightstorm Entertainment|20...,...,2009,12,4,5838,1133.631235,2686.706026,4,1,1.0,1
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure|Action|Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd.|Bad Robot,...,2015,12,4,3647,744.172908,1823.223624,3,1,1.0,1
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure|Action|Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,...,2018,4,2,2785,584.138346,1752.415039,3,1,1.0,1
4,597,Titanic,Nothing on earth could come between them.,1997-11-18,Drama|Romance,,en,200.0,2264.162353,Paramount Pictures|20th Century Fox|Lightstorm...,...,1997,11,4,10248,1032.081177,2064.162353,2,1,0.966667,0
