# Data Exploration and Feature Engineering

In [100]:
import pandas as pd
import json
import re
import numpy as np

In [101]:
df = pd.read_csv("../data/dataset.csv")

# Filter out adult films
df = df[df['adult'] == False]

# Remove movies with zero revenue OR zero budget
df = df[(df['revenue'] > 0) & (df['budget'] > 0)]

print(f"Dataset shape (after removing adult films and zero revenue/budget movies): {df.shape}")
df.head()

Dataset shape (after removing adult films and zero revenue/budget movies): (4767, 14)


Unnamed: 0,id,imdb_id,title,release_date,runtime,budget,revenue,adult,spoken_languages,genres,production_companies,production_countries,keywords,release_year
0,293660,tt1431045,Deadpool,2016-02-09,108,58000000,783100000,False,English,"Action, Adventure, Comedy","20th Century Fox, The Donners' Company, Genre ...",United States of America,"superhero, anti hero, mercenary, based on comi...",2016
1,299536,tt4154756,Avengers: Infinity War,2018-04-25,149,300000000,2052415039,False,"English, Xhosa","Adventure, Action, Science Fiction",Marvel Studios,United States of America,"sacrifice, magic, superhero, based on comic, s...",2018
2,299534,tt4154796,Avengers: Endgame,2019-04-24,181,356000000,2800000000,False,"English, Japanese, Xhosa","Adventure, Science Fiction, Action",Marvel Studios,United States of America,"superhero, time travel, space travel, time mac...",2019
3,475557,tt7286456,Joker,2019-10-01,122,55000000,1074458282,False,English,"Crime, Thriller, Drama","Warner Bros. Pictures, Joint Effort, Village R...","Canada, United States of America","dream, street gang, society, psychopath, clown...",2019
4,99861,tt2395427,Avengers: Age of Ultron,2015-04-22,141,365000000,1405403694,False,English,"Action, Adventure, Science Fiction",Marvel Studios,United States of America,"artificial intelligence (a.i.), superhero, bas...",2015


## Basic Data Info

In [102]:
df.info()

print("\nMissing values:")
missing = df.isnull().sum()
missing[missing > 0]

<class 'pandas.core.frame.DataFrame'>
Index: 4767 entries, 0 to 426698
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    4767 non-null   int64 
 1   imdb_id               2719 non-null   object
 2   title                 4767 non-null   object
 3   release_date          4767 non-null   object
 4   runtime               4767 non-null   int64 
 5   budget                4767 non-null   int64 
 6   revenue               4767 non-null   int64 
 7   adult                 4767 non-null   bool  
 8   spoken_languages      3803 non-null   object
 9   genres                4472 non-null   object
 10  production_companies  3702 non-null   object
 11  production_countries  3644 non-null   object
 12  keywords              3268 non-null   object
 13  release_year          4767 non-null   int64 
dtypes: bool(1), int64(5), object(8)
memory usage: 526.0+ KB

Missing values:


imdb_id                 2048
spoken_languages         964
genres                   295
production_companies    1065
production_countries    1123
keywords                1499
dtype: int64

## Production Companies Analysis

In [103]:
# Check production companies format
df['production_companies'].head(10)

0    20th Century Fox, The Donners' Company, Genre ...
1                                       Marvel Studios
2                                       Marvel Studios
3    Warner Bros. Pictures, Joint Effort, Village R...
4                                       Marvel Studios
5                                       Marvel Studios
6                                       Marvel Studios
7                                       Marvel Studios
8    Kennedy Miller Productions, Warner Bros. Pictu...
9    Marvel Studios, Pascal Pictures, LStar Capital...
Name: production_companies, dtype: object

In [104]:
# Define major studios and subsidiaries
major_studios = {
    'Disney': ['Disney', 'Marvel Studios', 'Lucasfilm', 'Pixar', 'Touchstone'],
    'Warner Bros': ['Warner Bros', 'New Line Cinema', 'DC Films'],
    'Universal': ['Universal', 'Focus Features', 'Illumination'],
    'Sony': ['Sony', 'Columbia', 'Screen Gems', 'TriStar'],
    'Paramount': ['Paramount', 'MTV Films'],
    'Fox': ['20th Century Fox', 'Fox Searchlight', 'Blue Sky'],
    'MGM': ['MGM', 'United Artists'],
    'Lionsgate': ['Lionsgate', 'Summit Entertainment']
}

def get_primary_studio(companies):
    if pd.isna(companies):
        return 'Unknown'
    
    companies_lower = companies.lower()
    for studio, keywords in major_studios.items():
        for keyword in keywords:
            if keyword.lower() in companies_lower:
                return studio
    
    first_company = companies.split(',')[0].strip()
    return first_company if first_company else 'Unknown'

df['primary_studio'] = df['production_companies'].apply(get_primary_studio)

print("Top studios:")
print(df['primary_studio'].value_counts().head(10))

Top studios:
primary_studio
Unknown             1065
Universal            164
Warner Bros          137
Sony                 117
Lionsgate             87
Disney                80
Paramount             76
Fox                   69
Queens Of Combat      25
Austmon               14
Name: count, dtype: int64


In [105]:
# Studio boolean flags
df['is_disney'] = df['production_companies'].str.contains('Disney|Marvel Studios|Lucasfilm|Pixar|Touchstone', case=False, na=False).astype(int)
df['is_warner_bros'] = df['production_companies'].str.contains('Warner Bros|New Line Cinema|DC Films', case=False, na=False).astype(int)
df['is_universal'] = df['production_companies'].str.contains('Universal|Focus Features|Illumination', case=False, na=False).astype(int)
df['is_sony'] = df['production_companies'].str.contains('Sony|Columbia|Screen Gems|TriStar', case=False, na=False).astype(int)
df['is_paramount'] = df['production_companies'].str.contains('Paramount|MTV Films', case=False, na=False).astype(int)
df['is_fox'] = df['production_companies'].str.contains('20th Century Fox|Fox Searchlight|Blue Sky', case=False, na=False).astype(int)
df['is_mgm'] = df['production_companies'].str.contains('MGM|United Artists', case=False, na=False).astype(int)
df['is_lionsgate'] = df['production_companies'].str.contains('Lionsgate|Summit Entertainment', case=False, na=False).astype(int)

df['is_major_studio'] = (df['is_disney'] | df['is_warner_bros'] | df['is_universal'] | 
                         df['is_sony'] | df['is_paramount'] | df['is_fox'] | 
                         df['is_mgm'] | df['is_lionsgate']).astype(int)

print("Studio counts:")
print(f"Disney: {df['is_disney'].sum()}")
print(f"Warner Bros: {df['is_warner_bros'].sum()}")
print(f"Universal: {df['is_universal'].sum()}")
print(f"Sony: {df['is_sony'].sum()}")
print(f"Paramount: {df['is_paramount'].sum()}")
print(f"Fox: {df['is_fox'].sum()}")
print(f"MGM: {df['is_mgm'].sum()}")
print(f"Lionsgate: {df['is_lionsgate'].sum()}")
print(f"Major studio total: {df['is_major_studio'].sum()} ({df['is_major_studio'].mean()*100:.1f}%)")

Studio counts:
Disney: 80
Warner Bros: 137
Universal: 167
Sony: 122
Paramount: 78
Fox: 71
MGM: 1
Lionsgate: 91
Major studio total: 730 (15.3%)


## Genre Analysis

In [106]:
# Examine all unique genres
print("Exploring all unique genres in the dataset:")

# Split and explode genres to get individual genre counts
genres_exploded = df['genres'].dropna().str.split(',').explode().str.strip()
unique_genres = genres_exploded.value_counts()

print(f"Total unique genres: {len(unique_genres)}")
print(f"Total genre entries (including duplicates): {len(genres_exploded)}")
print("\nAll genres sorted by frequency:")
print(unique_genres)

Exploring all unique genres in the dataset:
Total unique genres: 19
Total genre entries (including duplicates): 10159

All genres sorted by frequency:
genres
Drama              1883
Comedy             1388
Action             1004
Thriller            891
Horror              644
Adventure           621
Crime               451
Romance             441
Documentary         416
Mystery             400
Family              374
Fantasy             373
Science Fiction     361
Animation           268
History             215
Music               212
War                 138
Western              43
TV Movie             36
Name: count, dtype: int64


In [107]:
# One-hot encode all genres
print("Creating one-hot encoded features for all genres:")

# Get all unique genres
genres_exploded = df['genres'].dropna().str.split(',').explode().str.strip()
all_genres = genres_exploded.value_counts().index.tolist()

# Create boolean flag for each genre
genre_columns = []
for genre in all_genres:
    col_name = f'genre_{genre.lower().replace(" ", "_").replace("-", "_")}'
    df[col_name] = df['genres'].str.contains(genre, case=False, na=False).astype(int)
    genre_columns.append(col_name)

print(f"Created {len(genre_columns)} genre features:")
print("\nGenre feature counts:")
for i, col in enumerate(genre_columns):
    count = df[col].sum()
    original_genre = all_genres[i]
    print(f"{original_genre}: {count}")
    
print(f"\nGenre columns created: {genre_columns[:10]}..." if len(genre_columns) > 10 else f"\nGenre columns created: {genre_columns}")

Creating one-hot encoded features for all genres:
Created 19 genre features:

Genre feature counts:
Drama: 1882
Comedy: 1388
Action: 1004
Thriller: 891
Horror: 644
Adventure: 621
Crime: 451
Romance: 441
Documentary: 416
Mystery: 400
Family: 374
Fantasy: 373
Science Fiction: 361
Animation: 268
History: 215
Music: 212
War: 138
Western: 43
TV Movie: 36

Genre columns created: ['genre_drama', 'genre_comedy', 'genre_action', 'genre_thriller', 'genre_horror', 'genre_adventure', 'genre_crime', 'genre_romance', 'genre_documentary', 'genre_mystery']...


In [108]:
# Genre combinations using new one-hot encoded features
df['is_action_adventure'] = (df['genre_action'] & df['genre_adventure']).astype(int)
df['is_action_sci_fi'] = (df['genre_action'] & df['genre_science_fiction']).astype(int)
df['is_comedy_romance'] = (df['genre_comedy'] & df['genre_romance']).astype(int)
df['is_family_animation'] = (df['genre_family'] & df['genre_animation']).astype(int)

print("Popular genre combinations:")
print(f"Action + Adventure: {df['is_action_adventure'].sum()}")
print(f"Action + Sci-Fi: {df['is_action_sci_fi'].sum()}")
print(f"Comedy + Romance: {df['is_comedy_romance'].sum()}")
print(f"Family + Animation: {df['is_family_animation'].sum()}")

Popular genre combinations:
Action + Adventure: 303
Action + Sci-Fi: 182
Comedy + Romance: 180
Family + Animation: 140


In [109]:
# Primary genre and genre count
df['primary_genre'] = df['genres'].str.split(',').str[0].fillna('Unknown')
df['genre_count'] = df['genres'].str.count(',').fillna(-1) + 1

print("Top genres:")
print(df['primary_genre'].value_counts().head(10))
print(f"\nAverage genres per movie: {df['genre_count'].mean():.2f}")
print(f"Max genres in one movie: {df['genre_count'].max()}")

Top genres:
primary_genre
Drama          989
Comedy         744
Action         607
Horror         400
Documentary    333
Unknown        295
Thriller       241
Animation      162
Adventure      160
Romance        139
Name: count, dtype: int64

Average genres per movie: 2.13
Max genres in one movie: 11.0


## Country Analysis

In [110]:
# Primary country and country flags
df['primary_country'] = df['production_countries'].str.split(',').str[0].fillna('Unknown')
df['is_usa'] = df['production_countries'].str.contains('United States', case=False, na=False).astype(int)
df['is_uk'] = df['production_countries'].str.contains('United Kingdom|UK', case=False, na=False).astype(int)
df['is_us_uk'] = (df['is_usa'] | df['is_uk']).astype(int)

print("Top countries:")
print(df['primary_country'].value_counts().head(10))
print(f"\nCountry flags:")
print(f"US movies: {df['is_usa'].sum()} ({df['is_usa'].mean()*100:.1f}%)")
print(f"UK movies: {df['is_uk'].sum()} ({df['is_uk'].mean()*100:.1f}%)")
print(f"US/UK movies: {df['is_us_uk'].sum()} ({df['is_us_uk'].mean()*100:.1f}%)")

Top countries:
primary_country
United States of America    1214
Unknown                     1123
India                        294
United Kingdom               253
Brazil                       187
Canada                       166
France                       149
China                         97
Spain                         87
Australia                     80
Name: count, dtype: int64

Country flags:
US movies: 1688 (35.4%)
UK movies: 415 (8.7%)
US/UK movies: 1899 (39.8%)


In [111]:
# Language analysis
df['is_english'] = df['spoken_languages'].str.contains('English', case=False, na=False).astype(int)
df['language_count'] = df['spoken_languages'].str.count(',').fillna(-1) + 1
df['is_multilingual'] = (df['language_count'] > 1).astype(int)

print("Language distribution:")
print(f"English movies: {df['is_english'].sum()} ({df['is_english'].mean()*100:.1f}%)")
print(f"Multilingual movies: {df['is_multilingual'].sum()} ({df['is_multilingual'].mean()*100:.1f}%)")
print(f"Average languages per movie: {df['language_count'].mean():.2f}")

Language distribution:
English movies: 2352 (49.3%)
Multilingual movies: 825 (17.3%)
Average languages per movie: 1.08


## Sequel and IP Detection

In [112]:
# Sequel detection (US/UK movies only, excluding documentaries)
sequel_patterns = r'\b(?:2|3|4|5|II|III|IV|V|Part|Chapter|Episode|Returns|Rises|Begins)\b|:'
df['is_sequel_title'] = df['title'].str.contains(sequel_patterns, case=False, na=False).astype(int)

# Only count franchise movies if they are US/UK productions and NOT documentaries
us_uk_mask = df['is_us_uk'] == 1
not_documentary_mask = ~df['genres'].str.contains('Documentary', case=False, na=False)
franchise_mask = us_uk_mask & not_documentary_mask

# Exact list of legitimate Marvel films
marvel_movies = [
    # Phase One
    "Iron Man",
    "The Incredible Hulk", 
    "Iron Man 2",
    "Thor",
    "Captain America: The First Avenger",
    "The Avengers",
    
    # Phase Two
    "Iron Man 3",
    "Thor: The Dark World",
    "Captain America: The Winter Soldier",
    "Guardians of the Galaxy",
    "Avengers: Age of Ultron",
    "Ant-Man",
    
    # Phase Three
    "Captain America: Civil War",
    "Doctor Strange",
    "Guardians of the Galaxy Vol. 2",
    "Spider-Man: Homecoming",
    "Thor: Ragnarok",
    "Black Panther",
    "Avengers: Infinity War",
    "Ant-Man and the Wasp",
    "Captain Marvel",
    "Avengers: Endgame",
    "Spider-Man: Far From Home",
    
    # Phase Four
    "Black Widow",
    "Shang-Chi and the Legend of the Ten Rings",
    "Eternals",
    "Spider-Man: No Way Home",
    "Doctor Strange in the Multiverse of Madness",
    "Thor: Love and Thunder",
    "Black Panther: Wakanda Forever",
    
    # Phase Five
    "Ant-Man and the Wasp: Quantumania",
    "Guardians of the Galaxy Vol. 3",
    "The Marvels",
    "Deadpool & Wolverine",
    "Captain America: Brave New World",
    "Thunderbolts*",
    "The Fantastic Four: First Steps",
    
    # Phase Six and Beyond
    "Spider-Man: Brand New Day",
    "Avengers: Doomsday",
    "Avengers: Secret Wars",

]

df['is_marvel'] = (df['title'].isin(marvel_movies) & franchise_mask).astype(int)
df['is_dc'] = (df['title'].str.contains('Batman|Superman|Wonder Woman|Aquaman|Justice League', case=False, na=False) & franchise_mask).astype(int)

# Exact list of legitimate Star Wars films
star_wars_titles = [
    "Star Wars: Episode IV - A New Hope",
    "Star Wars: Episode V - The Empire Strikes Back", 
    "Star Wars: Episode VI - Return of the Jedi",
    "Star Wars: Episode I - The Phantom Menace",
    "Star Wars: Episode II - Attack of the Clones",
    "Star Wars: Episode III - Revenge of the Sith",
    "Star Wars: Episode VII - The Force Awakens",
    "Star Wars: Episode VIII - The Last Jedi",
    "Star Wars: Episode IX - The Rise of Skywalker",
    "Rogue One: A Star Wars Story",
    "Solo: A Star Wars Story",
    "The Mandalorian & Grogu",
    "Star Wars: Starfighter",
    "Star Wars: New Jedi Order",
    "Dave Filoni's New Republic Film",
    "Simon Kinberg's New Trilogy"
]

df['is_star_wars'] = (df['title'].isin(star_wars_titles) & franchise_mask).astype(int)

df['is_fast_furious'] = (df['title'].str.contains('Fast|Furious', case=False, na=False) & franchise_mask).astype(int)
df['is_harry_potter'] = (df['title'].str.contains('Harry Potter|Hogwarts|Fantastic Beasts', case=False, na=False) & franchise_mask).astype(int)

df['is_sequel'] = (df['is_sequel_title'] | df['is_marvel'] | df['is_dc'] | df['is_star_wars'] | df['is_fast_furious'] | df['is_harry_potter']).astype(int)

print("Sequel analysis (US/UK movies only, excluding documentaries):")
print(f"Title sequel indicators: {df['is_sequel_title'].sum()}")
print(f"Marvel movies: {df['is_marvel'].sum()}")
print(f"DC movies: {df['is_dc'].sum()}")
print(f"Star Wars movies: {df['is_star_wars'].sum()}")
print(f"Fast & Furious movies: {df['is_fast_furious'].sum()}")
print(f"Harry Potter movies: {df['is_harry_potter'].sum()}")
print(f"Total sequels/franchise: {df['is_sequel'].sum()} ({df['is_sequel'].mean()*100:.1f}%)")

Sequel analysis (US/UK movies only, excluding documentaries):
Title sequel indicators: 662
Marvel movies: 22
DC movies: 12
Star Wars movies: 2
Fast & Furious movies: 5
Harry Potter movies: 4
Total sequels/franchise: 685 (14.4%)


In [113]:
# Show all movies marked as marvel
marvel_movies = df[df['is_marvel'] == 1][['title', 'release_date', 'primary_country', 'revenue', 'budget']].sort_values('release_date')

print(f"All {len(marvel_movies)} movies marked as Marvel:")
print(marvel_movies.to_string(index=False))

All 22 movies marked as Marvel:
                                      title release_date          primary_country    revenue    budget
                    Avengers: Age of Ultron   2015-04-22 United States of America 1405403694 365000000
                                    Ant-Man   2015-07-14 United States of America  519311965 130000000
                 Captain America: Civil War   2016-04-27 United States of America 1155046416 250000000
                             Doctor Strange   2016-10-25 United States of America  676343174 180000000
             Guardians of the Galaxy Vol. 2   2017-04-19 United States of America  863756051 200000000
                     Spider-Man: Homecoming   2017-07-05 United States of America  880166924 175000000
                             Thor: Ragnarok   2017-10-02 United States of America  855301806 180000000
                              Black Panther   2018-02-13 United States of America 1349926083 200000000
                     Avengers: Infinity W

In [114]:
# Revenue analysis
has_revenue = df['revenue'] > 0
print(f"Movies with revenue data: {has_revenue.sum()} out of {len(df)} ({has_revenue.mean()*100:.1f}%)")

print("\nTop 10 highest grossing movies:")
top_movies = df[has_revenue].nlargest(10, 'revenue')[['title', 'revenue', 'budget', 'primary_studio']]
print(top_movies.to_string())

print("\nRevenue by major studios:")
major_studio_names = ['Disney', 'Warner Bros', 'Universal', 'Sony', 'Paramount', 'Fox', 'MGM', 'Lionsgate']
major_studio_movies = df[(df['revenue'] > 0) & (df['primary_studio'].isin(major_studio_names))]

if len(major_studio_movies) > 0:
    revenue_by_studio = major_studio_movies.groupby('primary_studio')['revenue'].agg(['mean', 'count']).sort_values('mean', ascending=False)
    print(revenue_by_studio)
else:
    print("No major studio movies with revenue data found")

Movies with revenue data: 4767 out of 4767 (100.0%)

Top 10 highest grossing movies:
                               title     revenue     budget        primary_studio
99038              TikTok Rizz Party  3000000000  250000000           Warner Bros
123154       Adventures in Bora Bora  3000000000  800000000               Unknown
2                  Avengers: Endgame  2800000000  356000000                Disney
77          Avatar: The Way of Water  2320250281  460000000  20th Century Studios
18      Star Wars: The Force Awakens  2068223624  245000000                Disney
1             Avengers: Infinity War  2052415039  300000000                Disney
229307                      IPL 2025  2000000000  312000000           TATA Studio
238517                   Best Of Joy  2000000000       1000            Siba Media
19           Spider-Man: No Way Home  1921847111  200000000                Disney
14                    Jurassic World  1671537444  150000000             Universal

Revenue by m

## Final Dataset Summary

In [None]:
# Feature summary
binary_features = [col for col in df.columns if col.startswith('is_')]
print(f"Dataset shape: {df.shape}")
print(f"Binary features created: {len(binary_features)}")

print("\nFeature summary:")
for feature in binary_features:
    count = df[feature].sum()
    if count > 50:
        print(f"{feature}: {count} ({count/len(df)*100:.1f}%)")

# Save processed dataset
df.to_csv("../data/dataset_processed.csv", index=False)
print(f"\nProcessed dataset saved with {df.shape[1]} features")