# Data Exploration and Feature Engineering

In [44]:
import pandas as pd
import json
import re
import numpy as np

In [45]:
df = pd.read_csv("../data/dataset.csv")

print(f"Dataset shape): {df.shape}")
df.head()

Dataset shape): (2241, 14)


Unnamed: 0,id,imdb_id,title,release_date,runtime,budget,revenue,adult,spoken_languages,genres,production_companies,production_countries,keywords,release_year
0,293660,tt1431045,Deadpool,2016-02-09,108,58000000,783100000,False,English,"Action, Adventure, Comedy","20th Century Fox, The Donners' Company, Genre ...",United States of America,"superhero, anti hero, mercenary, based on comi...",2016
1,299536,tt4154756,Avengers: Infinity War,2018-04-25,149,300000000,2052415039,False,"English, Xhosa","Adventure, Action, Science Fiction",Marvel Studios,United States of America,"sacrifice, magic, superhero, based on comic, s...",2018
2,299534,tt4154796,Avengers: Endgame,2019-04-24,181,356000000,2800000000,False,"English, Japanese, Xhosa","Adventure, Science Fiction, Action",Marvel Studios,United States of America,"superhero, time travel, space travel, time mac...",2019
3,475557,tt7286456,Joker,2019-10-01,122,55000000,1074458282,False,English,"Crime, Thriller, Drama","Warner Bros. Pictures, Joint Effort, Village R...","Canada, United States of America","dream, street gang, society, psychopath, clown...",2019
4,99861,tt2395427,Avengers: Age of Ultron,2015-04-22,141,365000000,1405403694,False,English,"Action, Adventure, Science Fiction",Marvel Studios,United States of America,"artificial intelligence (a.i.), superhero, bas...",2015


## Top 10 Highest Growssing Movies

In [46]:
print("Top 10 highest grossing movies:")
top_movies = df.nlargest(10, 'revenue')[['title', 'revenue', 'budget']]
display(top_movies)

Top 10 highest grossing movies:


Unnamed: 0,title,revenue,budget
1723,Adventures in Bora Bora,3000000000,800000000
2,Avengers: Endgame,2800000000,356000000
76,Avatar: The Way of Water,2320250281,460000000
18,Star Wars: The Force Awakens,2068223624,245000000
1,Avengers: Infinity War,2052415039,300000000
1986,IPL 2025,2000000000,312000000
2007,Best Of Joy,2000000000,1000
19,Spider-Man: No Way Home,1921847111,200000000
14,Jurassic World,1671537444,150000000
89,The Lion King,1663075401,260000000


## Production Companies Analysis

In [42]:
# Check production companies format
df['production_companies'].head(10)

0    20th Century Fox, The Donners' Company, Genre ...
1                                       Marvel Studios
2                                       Marvel Studios
3    Warner Bros. Pictures, Joint Effort, Village R...
4                                       Marvel Studios
5                                       Marvel Studios
6                                       Marvel Studios
7                                       Marvel Studios
8    Kennedy Miller Productions, Warner Bros. Pictu...
9    Marvel Studios, Pascal Pictures, LStar Capital...
Name: production_companies, dtype: object

In [None]:
# Define major studios and subsidiaries (ordered by revenue/market share, highest first)
major_studios = {
    'Disney': ['Disney', 'Marvel Studios', 'Lucasfilm', 'Pixar', 'Touchstone'],
    'Warner Bros': ['Warner Bros', 'DC Films'],
    'Fox': ['20th Century Fox', 'Blue Sky', '20th Century Studios'],
    'Universal': ['Universal', 'Illumination'],
    'Paramount': ['Paramount', 'MTV Films'],
    'Sony': ['Sony', 'Columbia', 'Screen Gems', 'TriStar'],
    'MGM': ['MGM', 'United Artists']
}

# Define mid-tier studios (checked after major studios, includes subsidiaries with distinct branding)
midtier_studios = {
    'Lionsgate': ['Lionsgate', 'Summit Entertainment'],
    'A24': ['A24'],
    'Focus Features': ['Focus Features'],
    'Searchlight Pictures': ['Searchlight Pictures', 'Fox Searchlight'],
    'New Line Cinema': ['New Line Cinema'],
    'Blumhouse': ['Blumhouse Productions', 'Blumhouse'],
    'STX Films': ['STX Films', 'STX Entertainment'],
    'Annapurna': ['Annapurna Pictures'],
    'NEON': ['NEON'],
    'Amazon Studios': ['Amazon Studios'],
    'Netflix': ['Netflix', 'Netflix International Pictures'],
    'Apple': ['Apple TV+', 'Apple Studios'],
    'Legendary': ['Legendary Pictures', 'Legendary Entertainment'],
    'The Weinstein Company': ['The Weinstein Company', 'Weinstein'],
    'DreamWorks': ['DreamWorks Pictures', 'DreamWorks'],
    'Scott Free': ['Scott Free Productions'],
    'Plan B': ['Plan B Entertainment'],
    'Participant': ['Participant', 'Participant Media']
}

def get_primary_studio(companies):
    if pd.isna(companies):
        return 'Unknown'
    
    companies_lower = companies.lower()
    
    # Check major studios first
    for studio, keywords in major_studios.items():
        for keyword in keywords:
            if keyword.lower() in companies_lower:
                return studio
    
    # Check mid-tier studios second
    for studio, keywords in midtier_studios.items():
        for keyword in keywords:
            if keyword.lower() in companies_lower:
                return studio
    
    # Fallback to first company
    first_company = companies.split(',')[0].strip()
    return first_company if first_company else 'Unknown'

df['primary_studio'] = df['production_companies'].apply(get_primary_studio)

print("Top studios:")
print(df['primary_studio'].value_counts().head(50))

In [38]:
# Filter for A24 movies (anywhere in production companies)
a24_movies = df[df['production_companies'].str.contains('A24', case=False, na=False)]

print(f"A24 movies ({len(a24_movies)}):")
print(a24_movies[['title', 'release_date', 'revenue', 'budget', 'genres']].to_string(index=False))

A24 movies (16):
                            title release_date   revenue    budget                             genres
                        Moonlight   2016-10-21  65046687   4000000                              Drama
Everything Everywhere All at Once   2022-03-24 139200000  25000000 Action, Adventure, Science Fiction
                   The Lighthouse   2019-10-18  18129854  11000000   Drama, Fantasy, Thriller, Horror
                       Uncut Gems   2019-08-30  50020902  19000000             Drama, Thriller, Crime
                        The Whale   2022-12-09  55101305   3000000                              Drama
                                X   2022-03-17  14779858  10000000                             Horror
                It Comes at Night   2017-06-09  19280931   5000000             Drama, Horror, Mystery
                 The Green Knight   2021-07-29  18888418  15000000          Adventure, Drama, Fantasy
                           mid90s   2018-10-19   6897446    25815

In [22]:
# Studio boolean flags
df['is_disney'] = df['production_companies'].str.contains('Disney|Marvel Studios|Lucasfilm|Pixar|Touchstone', case=False, na=False).astype(int)
df['is_warner_bros'] = df['production_companies'].str.contains('Warner Bros|New Line Cinema|DC Films', case=False, na=False).astype(int)
df['is_universal'] = df['production_companies'].str.contains('Universal|Focus Features|Illumination', case=False, na=False).astype(int)
df['is_sony'] = df['production_companies'].str.contains('Sony|Columbia|Screen Gems|TriStar', case=False, na=False).astype(int)
df['is_paramount'] = df['production_companies'].str.contains('Paramount|MTV Films', case=False, na=False).astype(int)
df['is_fox'] = df['production_companies'].str.contains('20th Century Fox|Fox Searchlight|Blue Sky', case=False, na=False).astype(int)
df['is_mgm'] = df['production_companies'].str.contains('MGM|United Artists', case=False, na=False).astype(int)
df['is_lionsgate'] = df['production_companies'].str.contains('Lionsgate|Summit Entertainment', case=False, na=False).astype(int)

df['is_major_studio'] = (df['is_disney'] | df['is_warner_bros'] | df['is_universal'] | 
                         df['is_sony'] | df['is_paramount'] | df['is_fox'] | 
                         df['is_mgm'] | df['is_lionsgate']).astype(int)

print("Studio counts:")
print(f"Disney: {df['is_disney'].sum()}")
print(f"Warner Bros: {df['is_warner_bros'].sum()}")
print(f"Universal: {df['is_universal'].sum()}")
print(f"Sony: {df['is_sony'].sum()}")
print(f"Paramount: {df['is_paramount'].sum()}")
print(f"Fox: {df['is_fox'].sum()}")
print(f"MGM: {df['is_mgm'].sum()}")
print(f"Lionsgate: {df['is_lionsgate'].sum()}")
print(f"Major studio total: {df['is_major_studio'].sum()} ({df['is_major_studio'].mean()*100:.1f}%)")

Studio counts:
Disney: 71
Warner Bros: 123
Universal: 155
Sony: 110
Paramount: 72
Fox: 67
MGM: 1
Lionsgate: 91
Major studio total: 676 (28.7%)


## Genre Analysis

In [23]:
# Examine all unique genres
print("Exploring all unique genres in the dataset:")

# Split and explode genres to get individual genre counts
genres_exploded = df['genres'].dropna().str.split(',').explode().str.strip()
unique_genres = genres_exploded.value_counts()

print(f"Total unique genres: {len(unique_genres)}")
print(f"Total genre entries (including duplicates): {len(genres_exploded)}")
print("\nAll genres sorted by frequency:")
print(unique_genres)

Exploring all unique genres in the dataset:
Total unique genres: 19
Total genre entries (including duplicates): 5701

All genres sorted by frequency:
genres
Drama              963
Comedy             707
Action             603
Thriller           543
Adventure          404
Horror             385
Crime              267
Science Fiction    247
Mystery            231
Fantasy            220
Family             218
Romance            213
Documentary        169
Animation          167
History            142
Music              104
War                 74
Western             31
TV Movie            13
Name: count, dtype: int64


In [24]:
# One-hot encode all genres
print("Creating one-hot encoded features for all genres:")

# Get all unique genres
genres_exploded = df['genres'].dropna().str.split(',').explode().str.strip()
all_genres = genres_exploded.value_counts().index.tolist()

# Create boolean flag for each genre
genre_columns = []
for genre in all_genres:
    col_name = f'genre_{genre.lower().replace(" ", "_").replace("-", "_")}'
    df[col_name] = df['genres'].str.contains(genre, case=False, na=False).astype(int)
    genre_columns.append(col_name)

print(f"Created {len(genre_columns)} genre features:")
print("\nGenre feature counts:")
for i, col in enumerate(genre_columns):
    count = df[col].sum()
    original_genre = all_genres[i]
    print(f"{original_genre}: {count}")
    
print(f"\nGenre columns created: {genre_columns[:10]}..." if len(genre_columns) > 10 else f"\nGenre columns created: {genre_columns}")

Creating one-hot encoded features for all genres:
Created 19 genre features:

Genre feature counts:
Drama: 963
Comedy: 707
Action: 603
Thriller: 543
Adventure: 404
Horror: 385
Crime: 267
Science Fiction: 247
Mystery: 231
Fantasy: 220
Family: 218
Romance: 213
Documentary: 169
Animation: 167
History: 142
Music: 104
War: 74
Western: 31
TV Movie: 13

Genre columns created: ['genre_drama', 'genre_comedy', 'genre_action', 'genre_thriller', 'genre_adventure', 'genre_horror', 'genre_crime', 'genre_science_fiction', 'genre_mystery', 'genre_fantasy']...


In [25]:
# Genre combinations using new one-hot encoded features
df['is_action_adventure'] = (df['genre_action'] & df['genre_adventure']).astype(int)
df['is_action_sci_fi'] = (df['genre_action'] & df['genre_science_fiction']).astype(int)
df['is_comedy_romance'] = (df['genre_comedy'] & df['genre_romance']).astype(int)
df['is_family_animation'] = (df['genre_family'] & df['genre_animation']).astype(int)

print("Popular genre combinations:")
print(f"Action + Adventure: {df['is_action_adventure'].sum()}")
print(f"Action + Sci-Fi: {df['is_action_sci_fi'].sum()}")
print(f"Comedy + Romance: {df['is_comedy_romance'].sum()}")
print(f"Family + Animation: {df['is_family_animation'].sum()}")

Popular genre combinations:
Action + Adventure: 210
Action + Sci-Fi: 144
Comedy + Romance: 84
Family + Animation: 106


In [26]:
# Primary genre and genre count
df['primary_genre'] = df['genres'].str.split(',').str[0].fillna('Unknown')
df['genre_count'] = df['genres'].str.count(',').fillna(-1) + 1

print("Top genres:")
print(df['primary_genre'].value_counts().head(10))
print(f"\nAverage genres per movie: {df['genre_count'].mean():.2f}")
print(f"Max genres in one movie: {df['genre_count'].max()}")

Top genres:
primary_genre
Drama              486
Action             359
Comedy             353
Horror             238
Documentary        124
Thriller           122
Animation           95
Adventure           92
Science Fiction     74
Crime               69
Name: count, dtype: int64

Average genres per movie: 2.42
Max genres in one movie: 9.0


## Country Analysis

In [27]:
# Primary country and country flags
df['primary_country'] = df['production_countries'].str.split(',').str[0].fillna('Unknown')
df['is_usa'] = df['production_countries'].str.contains('United States', case=False, na=False).astype(int)
df['is_uk'] = df['production_countries'].str.contains('United Kingdom|UK', case=False, na=False).astype(int)
df['is_us_uk'] = (df['is_usa'] | df['is_uk']).astype(int)

print("Top countries:")
print(df['primary_country'].value_counts().head(10))
print(f"\nCountry flags:")
print(f"US movies: {df['is_usa'].sum()} ({df['is_usa'].mean()*100:.1f}%)")
print(f"UK movies: {df['is_uk'].sum()} ({df['is_uk'].mean()*100:.1f}%)")
print(f"US/UK movies: {df['is_us_uk'].sum()} ({df['is_us_uk'].mean()*100:.1f}%)")

Top countries:
primary_country
United States of America    1060
Unknown                      288
United Kingdom               212
Canada                       145
India                         67
China                         63
Australia                     62
France                        51
Brazil                        33
Belgium                       32
Name: count, dtype: int64

Country flags:
US movies: 1515 (64.4%)
UK movies: 351 (14.9%)
US/UK movies: 1669 (71.0%)


In [28]:
# Language analysis
df['is_english'] = df['spoken_languages'].str.contains('English', case=False, na=False).astype(int)
df['language_count'] = df['spoken_languages'].str.count(',').fillna(-1) + 1
df['is_multilingual'] = (df['language_count'] > 1).astype(int)

print("Language distribution:")
print(f"English movies: {df['is_english'].sum()} ({df['is_english'].mean()*100:.1f}%)")
print(f"Multilingual movies: {df['is_multilingual'].sum()} ({df['is_multilingual'].mean()*100:.1f}%)")
print(f"Average languages per movie: {df['language_count'].mean():.2f}")

Language distribution:
English movies: 2352 (100.0%)
Multilingual movies: 696 (29.6%)
Average languages per movie: 1.49


## Sequel and IP Detection

In [29]:
# Sequel detection (US/UK movies only, excluding documentaries)
sequel_patterns = r'\b(?:2|3|4|5|II|III|IV|V|Part|Chapter|Episode|Returns|Rises|Begins)\b|:'
df['is_sequel_title'] = df['title'].str.contains(sequel_patterns, case=False, na=False).astype(int)

# Only count franchise movies if they are US/UK productions and NOT documentaries
us_uk_mask = df['is_us_uk'] == 1
not_documentary_mask = ~df['genres'].str.contains('Documentary', case=False, na=False)
franchise_mask = us_uk_mask & not_documentary_mask

# Exact list of legitimate Marvel films
marvel_movies = [
    # Phase One
    "Iron Man",
    "The Incredible Hulk", 
    "Iron Man 2",
    "Thor",
    "Captain America: The First Avenger",
    "The Avengers",
    
    # Phase Two
    "Iron Man 3",
    "Thor: The Dark World",
    "Captain America: The Winter Soldier",
    "Guardians of the Galaxy",
    "Avengers: Age of Ultron",
    "Ant-Man",
    
    # Phase Three
    "Captain America: Civil War",
    "Doctor Strange",
    "Guardians of the Galaxy Vol. 2",
    "Spider-Man: Homecoming",
    "Thor: Ragnarok",
    "Black Panther",
    "Avengers: Infinity War",
    "Ant-Man and the Wasp",
    "Captain Marvel",
    "Avengers: Endgame",
    "Spider-Man: Far From Home",
    
    # Phase Four
    "Black Widow",
    "Shang-Chi and the Legend of the Ten Rings",
    "Eternals",
    "Spider-Man: No Way Home",
    "Doctor Strange in the Multiverse of Madness",
    "Thor: Love and Thunder",
    "Black Panther: Wakanda Forever",
    
    # Phase Five
    "Ant-Man and the Wasp: Quantumania",
    "Guardians of the Galaxy Vol. 3",
    "The Marvels",
    "Deadpool & Wolverine",
    "Captain America: Brave New World",
    "Thunderbolts*",
    "The Fantastic Four: First Steps",
    
    # Phase Six and Beyond
    "Spider-Man: Brand New Day",
    "Avengers: Doomsday",
    "Avengers: Secret Wars",

]

df['is_marvel'] = (df['title'].isin(marvel_movies) & franchise_mask).astype(int)
df['is_dc'] = (df['title'].str.contains('Batman|Superman|Wonder Woman|Aquaman|Justice League', case=False, na=False) & franchise_mask).astype(int)

# Exact list of legitimate Star Wars films
star_wars_titles = [
    "Star Wars: Episode IV - A New Hope",
    "Star Wars: Episode V - The Empire Strikes Back", 
    "Star Wars: Episode VI - Return of the Jedi",
    "Star Wars: Episode I - The Phantom Menace",
    "Star Wars: Episode II - Attack of the Clones",
    "Star Wars: Episode III - Revenge of the Sith",
    "Star Wars: Episode VII - The Force Awakens",
    "Star Wars: Episode VIII - The Last Jedi",
    "Star Wars: Episode IX - The Rise of Skywalker",
    "Rogue One: A Star Wars Story",
    "Solo: A Star Wars Story",
    "The Mandalorian & Grogu",
    "Star Wars: Starfighter",
    "Star Wars: New Jedi Order",
    "Dave Filoni's New Republic Film",
    "Simon Kinberg's New Trilogy"
]

df['is_star_wars'] = (df['title'].isin(star_wars_titles) & franchise_mask).astype(int)

df['is_fast_furious'] = (df['title'].str.contains('Fast|Furious', case=False, na=False) & franchise_mask).astype(int)
df['is_harry_potter'] = (df['title'].str.contains('Harry Potter|Hogwarts|Fantastic Beasts', case=False, na=False) & franchise_mask).astype(int)

df['is_sequel'] = (df['is_sequel_title'] | df['is_marvel'] | df['is_dc'] | df['is_star_wars'] | df['is_fast_furious'] | df['is_harry_potter']).astype(int)

print("Sequel analysis (US/UK movies only, excluding documentaries):")
print(f"Title sequel indicators: {df['is_sequel_title'].sum()}")
print(f"Marvel movies: {df['is_marvel'].sum()}")
print(f"DC movies: {df['is_dc'].sum()}")
print(f"Star Wars movies: {df['is_star_wars'].sum()}")
print(f"Fast & Furious movies: {df['is_fast_furious'].sum()}")
print(f"Harry Potter movies: {df['is_harry_potter'].sum()}")
print(f"Total sequels/franchise: {df['is_sequel'].sum()} ({df['is_sequel'].mean()*100:.1f}%)")

Sequel analysis (US/UK movies only, excluding documentaries):
Title sequel indicators: 348
Marvel movies: 22
DC movies: 11
Star Wars movies: 2
Fast & Furious movies: 5
Harry Potter movies: 3
Total sequels/franchise: 370 (15.7%)


In [30]:
# Show all movies marked as marvel
marvel_movies = df[df['is_marvel'] == 1][['title', 'release_date', 'primary_country', 'revenue', 'budget']].sort_values('release_date')

print(f"All {len(marvel_movies)} movies marked as Marvel:")
print(marvel_movies.to_string(index=False))

All 22 movies marked as Marvel:
                                      title release_date          primary_country    revenue    budget
                    Avengers: Age of Ultron   2015-04-22 United States of America 1405403694 365000000
                                    Ant-Man   2015-07-14 United States of America  519311965 130000000
                 Captain America: Civil War   2016-04-27 United States of America 1155046416 250000000
                             Doctor Strange   2016-10-25 United States of America  676343174 180000000
             Guardians of the Galaxy Vol. 2   2017-04-19 United States of America  863756051 200000000
                     Spider-Man: Homecoming   2017-07-05 United States of America  880166924 175000000
                             Thor: Ragnarok   2017-10-02 United States of America  855301806 180000000
                              Black Panther   2018-02-13 United States of America 1349926083 200000000
                     Avengers: Infinity W

In [31]:
# Revenue analysis
has_revenue = df['revenue'] > 0
print(f"Movies with revenue data: {has_revenue.sum()} out of {len(df)} ({has_revenue.mean()*100:.1f}%)")

print("\nTop 10 highest grossing movies:")
top_movies = df[has_revenue].nlargest(10, 'revenue')[['title', 'revenue', 'budget', 'primary_studio']]
print(top_movies.to_string())

print("\nRevenue by major studios:")
major_studio_names = ['Disney', 'Warner Bros', 'Universal', 'Sony', 'Paramount', 'Fox', 'MGM', 'Lionsgate']
major_studio_movies = df[(df['revenue'] > 0) & (df['primary_studio'].isin(major_studio_names))]

if len(major_studio_movies) > 0:
    revenue_by_studio = major_studio_movies.groupby('primary_studio')['revenue'].agg(['mean', 'count']).sort_values('mean', ascending=False)
    print(revenue_by_studio)
else:
    print("No major studio movies with revenue data found")

Movies with revenue data: 2352 out of 2352 (100.0%)

Top 10 highest grossing movies:
                             title     revenue     budget        primary_studio
1804       Adventures in Bora Bora  3000000000  800000000               Unknown
2                Avengers: Endgame  2800000000  356000000                Disney
76        Avatar: The Way of Water  2320250281  460000000  20th Century Studios
18    Star Wars: The Force Awakens  2068223624  245000000                Disney
1           Avengers: Infinity War  2052415039  300000000                Disney
2082                      IPL 2025  2000000000  312000000           TATA Studio
2106                   Best Of Joy  2000000000       1000            Siba Media
19         Spider-Man: No Way Home  1921847111  200000000                Disney
14                  Jurassic World  1671537444  150000000             Universal
89                   The Lion King  1663075401  260000000                Disney

Revenue by major studios:
        

## Final Dataset Summary

In [32]:
# Feature summary
binary_features = [col for col in df.columns if col.startswith('is_')]
print(f"Dataset shape: {df.shape}")
print(f"Binary features created: {len(binary_features)}")

print("\nFeature summary:")
for feature in binary_features:
    count = df[feature].sum()
    if count > 50:
        print(f"{feature}: {count} ({count/len(df)*100:.1f}%)")

# Save processed dataset
df.to_csv("../data/dataset_processed.csv", index=False)
print(f"\nProcessed dataset saved with {df.shape[1]} features")

Dataset shape: (2352, 63)
Binary features created: 25

Feature summary:
is_disney: 71 (3.0%)
is_warner_bros: 123 (5.2%)
is_universal: 155 (6.6%)
is_sony: 110 (4.7%)
is_paramount: 72 (3.1%)
is_fox: 67 (2.8%)
is_lionsgate: 91 (3.9%)
is_major_studio: 676 (28.7%)
is_action_adventure: 210 (8.9%)
is_action_sci_fi: 144 (6.1%)
is_comedy_romance: 84 (3.6%)
is_family_animation: 106 (4.5%)
is_usa: 1515 (64.4%)
is_uk: 351 (14.9%)
is_us_uk: 1669 (71.0%)
is_english: 2352 (100.0%)
is_multilingual: 696 (29.6%)
is_sequel_title: 348 (14.8%)
is_sequel: 370 (15.7%)

Processed dataset saved with 63 features


In [None]:
# Filter for A24 movies (anywhere in production companies vs primary studio only)
a24_any = df[df['production_companies'].str.contains('A24', case=False, na=False)]
a24_primary = df[df['primary_studio'] == 'A24']

print(f"A24 movies (anywhere in production companies): {len(a24_any)}")
print(f"A24 movies (primary studio only): {len(a24_primary)}")
print(f"Difference: {len(a24_any) - len(a24_primary)}")

print(f"\nAll A24 movies:")
print(a24_any[['title', 'release_date', 'revenue', 'budget', 'primary_studio', 'production_companies']].to_string(index=False))

print(f"\nA24 movies where A24 is NOT the primary studio:")
not_primary = a24_any[a24_any['primary_studio'] != 'A24']
print(not_primary[['title', 'primary_studio', 'production_companies']].to_string(index=False))