In [3]:
import numpy as np 
import pandas as pd

In [4]:
# Load the preprocessed movie table from the Kaggle input directory.
df =  pd.read_csv('/kaggle/input/processed-movie-data/processed.csv') 
# Bare last expression: render the frame as this cell's output.
df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7657,Aloha Surf Hotel,Unknown Rating,Comedy,2020,"November 5, 2020 (United States)",7.1,14.0,Stefan C. Schaefer,Stefan C. Schaefer,Augie Tulba,United States,20500000.0,20205757.0,Abominable Pictures,90.0
7658,More to Life,Unknown Rating,Drama,2020,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,United States,7000.0,20205757.0,Unknown Company,90.0
7659,Dream Round,Unknown Rating,Comedy,2020,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,United States,20500000.0,20205757.0,Cactus Blue Entertainment,90.0
7660,Saving Mbango,Unknown Rating,Drama,2020,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,United States,58750.0,20205757.0,Embi Productions,104.0


In [5]:
# Print each column's dtype and non-null count for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7662 entries, 0 to 7661
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7662 non-null   object 
 1   rating    7662 non-null   object 
 2   genre     7662 non-null   object 
 3   year      7662 non-null   int64  
 4   released  7662 non-null   object 
 5   score     7662 non-null   float64
 6   votes     7662 non-null   float64
 7   director  7662 non-null   object 
 8   writer    7662 non-null   object 
 9   star      7662 non-null   object 
 10  country   7662 non-null   object 
 11  budget    7662 non-null   float64
 12  gross     7662 non-null   float64
 13  company   7662 non-null   object 
 14  runtime   7662 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.0+ KB


In [6]:
# Gross-to-budget ratio. Zero budgets are swapped for NaN first so those rows
# end up with a NaN ratio instead of a division-by-zero result.
budget_nonzero = df['budget'].replace(0, np.nan)
df['profit_ratio'] = df['gross'] / budget_nonzero
def categorize_performance(r):
    """Translate a gross/budget ratio into a box-office performance label.

    A missing ratio (NaN/None, which is what a zero budget produces upstream)
    yields "Unknown Performance". Otherwise the ratio is matched against
    ascending, exclusive upper bounds and the first band it fits is returned;
    ratios of 8 or more are "All-Time Blockbuster".
    """
    if pd.isna(r):
        return "Unknown Performance"

    # (exclusive upper bound, label), in ascending order.
    bands = (
        (1, "Flop"),
        (1.5, "Average"),
        (3, "Hit"),
        (5, "Super Hit"),
        (8, "Blockbuster"),
    )
    for upper_bound, label in bands:
        if r < upper_bound:
            return label
    return "All-Time Blockbuster"

# Label every movie with its performance class.
df['performance_class'] = df['profit_ratio'].apply(categorize_performance)

# Drop the rows whose ratio could not be computed. The explicit .copy() makes
# the filtered result an independent frame, so the column assignments that
# follow on `df` never trigger pandas' SettingWithCopyWarning or write into an
# ambiguous view/copy when rows are actually removed.
df = df[df['performance_class'] != "Unknown Performance"].copy()

# Number of rows that survived the filter.
df['performance_class'].count()

np.int64(7662)

In [7]:
def score_category(x):
    """Bucket a numeric score into a quality label.

    Scores are matched against descending inclusive floors (8, 7, 6, 5).
    Anything positive but below 5 is "Poor". Zero, negative and NaN scores
    fail every comparison and end up as "MissingScore".
    """
    # (inclusive lower bound, label), highest tier first.
    tiers = (
        (8, "Excellent"),
        (7, "Very Good"),
        (6, "Good"),
        (5, "Average"),
    )
    for floor, label in tiers:
        if x >= floor:
            return label
    return "Poor" if x > 0 else "MissingScore"
# Derive the score bucket for every movie, one element at a time.
df["score_cat"] = df["score"].map(score_category)

# Preview the first few labels.
df["score_cat"].head()

0    Excellent
1      Average
2    Excellent
3    Very Good
4    Very Good
Name: score_cat, dtype: object

In [8]:
def classify_budget(b):  # thresholds follow the original "US standards" note
    """Classify a production budget into a size tier.

    Parameters
    ----------
    b : number or missing value
        Budget amount. 0, NaN and None are all treated as "not known".

    Returns
    -------
    str
        "unknown Budget" for a zero/missing budget, "Low Budget" below
        5,000,000, "Mid Budget" below 50,000,000, otherwise "High Budget".
    """
    # NaN fails every comparison below, so without this guard a missing budget
    # would silently fall through to "High Budget".
    if pd.isna(b) or b == 0:
        return "unknown Budget"
    if b < 5_000_000:
        return "Low Budget"
    if b < 50_000_000:
        return "Mid Budget"
    return "High Budget"

# Tag every movie with its budget tier, one element at a time.
df["budget_cat"] = df["budget"].map(classify_budget)

# Show the resulting column.
df["budget_cat"]

0       Mid Budget
1       Low Budget
2       Mid Budget
3       Low Budget
4       Mid Budget
           ...    
7657    Mid Budget
7658    Low Budget
7659    Mid Budget
7660    Low Budget
7661    Mid Budget
Name: budget_cat, Length: 7662, dtype: object

In [9]:
# Every performance label that must exist as a pivot column.
# NOTE(review): categorize_performance emits "Unknown Performance", not
# "Unknown", so the "Unknown" column here is always the back-filled zero.
perf_order = ["All-Time Blockbuster", "Blockbuster", "Super Hit", "Hit", "Average", "Flop", "Unknown"]


def _count_by_performance(frame, person_col):
    """Return (long counts, wide pivot) of movies per person and performance class.

    The long table has one row per (person, performance_class) pair with a
    'count' column; the pivot has one row per person and one column per class,
    with absent combinations filled with 0.
    """
    counts = (
        frame.groupby([person_col, 'performance_class'])
        .size()
        .reset_index(name='count')
    )
    pivot = counts.pivot(
        index=person_col, columns='performance_class', values='count'
    ).fillna(0)
    return counts, pivot


dir_counts, dir_pivot = _count_by_performance(df, 'director')
actor_counts, actor_pivot = _count_by_performance(df, 'star')

# Guarantee a column for every label, even when no movie landed in that class.
for pivot_table in (dir_pivot, actor_pivot):
    for label in perf_order:
        if label not in pivot_table.columns:
            pivot_table[label] = 0

# Points contributed by one movie in each performance class.
weights = {
    "All-Time Blockbuster": 15,
    "Blockbuster": 5,
    "Super Hit": 3,
    "Hit": 2,
    "Average": 1,
    "Flop": 0,
    "Unknown": 0,
}

def compute_weighted_score(row, class_weights=None):
    """Return the weighted sum of a row's per-class movie counts.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by performance label (e.g. one pivot-table row
        passed in by ``DataFrame.apply(..., axis=1)``).
    class_weights : dict, optional
        Mapping of performance label -> weight. When omitted, the
        notebook-level ``weights`` table is used, which keeps existing
        single-argument callers working unchanged.

    Returns
    -------
    number
        Sum of ``row[label] * weight`` over every label in the weight table.
        Every label in the table must be present in ``row``.
    """
    if class_weights is None:
        class_weights = weights  # fall back to the notebook-level table
    return sum(row[label] * weight for label, weight in class_weights.items())


# Score every director and lead actor from their per-class movie counts.
director_scores = dir_pivot.apply(compute_weighted_score, axis=1)
actor_scores = actor_pivot.apply(compute_weighted_score, axis=1)
dir_pivot['director_success_score'] = director_scores
actor_pivot['actor_success_score'] = actor_scores

# Map the per-person scores back onto each movie row; names missing from the
# score tables get 0.
df['director_success_score'] = df['director'].map(director_scores).fillna(0)
df['actor_success_score'] = df['star'].map(actor_scores).fillna(0)

# Ten highest-scoring lead actors.
actor_pivot.sort_values(by='actor_success_score', ascending=False).head(10)


performance_class,All-Time Blockbuster,Average,Blockbuster,Flop,Hit,Super Hit,Unknown,actor_success_score
star,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Tom Hanks,5.0,2.0,10.0,5.0,10.0,9.0,0,174.0
Tom Cruise,4.0,1.0,4.0,2.0,12.0,11.0,0,138.0
Harrison Ford,5.0,3.0,1.0,7.0,6.0,4.0,0,107.0
Mel Gibson,3.0,3.0,6.0,6.0,8.0,3.0,0,103.0
Arnold Schwarzenegger,3.0,4.0,3.0,4.0,10.0,3.0,0,93.0
Robin Williams,4.0,2.0,1.0,11.0,7.0,2.0,0,87.0
Kevin Costner,3.0,5.0,2.0,8.0,7.0,4.0,0,86.0
Sylvester Stallone,2.0,3.0,3.0,9.0,9.0,6.0,0,84.0
Jim Carrey,2.0,0.0,4.0,2.0,8.0,6.0,0,84.0
Daniel Radcliffe,4.0,0.0,4.0,4.0,0.0,1.0,0,83.0


In [10]:
def categorize_director(score):
    """Map a director success score to a tier label.

    Inclusive floors are checked from highest to lowest (30, 15, 5); any
    score that clears none of them, including NaN, is "Low-Director".
    """
    tiers = (
        (30, "Legendary-Director"),
        (15, "High-Performer-Director"),
        (5, "Medium-Director"),
    )
    for floor, label in tiers:
        if score >= floor:
            return label
    return "Low-Director"

# Tier label for each movie's director, derived from the director's success score.
df['director_category'] = df['director_success_score'].map(categorize_director)

In [11]:
def categorize_actor(score):
    """Map an actor success score to a tier label.

    Inclusive floors are checked from highest to lowest (40, 20, 8); any
    score that clears none of them, including NaN, is "Low-Actor".
    """
    tiers = (
        (40, "Legendary-Actor"),
        (20, "High-Performer-Actor"),
        (8, "Mid-Actor"),
    )
    for floor, label in tiers:
        if score >= floor:
            return label
    return "Low-Actor"
# Tier label for each movie's lead actor, derived from the actor's success score.
df['actor_category'] = df['actor_success_score'].map(categorize_actor)

In [12]:
# Re-inspect dtypes and non-null counts now that the derived columns were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7662 entries, 0 to 7661
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    7662 non-null   object 
 1   rating                  7662 non-null   object 
 2   genre                   7662 non-null   object 
 3   year                    7662 non-null   int64  
 4   released                7662 non-null   object 
 5   score                   7662 non-null   float64
 6   votes                   7662 non-null   float64
 7   director                7662 non-null   object 
 8   writer                  7662 non-null   object 
 9   star                    7662 non-null   object 
 10  country                 7662 non-null   object 
 11  budget                  7662 non-null   float64
 12  gross                   7662 non-null   float64
 13  company                 7662 non-null   object 
 14  runtime                 7662 non-null   

In [13]:
# Write the enriched table to the working directory; index=False keeps the
# row index out of the CSV.
df.to_csv("processed-movie.csv", index=False)
# Bare last expression: render the final frame as this cell's output.
df


Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,...,company,runtime,profit_ratio,performance_class,score_cat,budget_cat,director_success_score,actor_success_score,director_category,actor_category
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,...,Warner Bros.,146.0,2.473620,Hit,Excellent,Mid Budget,6.0,25.0,Medium-Director,High-Performer-Actor
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,...,Columbia Pictures,104.0,13.078468,All-Time Blockbuster,Average,Low Budget,20.0,17.0,High-Performer-Director,Mid-Actor
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,...,Lucasfilm,124.0,29.909726,All-Time Blockbuster,Excellent,Mid Budget,19.0,30.0,High-Performer-Director,High-Performer-Actor
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,...,Paramount Pictures,88.0,23.843868,All-Time Blockbuster,Very Good,Low Budget,36.0,22.0,Legendary-Director,High-Performer-Actor
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,...,Orion Pictures,98.0,6.641057,Blockbuster,Very Good,Mid Budget,16.0,34.0,High-Performer-Director,High-Performer-Actor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7657,Aloha Surf Hotel,Unknown Rating,Comedy,2020,"November 5, 2020 (United States)",7.1,14.0,Stefan C. Schaefer,Stefan C. Schaefer,Augie Tulba,...,Abominable Pictures,90.0,0.985647,Flop,Very Good,Mid Budget,0.0,0.0,Low-Director,Low-Actor
7658,More to Life,Unknown Rating,Drama,2020,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,...,Unknown Company,90.0,2886.536714,All-Time Blockbuster,Poor,Low Budget,15.0,15.0,High-Performer-Director,Mid-Actor
7659,Dream Round,Unknown Rating,Comedy,2020,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,...,Cactus Blue Entertainment,90.0,0.985647,Flop,Poor,Mid Budget,0.0,0.0,Low-Director,Low-Actor
7660,Saving Mbango,Unknown Rating,Drama,2020,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,...,Embi Productions,104.0,343.927779,All-Time Blockbuster,Average,Low Budget,15.0,15.0,High-Performer-Director,Mid-Actor
