In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [21]:
df = pd.read_csv("movie_metadata.csv")
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [None]:
# ------------------------
# 1. Drop columns we don't need
# ------------------------
drop_cols = [
    'movie_title', 'plot_keywords', 'movie_imdb_link', 'color'
]
df = df.drop(columns=drop_cols)

# ------------------------
# 2. Fill missing values
# ------------------------
# Numeric columns
numeric_cols = [
    'budget', 'title_year', 'duration', 'num_critic_for_reviews',
    'num_user_for_reviews', 'facenumber_in_poster'
]
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns
cat_cols = ['genres', 'director_name', 'actor_1_name', 'actor_2_name',
            'actor_3_name', 'content_rating', 'country', 'language']
for col in cat_cols:
    df[col] = df[col].fillna('Unknown')

# ------------------------
# 3. Process multi-label 'genres'
# ------------------------
all_genres = set()
df['genres'] = df['genres'].apply(lambda x: x.split('|'))
for genre_list in df['genres']:
    all_genres.update(genre_list)
all_genres = sorted(list(all_genres))

# Multi-hot encoding
for genre in all_genres:
    df[f'genre_{genre}'] = df['genres'].apply(lambda x: 1 if genre in x else 0)
df = df.drop(columns=['genres'])

# ------------------------
# 4. One-hot encode small categorical features
# ------------------------
one_hot_cols = ['content_rating', 'country', 'language']
df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

# ------------------------
# 5. Encode top directors and actors (top-10 only)
# ------------------------
top_directors = df['director_name'].value_counts().nlargest(10).index
for director in top_directors:
    df[f'director_{director}'] = df['director_name'].apply(lambda x: 1 if x==director else 0)
df = df.drop(columns=['director_name'])

top_actors = []
for col in ['actor_1_name', 'actor_2_name', 'actor_3_name']:
    top_actors += list(df[col].value_counts().nlargest(10).index)
top_actors = list(set(top_actors))
for actor in top_actors:
    for col in ['actor_1_name', 'actor_2_name', 'actor_3_name']:
        df[f'actor_{actor}'] = df[col].apply(lambda x: 1 if x==actor else 0)
df = df.drop(columns=['actor_1_name','actor_2_name','actor_3_name'])

# ------------------------
# 6. Separate features and multi-target output
# ------------------------
X = df.drop(columns=['imdb_score','gross'])
y = df[['imdb_score','gross']].values  # Multi-target

# ------------------------
# 7. Scale numeric features
# ------------------------
scaler_X = StandardScaler()
X[numeric_cols[:-1]] = scaler_X.fit_transform(X[numeric_cols[:-1]])  # exclude gross from X

scaler_y = StandardScaler()
y = scaler_y.fit_transform(y)  # scale both imdb_score and gross

print("Preprocessing complete!")
print(f"Input shape: {X.shape}, Output shape: {y.shape}")


Preprocessing complete!
Input shape: (3755, 161), Output shape: (3755, 2)


In [25]:
X

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,...,actor_Kevin Dunn,actor_Robert De Niro,actor_J.K. Simmons,actor_Craig T. Nelson,actor_Nicolas Cage,actor_Robert Duvall,actor_Scott Glenn,actor_Denzel Washington,actor_Adam Sandler,actor_Kirsten Dunst
0,4.500704,2.991207,0.0,855.0,1000.0,886204,4834,-0.674840,6.607632,0.844000,...,0,0,0,0,0,0,0,0,0,0
1,1.090391,2.593791,563.0,1000.0,40000.0,471220,48350,-0.674840,2.191334,1.122750,...,0,0,0,0,0,0,0,0,0,0
2,3.520542,1.666488,0.0,161.0,11000.0,275868,11700,-0.184984,1.597954,0.879397,...,0,0,0,0,0,0,0,0,0,0
3,5.229749,2.373005,22000.0,23000.0,27000.0,1144337,106759,-0.674840,5.749177,0.901520,...,0,0,0,0,0,0,0,0,0,0
5,2.386472,0.959971,475.0,530.0,640.0,212204,1873,-0.184984,0.975393,0.962137,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5026,-0.699821,-0.011489,107.0,45.0,576.0,3924,776,-0.184984,-0.724493,-0.204609,...,0,0,0,0,0,0,0,0,0,0
5027,-0.837530,-0.894635,397.0,0.0,5.0,4555,5,-0.674840,-0.756107,-0.204585,...,0,0,0,0,0,0,0,0,0,0
5033,-0.197590,-1.468680,291.0,8.0,291.0,72639,368,-0.674840,0.082892,-0.204598,...,0,0,0,0,0,0,0,0,0,0
5035,-0.902334,-1.292051,0.0,6.0,121.0,52055,147,-0.674840,-0.503192,-0.204598,...,0,0,0,0,0,0,0,0,0,0


In [26]:
y

array([[ 1.35950199, 10.06753235],
       [ 0.60172859,  3.65192122],
       [ 0.31756356,  2.09701991],
       ...,
       [ 0.50700691, -0.74241302],
       [ 0.41228524, -0.71942784],
       [ 0.12812021, -0.74724196]])