In [8]:
import os

os.listdir('/content/drive/MyDrive/Dataset')

['links_small.csv',
 'keywords.csv',
 'links.csv',
 'credits.csv',
 'movies_metadata.csv',
 'ratings_small.csv',
 'ratings.csv']

In [9]:
import pandas as pd

file_path = '/content/drive/MyDrive/Dataset/movies_metadata.csv'

df = pd.read_csv(file_path, low_memory=False)

df.head()


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [10]:
# Convert budget and revenue to numeric
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df['revenue'] = pd.to_numeric(df['revenue'], errors='coerce')

# Convert release date and extract year
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['year'] = df['release_date'].dt.year

# Drop missing or invalid rows
df = df.dropna(subset=['budget', 'revenue', 'year'])
df = df[(df['budget'] > 0) & (df['revenue'] > 0)]

df[['title', 'budget', 'revenue', 'year']].head()


Unnamed: 0,title,budget,revenue,year
0,Toy Story,30000000.0,373554033.0,1995.0
1,Jumanji,65000000.0,262797249.0,1995.0
3,Waiting to Exhale,16000000.0,81452156.0,1995.0
5,Heat,60000000.0,187436818.0,1995.0
8,Sudden Death,35000000.0,64350171.0,1995.0


In [11]:
# Correlation
df[['budget', 'revenue']].corr()


Unnamed: 0,budget,revenue
budget,1.0,0.730271
revenue,0.730271,1.0


In [12]:
# Top 10 highest revenue movies
df[['title', 'revenue']].sort_values(by='revenue', ascending=False).head(10)


Unnamed: 0,title,revenue
14551,Avatar,2787965000.0
26555,Star Wars: The Force Awakens,2068224000.0
1639,Titanic,1845034000.0
17818,The Avengers,1519558000.0
25084,Jurassic World,1513529000.0
28830,Furious 7,1506249000.0
26558,Avengers: Age of Ultron,1405404000.0
17437,Harry Potter and the Deathly Hallows: Part 2,1342000000.0
22110,Frozen,1274219000.0
42222,Beauty and the Beast,1262886000.0


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

X = df[['budget', 'popularity', 'vote_count', 'year']]
y = df['revenue']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("R² Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


R² Score: 0.6838769631508103
RMSE: 77857879.39065664


In [14]:
import plotly.express as px

fig = px.scatter(
    df,
    x='budget',
    y='revenue',
    hover_name='title',
    title='Budget vs Revenue of Movies',
    labels={'budget': 'Budget (USD)', 'revenue': 'Revenue (USD)'}
)

fig.show()


In [15]:
import numpy as np

df['log_budget'] = np.log1p(df['budget'])
df['log_revenue'] = np.log1p(df['revenue'])

fig = px.scatter(
    df,
    x='log_budget',
    y='log_revenue',
    title='Log Budget vs Log Revenue',
    labels={'log_budget': 'Log Budget', 'log_revenue': 'Log Revenue'}
)

fig.show()


In [16]:
fig = px.histogram(
    df,
    x='revenue',
    nbins=50,
    title='Distribution of Movie Revenue'
)

fig.show()


In [17]:
movies_per_year = df.groupby('year').size().reset_index(name='count')

fig = px.line(
    movies_per_year,
    x='year',
    y='count',
    title='Number of Movies Released per Year'
)

fig.show()


In [18]:
top10 = df[['title', 'revenue']].sort_values(
    by='revenue', ascending=False
).head(10)

fig = px.bar(
    top10,
    x='title',
    y='revenue',
    title='Top 10 Highest Grossing Movies (Box Office)',
    labels={'revenue': 'Revenue (USD)', 'title': 'Movie'}
)
fig.show()
