In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the two raw inputs: the movie metadata table and the ratings table.
# low_memory=False makes pandas parse the metadata file in a single pass
# instead of internal chunks, so column dtypes are inferred consistently.
movies = pd.read_csv(
    '../data/movies_metadata.csv',
    low_memory=False,
)
ratings = pd.read_csv(
    '../data/ratings.csv',
)

In [3]:
# Narrow the movie table to the eight columns kept for the cleaned dataset,
# then discard every row that is missing an id, a budget, or a revenue value.
movies = (
    movies
    .loc[:, ['id', 'title', 'budget', 'revenue', 'genres',
             'vote_average', 'vote_count', 'release_date']]
    .dropna(subset=['id', 'budget', 'revenue'])
)

In [4]:
# Parse budget and revenue as numbers. errors='coerce' turns any entry that
# cannot be parsed into NaN, and those rows are dropped immediately afterwards.
movies = (
    movies
    .assign(
        budget=lambda frame: pd.to_numeric(frame['budget'], errors='coerce'),
        revenue=lambda frame: pd.to_numeric(frame['revenue'], errors='coerce'),
    )
    .dropna(subset=['budget', 'revenue'])
)

In [5]:
# Keep only rows whose budget AND revenue are both strictly positive;
# zero or negative amounts are treated as unrealistic and removed.
movies = movies.loc[movies['budget'].gt(0) & movies['revenue'].gt(0)]

In [6]:
# Aggregate ratings ahead of the merge: one row per movieId carrying the mean
# of its 'rating' values, exposed as the columns ['movieId', 'avg_rating'].
ratings_grouped = (
    ratings
    .groupby('movieId')['rating']
    .mean()
    .rename('avg_rating')
    .reset_index()
)

In [7]:
# Fix ID types for join.
# errors='coerce' converts any id that is not a valid number into NaN. Such
# rows can never match a rating and would force the whole column to float
# (ids written as e.g. "862.0"), so they are dropped -- consistent with the
# earlier dropna on 'id' -- and the key is cast to a plain 64-bit integer.
movies = (
    movies
    .assign(id=lambda frame: pd.to_numeric(frame['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': 'int64'})
)

# NOTE(review): this joins movies['id'] directly to ratings['movieId'], which
# assumes both columns use the same identifier scheme -- TODO confirm against
# the data source before trusting avg_rating.
#
# Left join: every cleaned movie is kept; movies without any rating get
# avg_rating = NaN. validate='many_to_one' makes pandas raise if movieId is
# not unique on the ratings side, so the merge can never multiply movie rows.
merged = pd.merge(
    movies,
    ratings_grouped,
    left_on='id',
    right_on='movieId',
    how='left',
    validate='many_to_one',
)

# Persist the merged table without the DataFrame index column.
merged.to_csv('../data/merged_cleaned_movies.csv', index=False)