In [1]:
# ==============================
# Notebook: EDA & Feature Engineering
# ==============================

# Cell 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set seaborn's global plot style to "whitegrid"
sns.set(style="whitegrid")

In [2]:

# Cell 2: Resolve the data folders relative to the current working directory
data_root = os.path.join(os.getcwd(), 'data')
data_raw_path = os.path.join(data_root, 'raw')
data_processed_path = os.path.join(data_root, 'processed')

# The output folder must exist before anything is written to it;
# exist_ok=True makes this a no-op when it is already there
os.makedirs(data_processed_path, exist_ok=True)

In [3]:
# Cell 3: Read the four raw CSV files from the raw data folder
# (the generator yields the frames in the same order as the names on the left)
movies, ratings, tags, links = (
    pd.read_csv(os.path.join(data_raw_path, csv_name))
    for csv_name in ("movies.csv", "ratings.csv", "tags.csv", "links.csv")
)

In [4]:
# Cell 4: Print the (rows, columns) shape of every loaded table
for label, frame in (
    ("Movies:", movies),
    ("Ratings:", ratings),
    ("Tags:", tags),
    ("Links:", links),
):
    print(label, frame.shape)

Movies: (9742, 3)
Ratings: (100836, 4)
Tags: (3683, 4)
Links: (9742, 3)


In [5]:
# Cell 5: Print the number of null entries per column for every table
for heading, frame in (
    ("Movies missing values:\n", movies),
    ("Ratings missing values:\n", ratings),
    ("Tags missing values:\n", tags),
    ("Links missing values:\n", links),
):
    print(heading, frame.isnull().sum())

Movies missing values:
 movieId    0
title      0
genres     0
dtype: int64
Ratings missing values:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Tags missing values:
 userId       0
movieId      0
tag          0
timestamp    0
dtype: int64
Links missing values:
 movieId    0
imdbId     0
tmdbId     8
dtype: int64


In [6]:

# Cell 6: Handle missing values
# Rows that lack a key column or the value itself are dropped.
ratings = ratings.dropna(subset=['userId', 'movieId', 'rating'])
tags = tags.dropna(subset=['userId', 'movieId', 'tag'])
movies = movies.dropna(subset=['movieId', 'title'])

# External ids are optional: the row is kept and the gap is marked with a
# sentinel ('unknown' for imdbId, -1 for tmdbId).
# .assign() builds a new frame instead of writing columns into the result of
# dropna(). tmdbId contains NaN in the raw file, so it is a float column;
# once the NaN are replaced it is cast to int so the ids are stored as
# 862 / -1 rather than 862.0 / -1.0.
links = links.dropna(subset=['movieId']).assign(
    imdbId=lambda df: df['imdbId'].fillna('unknown'),
    tmdbId=lambda df: df['tmdbId'].fillna(-1).astype(int),
)

In [7]:
# Cast the identifier columns to int, table by table
id_columns_by_table = (
    (movies, ('movieId',)),
    (ratings, ('movieId', 'userId')),
    (tags, ('movieId', 'userId')),
    (links, ('movieId',)),
)
for table, id_columns in id_columns_by_table:
    for column in id_columns:
        table[column] = table[column].astype(int)

# Ratings are stored as floats
ratings['rating'] = ratings['rating'].astype(float)

In [8]:
# Interpret the timestamps as seconds since the epoch and turn them into
# datetimes; values that cannot be converted become NaT (errors='coerce')
for table in (ratings, tags):
    table['timestamp'] = pd.to_datetime(table['timestamp'], unit='s', errors='coerce')

# Force the text columns to str
for table, text_column in ((movies, 'title'), (movies, 'genres'), (tags, 'tag')):
    table[text_column] = table[text_column].astype(str)

In [9]:

# ==============================
# Cell 7: Feature Engineering - Movie Statistics
# ==============================
# Per-movie mean rating and number of ratings.
movie_stats = ratings.groupby('movieId').agg(
    movie_avg_rating=('rating', 'mean'),
    movie_rating_count=('rating', 'count')
).reset_index()

# Drop statistics left over from an earlier run of this cell, so re-running it
# does not produce movie_avg_rating_x / movie_avg_rating_y style duplicates.
stat_columns = [name for name in movie_stats.columns if name != 'movieId']
movies = movies.drop(columns=stat_columns, errors='ignore').merge(
    movie_stats, on='movieId', how='left'
)

# A movie without any rating has no row in movie_stats, so the left merge
# leaves NaN there. Its rating count is 0, not missing; the average stays NaN
# because it is undefined.
movies['movie_rating_count'] = movies['movie_rating_count'].fillna(0).astype(int)

In [10]:
# ==============================
# Cell 8: Feature Engineering - User Statistics
# ==============================
# One row per user: mean of the ratings given and how many were given
ratings_by_user = ratings.groupby('userId')
user_stats = ratings_by_user.agg(
    user_avg_rating=pd.NamedAgg(column='rating', aggfunc='mean'),
    user_rating_count=pd.NamedAgg(column='rating', aggfunc='count'),
)
# Turn the userId group key back into an ordinary column
user_stats = user_stats.reset_index()

In [11]:
# ==============================
# Cell 9: Feature Engineering - One-Hot Encode Genres
# ==============================
# One row per (movie, genre): a 'A|B' genres string becomes two rows.
movies_exploded = movies.copy()
movies_exploded['genres'] = movies_exploded['genres'].str.split('|')
movies_exploded = movies_exploded.explode('genres')

# dtype=int pins the indicators to 0/1; the default dtype of get_dummies
# depends on the pandas version (bool in 2.x).
genre_dummies = pd.get_dummies(movies_exploded['genres'], prefix='genre', dtype=int)
movies_genres = pd.concat([movies_exploded[['movieId']], genre_dummies], axis=1)
# Collapse back to one row per movie: a flag is 1 if any exploded row had it.
movies_genres = movies_genres.groupby('movieId').max().reset_index()

# Drop genre indicators left over from an earlier run of this cell, so
# re-running it does not produce genre_*_x / genre_*_y duplicates.
genre_columns = [name for name in movies_genres.columns if name != 'movieId']
movies = movies.drop(columns=genre_columns, errors='ignore').merge(
    movies_genres, on='movieId', how='left'
)

In [12]:
# ==============================
# Cell 10: Feature Engineering - Aggregate User Tags
# ==============================
pair_keys = ['userId', 'movieId']

# All tags a user gave a movie, joined into one space-separated string
tags_per_pair = tags.groupby(pair_keys)['tag']
tags_agg = tags_per_pair.apply(' '.join).reset_index()

# Left merge: every rating is kept, with the joined tags where they exist
ratings_with_tags = ratings.merge(tags_agg, how='left', on=pair_keys)

In [13]:
# ==============================
# Cell 11: Save Processed Data
# ==============================
# Output file name -> frame; written in this order, without the index column
processed_outputs = {
    'movies_processed.csv': movies,
    'ratings_processed.csv': ratings,
    'user_stats.csv': user_stats,
    'ratings_with_tags.csv': ratings_with_tags,
    'links_processed.csv': links,
}
for csv_name, frame in processed_outputs.items():
    frame.to_csv(os.path.join(data_processed_path, csv_name), index=False)

print("✅ All processed datasets saved to:", data_processed_path)

✅ All processed datasets saved to: C:\Users\ihebm\Desktop\projet\data\processed
