In [1]:
import pandas as pd

df = pd.read_parquet('movie_data.parquet')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99806 entries, 0 to 99805
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   userId             99806 non-null  int64  
 1   rating             99806 non-null  float64
 2   imdb_id            99806 non-null  object 
 3   adult              99806 non-null  object 
 4   genres             99806 non-null  object 
 5   original_language  99806 non-null  object 
 6   overview           99792 non-null  object 
 7   popularity         99806 non-null  float64
 8   poster_path        99800 non-null  object 
 9   release_date       99800 non-null  object 
 10  runtime            99806 non-null  float64
 11  title              99806 non-null  object 
 12  vote_average       99806 non-null  float64
 13  vote_count         99806 non-null  float64
dtypes: float64(5), int64(1), object(8)
memory usage: 10.7+ MB


In [41]:
df.rating.value_counts().sort_index(ascending=False)

rating
5.0    15051
4.5     7701
4.0    28702
3.5    10509
3.0    20039
2.5     4443
2.0     7259
1.5     1685
1.0     3321
0.5     1096
Name: count, dtype: int64

In [4]:
# Extract genere
import ast

genres = ast.literal_eval(df['genres'][0])
genres_name = [genre['name'] for genre in genres]

def extract_genre_names(genre_str):
    try:
        genres = ast.literal_eval(genre_str)
        return [g['name'] for g in genres]
    except (ValueError, SyntaxError):
        return ['None']


df['genres'] = df['genres'].apply(extract_genre_names)
df[['title', 'genres']].head()

# Combine overview and genre
df['genres_str'] = df['genres'].apply(lambda x: '|'.join(x))


In [5]:
all_genres = df['genres_str'].str.split('|').explode()
genre_list = all_genres.unique().tolist()
genre_list = [g for g in genre_list if g != '']
genre_list

['Animation',
 'Comedy',
 'Family',
 'Adventure',
 'Fantasy',
 'Romance',
 'Drama',
 'Action',
 'Crime',
 'Thriller',
 'Horror',
 'History',
 'Science Fiction',
 'Mystery',
 'War',
 'Foreign',
 'Music',
 'Documentary',
 'Western',
 'TV Movie']

In [6]:
genre2idx = {genre: idx for idx, genre in enumerate(genre_list)}

def multi_hot_encode(genres_str, genre2idx):
    vector = [0] * len(genre2idx)
    for genre in genres_str.split('|'):
        if genre in genre2idx:
            vector[genre2idx[genre]] = 1
    return vector

df['genre_vector'] = df['genres_str'].apply(lambda x: multi_hot_encode(x, genre2idx))


In [7]:
df['genres_str'].value_counts()

Unnamed: 0_level_0,count
genres_str,Unnamed: 1_level_1
Drama,6122
Comedy,5070
Comedy|Drama|Romance,3212
Comedy|Drama,2917
Drama|Romance,2729
...,...
History|Romance|Drama,1
TV Movie|Comedy|Family|Fantasy,1
Animation|Family|Fantasy|Music,1
Thriller|Action,1


In [8]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

user_enc = LabelEncoder()
itemr_enc = LabelEncoder()
scaler = StandardScaler()

df['user'] = user_enc.fit_transform(df['userId'])
df['title_enc'] = itemr_enc.fit_transform(df['title'])

df[['vote_avg', 'count','popularity_scaled']] = scaler.fit_transform(df[['vote_average', 'vote_count','popularity']])


In [9]:
from torch.utils.data import Dataset
import torch

class RatingsDataset(Dataset):
    def __init__(self, df):

        self.users = torch.tensor(df['user'].values, dtype=torch.long)
        self.items = torch.tensor(df['title_enc'].values, dtype=torch.long)
        self.vote_avg = torch.tensor(df['vote_avg'].values, dtype=torch.float32)
        self.count = torch.tensor(df['count'].values, dtype=torch.float32)
        self.popularity = torch.tensor(df['popularity_scaled'].values, dtype=torch.float32)
        self.genre = torch.tensor(df['genre_vector'].tolist(), dtype=torch.float32)
        self.y = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return {
            'users': self.users[idx],
            'items': self.items[idx],
            'vote_avg': self.vote_avg[idx],
            'count': self.count[idx],
            'popularity': self.popularity[idx],
            'genre': self.genre[idx],
            'y': self.y[idx]
        }

In [10]:
from torch.utils.data import random_split
from torch.utils.data import DataLoader

dataset = RatingsDataset(df)

# Split into train/test
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

In [11]:
import torch.nn as nn

class RecSysModel(nn.Module):
    def __init__(self, n_users, n_items, n_genres, embedding_dim=32):
        super(RecSysModel, self).__init__()

        # Embedding layers
        self.user_embed = nn.Embedding(n_users, embedding_dim)
        self.item_embed = nn.Embedding(n_items, embedding_dim)

        # Numerical features (vote_avg, count, popularity)
        self.extra_features = nn.Sequential(
            nn.Linear(3, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Genre multi-hot features
        self.genre_features = nn.Sequential(
            nn.Linear(n_genres, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Final MLP
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2 + 32 + 32, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(64, 1),
        )

    def forward(self, user, item, vote_avg, count, popularity, genre):
        user_emb = self.user_embed(user)
        item_emb = self.item_embed(item)

        x_embed = torch.cat([user_emb, item_emb], dim=1)
        x_num = torch.cat([vote_avg.unsqueeze(1), count.unsqueeze(1), popularity.unsqueeze(1)], dim=1)
        x_num = self.extra_features(x_num)
        x_genre = self.genre_features(genre)

        x = torch.cat([x_embed, x_num, x_genre], dim=1)
        out = self.mlp(x)
        return out.squeeze(1)


In [12]:
n_users = df['user'].nunique()
n_items = df['title_enc'].nunique()
n_genres = len(genre_list)

model = RecSysModel(n_users, n_items, n_genres=n_genres)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 10
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:

        users = batch['users'].to(device)
        items = batch['items'].to(device)
        vote_avg = batch['vote_avg'].to(device)
        count = batch['count'].to(device)
        popularity = batch['popularity'].to(device)
        genre = batch['genre'].to(device)

        ratings = batch['y'].to(device).float()

        optimizer.zero_grad()
        preds = model(users, items, vote_avg, count,popularity,genre)

        loss = criterion(preds, ratings)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    avg_train_loss = train_loss / len(train_loader)

    model.eval()
    val_loss = 0
    val_preds = []
    val_truths = []

    with torch.no_grad():
        for batch in val_loader:

            users = batch['users'].to(device)
            items = batch['items'].to(device)
            vote_avg = batch['vote_avg'].to(device)
            count = batch['count'].to(device)
            ratings = batch['y'].to(device).float()
            popularity = batch['popularity'].to(device)
            genre = batch['genre'].to(device)


            preds = model(users, items, vote_avg, count, popularity, genre)
            val_preds.append(preds.detach().cpu())
            val_truths.append(ratings.detach().cpu())

            loss = criterion(preds, ratings)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1} Train Loss: {avg_train_loss:.4f} Val Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_model.pth")
        print("Saved best model.")


preds_tensor = torch.cat(val_preds)
truths_tensor = torch.cat(val_truths)
rmse = torch.sqrt(torch.mean((preds_tensor - truths_tensor) ** 2))
mae = torch.mean(torch.abs(preds_tensor - truths_tensor))
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")

Epoch 1 Train Loss: 1.7817 Val Loss: 1.0485
Saved best model.
Epoch 2 Train Loss: 1.0846 Val Loss: 0.9343
Saved best model.
Epoch 3 Train Loss: 0.9844 Val Loss: 0.8086
Saved best model.
Epoch 4 Train Loss: 0.9195 Val Loss: 0.8605
Epoch 5 Train Loss: 0.8798 Val Loss: 0.7830
Saved best model.
Epoch 6 Train Loss: 0.8458 Val Loss: 0.7602
Saved best model.
Epoch 7 Train Loss: 0.8148 Val Loss: 0.7524
Saved best model.
Epoch 8 Train Loss: 0.7888 Val Loss: 0.7447
Saved best model.
Epoch 9 Train Loss: 0.7608 Val Loss: 0.7598
Epoch 10 Train Loss: 0.7390 Val Loss: 0.7530
Validation RMSE: 0.8678
Validation MAE: 0.6685


In [16]:
import pandas as pd
def suggest_movies_for_user(user_id, model, df, item_encoder, top_k=10):

  model.eval()
  with torch.no_grad():
    # Get movies the user hasn't rated
    user_rated_movies = df[df['userId'] == user_id]['title'].unique()
    all_movies = df['title'].unique()
    unrated_movies = [movie for movie in all_movies if movie not in user_rated_movies]

    # Create a DataFrame of unrated movies with relevant features
    unrated_df = df[df['title'].isin(unrated_movies)].drop_duplicates(subset=['title']).copy()

    # Apply the same scaling to 'vote_average', 'vote_count', and 'popularity' as used for training
    unrated_df[['vote_avg', 'count', 'popularity_scaled']] = scaler.transform(unrated_df[['vote_average', 'vote_count', 'popularity']])
    unrated_df['title_enc'] = itemr_enc.transform(unrated_df['title'])

    # Create tensors for prediction
    user_ids_tensor = torch.tensor([user_enc.transform([user_id])[0]] * len(unrated_df), dtype=torch.long).to(device)
    item_ids_tensor = torch.tensor(unrated_df['title_enc'].values, dtype=torch.long).to(device)
    vote_avg_tensor = torch.tensor(unrated_df['vote_avg'].values, dtype=torch.float32).to(device)
    count_tensor = torch.tensor(unrated_df['count'].values, dtype=torch.float32).to(device)
    popularity_tensor = torch.tensor(unrated_df['popularity_scaled'].values, dtype=torch.float32).to(device)
    genre_tensor = torch.tensor(unrated_df['genre_vector'].tolist(), dtype=torch.float32).to(device)


    # Predict ratings
    predicted_ratings = model(user_ids_tensor, item_ids_tensor, vote_avg_tensor, count_tensor,popularity_tensor,genre_tensor).cpu().numpy()
    unrated_df['predicted_rating'] = predicted_ratings

    # Filter by vote count and vote avg.
    unrated_df = unrated_df[(unrated_df['vote_count'] > 200) & (unrated_df['vote_average'] > 5)]
    suggestions = unrated_df.sort_values(by='predicted_rating', ascending=False).head(top_k)

    return suggestions[['title', 'predicted_rating', 'vote_average', 'vote_count', 'popularity','genres_str']]

# Test with a random user-id
suggested_movies = suggest_movies_for_user(209, model, df, itemr_enc, top_k=10)
suggested_movies

Unnamed: 0,title,predicted_rating,vote_average,vote_count,popularity,genres_str
9528,The Shawshank Redemption,4.238797,8.5,8358.0,51.645403,Drama|Crime
92848,Inglourious Basterds,4.168882,7.9,6598.0,16.89564,Drama|Action|Thriller|War
22010,The Godfather,4.165902,8.5,6024.0,41.109264,Drama|Crime
7519,Star Wars,4.161028,8.1,6778.0,42.149697,Adventure|Action|Science Fiction
28495,The Princess Bride,4.145978,7.6,1518.0,15.15267,Adventure|Family|Fantasy|Comedy|Romance
53383,Run Lola Run,4.135096,7.2,672.0,7.765379,Action|Drama|Thriller
61442,The Kid,4.129302,8.0,404.0,8.168456,Comedy|Drama
97060,The Imposter,4.125059,7.4,273.0,8.970709,Documentary|Drama
24095,It's a Wonderful Life,4.115511,8.0,1103.0,15.031588,Drama|Family|Fantasy
69134,Memento,4.114201,8.1,4168.0,15.450789,Mystery|Thriller
