In [1]:
import pandas as pd

df = pd.read_parquet('movie_data.parquet')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99806 entries, 0 to 99805
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   userId             99806 non-null  int64  
 1   rating             99806 non-null  float64
 2   imdb_id            99806 non-null  object 
 3   adult              99806 non-null  object 
 4   genres             99806 non-null  object 
 5   original_language  99806 non-null  object 
 6   overview           99792 non-null  object 
 7   popularity         99806 non-null  float64
 8   poster_path        99800 non-null  object 
 9   release_date       99800 non-null  object 
 10  runtime            99806 non-null  float64
 11  title              99806 non-null  object 
 12  vote_average       99806 non-null  float64
 13  vote_count         99806 non-null  float64
dtypes: float64(5), int64(1), object(8)
memory usage: 10.7+ MB


In [3]:
ratings = df[['userId','title','rating','vote_average','vote_count', 'popularity']]
ratings.head()

Unnamed: 0,userId,title,rating,vote_average,vote_count,popularity
0,7,Toy Story,3.0,7.7,5415.0,21.946943
1,9,Toy Story,4.0,7.7,5415.0,21.946943
2,13,Toy Story,5.0,7.7,5415.0,21.946943
3,15,Toy Story,2.0,7.7,5415.0,21.946943
4,19,Toy Story,3.0,7.7,5415.0,21.946943


In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

user_enc = LabelEncoder()
itemr_enc = LabelEncoder()

scaler = StandardScaler()

ratings['user'] = user_enc.fit_transform(ratings['userId'])
ratings['title_enc'] = itemr_enc.fit_transform(ratings['title'])

ratings[['vote_avg', 'count']] = scaler.fit_transform(ratings[['vote_average', 'vote_count']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['user'] = user_enc.fit_transform(ratings['userId'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['title_enc'] = itemr_enc.fit_transform(ratings['title'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings[['vote_avg', 'count']] = scaler.fit_transform(ratings[['vote_average', '

In [5]:
ratings = ratings[['user','title','vote_avg','count', 'title_enc','rating']]
ratings.head()

Unnamed: 0,user,title,vote_avg,count,title_enc,rating
0,6,Toy Story,0.983228,1.869009,8116,3.0
1,8,Toy Story,0.983228,1.869009,8116,4.0
2,12,Toy Story,0.983228,1.869009,8116,5.0
3,14,Toy Story,0.983228,1.869009,8116,2.0
4,18,Toy Story,0.983228,1.869009,8116,3.0


In [6]:
from torch.utils.data import Dataset
import torch

class RatingsDataset(Dataset):
    def __init__(self, df):

        self.users = torch.tensor(df['user'].values, dtype=torch.long)
        self.items = torch.tensor(df['title_enc'].values, dtype=torch.long)
        self.vote_avg = torch.tensor(df['vote_avg'].values, dtype=torch.float32)
        self.count = torch.tensor(df['count'].values, dtype=torch.float32)
        self.y = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return {
            'users': self.users[idx],
            'items': self.items[idx],
            'vote_avg': self.vote_avg[idx],
            'count': self.count[idx],
            'y': self.y[idx]
        }

In [7]:
import torch.nn as nn

class RecSysModel(nn.Module):
    def __init__(self, n_users, n_items, embedding_dim=50):
        super().__init__()

        # Embeddings for users and items
        # Represent each user/item by a vector of size 50
        self.user_emb = nn.Embedding(n_users, embedding_dim)
        self.item_emb = nn.Embedding(n_items, embedding_dim)

        # vote_avg and count as input
        self.extra_features = nn.Linear(2, 32)

        # Final layers
        self.fc1 = nn.Linear(embedding_dim*2 + 32, 128)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, user_ids, item_ids, vote_avg, count):
        user_vec = self.user_emb(user_ids)
        item_vec = self.item_emb(item_ids)

        extra = torch.cat([vote_avg.unsqueeze(1), count.unsqueeze(1)], dim=1)
        extra_out = self.relu(self.extra_features(extra))


        x = torch.cat([user_vec, item_vec, extra_out], dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        rating = self.fc3(x)

        return rating.squeeze()

In [8]:
n_users = ratings['user'].nunique()
n_items = ratings['title_enc'].nunique()

model = RecSysModel(n_users, n_items)

In [9]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
from torch.utils.data import random_split
from torch.utils.data import DataLoader

dataset = RatingsDataset(ratings)

# Split into train/test
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 10
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:

        users = batch['users'].to(device)
        items = batch['items'].to(device)
        vote_avg = batch['vote_avg'].to(device)
        count = batch['count'].to(device)

        ratings = batch['y'].to(device).float()

        optimizer.zero_grad()
        preds = model(users, items, vote_avg, count)

        loss = criterion(preds, ratings)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    avg_train_loss = train_loss / len(train_loader)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:

            users = batch['users'].to(device)
            items = batch['items'].to(device)
            vote_avg = batch['vote_avg'].to(device)
            count = batch['count'].to(device)
            ratings = batch['y'].to(device).float()

            preds = model(users, items, vote_avg, count)
            loss = criterion(preds, ratings)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1} Train Loss: {avg_train_loss:.4f} Val Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_model.pth")
        print("Saved best model.")

Epoch 1 Train Loss: 1.4996 Val Loss: 0.8410
Saved best model.
Epoch 2 Train Loss: 0.8034 Val Loss: 0.8016
Saved best model.
Epoch 3 Train Loss: 0.7572 Val Loss: 0.7944
Saved best model.
Epoch 4 Train Loss: 0.7305 Val Loss: 0.7887
Saved best model.
Epoch 5 Train Loss: 0.7075 Val Loss: 0.7913
Epoch 6 Train Loss: 0.6869 Val Loss: 0.7944
Epoch 7 Train Loss: 0.6682 Val Loss: 0.7873
Saved best model.
Epoch 8 Train Loss: 0.6480 Val Loss: 0.7976
Epoch 9 Train Loss: 0.6302 Val Loss: 0.8074
Epoch 10 Train Loss: 0.6079 Val Loss: 0.8072


In [12]:
import pandas as pd
def suggest_movies_for_user(user_id, model, df, item_encoder, top_k=10):

  model.eval()
  with torch.no_grad():
    # Get movies the user hasn't rated
    user_rated_movies = df[df['userId'] == user_id]['title'].unique()
    all_movies = df['title'].unique()
    unrated_movies = [movie for movie in all_movies if movie not in user_rated_movies]

    # Create a DataFrame of unrated movies with relevant features
    unrated_df = df[df['title'].isin(unrated_movies)].drop_duplicates(subset=['title']).copy()

    # Apply the same scaling to 'vote_average' and 'vote_count' as used for training
    unrated_df[['vote_avg', 'count']] = scaler.transform(unrated_df[['vote_average', 'vote_count']])
    unrated_df['titles'] = itemr_enc.transform(unrated_df['title'])

    # Create tensors for prediction
    user_ids_tensor = torch.tensor([user_enc.transform([user_id])[0]] * len(unrated_df), dtype=torch.long).to(device)
    item_ids_tensor = torch.tensor(unrated_df['titles'].values, dtype=torch.long).to(device)
    vote_avg_tensor = torch.tensor(unrated_df['vote_avg'].values, dtype=torch.float32).to(device)
    count_tensor = torch.tensor(unrated_df['count'].values, dtype=torch.float32).to(device)

    # Predict ratings
    predicted_ratings = model(user_ids_tensor, item_ids_tensor, vote_avg_tensor, count_tensor).cpu().numpy()
    unrated_df['predicted_rating'] = predicted_ratings

    # Filter by vote count and vote avg.
    unrated_df = unrated_df[(unrated_df['vote_count'] > 200) & (unrated_df['vote_average'] > 5)]
    suggestions = unrated_df.sort_values(by='predicted_rating', ascending=False).head(top_k)

    return suggestions[['title', 'predicted_rating', 'vote_average', 'vote_count', 'popularity']]

# Test with a random user-id
suggested_movies = suggest_movies_for_user(120, model, df, itemr_enc, top_k=10)
suggested_movies


Unnamed: 0,title,predicted_rating,vote_average,vote_count,popularity
95974,The Help,4.673569,7.9,1966.0,7.486118
90759,The Man from Earth,4.502898,7.7,696.0,9.29906
99688,Creed,4.487488,7.3,1963.0,33.449723
93565,Cloudy with a Chance of Meatballs,4.45422,6.5,1799.0,11.665693
94068,The Chaser,4.396053,7.7,250.0,7.062323
99760,Sing Street,4.393985,8.0,669.0,10.672862
26316,Reservoir Dogs,4.370923,8.1,3821.0,12.22034
95113,Harry Potter and the Deathly Hallows: Part 1,4.368943,7.5,5708.0,23.300362
92848,Inglourious Basterds,4.367455,7.9,6598.0,16.89564
89443,Ratatouille,4.354905,7.5,4510.0,20.50803
