<a href="https://colab.research.google.com/github/kimhkh/movieRecommendationSystem/blob/Kam_branch/movie_recommendation_system_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install surprise

In [None]:
pip install pytorch_lightning


In [None]:
from surprise import SVD, accuracy, SVDpp
from surprise.model_selection import cross_validate
import pandas as pd
import numpy as np
import os
from surprise import Reader, Dataset
import seaborn as sns
import matplotlib.pyplot as plt
import time
from surprise.model_selection import train_test_split
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset as d
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from datetime import datetime

In [None]:
# Load the raw ratings CSV into a DataFrame.
df_rating = pd.read_csv(
    '/content/drive/MyDrive/movielens1B/ratings.csv',
)

In [None]:
# Fix the seed so the 20% user sample below is reproducible across runs.
np.random.seed(123)

# Compute the distinct user ids once and reuse them for both the population
# and the sample size.
unique_user_ids = df_rating['userId'].unique()
rand_userIds = np.random.choice(unique_user_ids,
                                size=int(len(unique_user_ids) * 0.2),
                                replace=False)

# Keep only the sampled users. The explicit .copy() makes the result an
# independent frame, so the column assignment below writes to real data
# instead of a slice of the original frame.
df_rating = df_rating.loc[df_rating['userId'].isin(rand_userIds)].copy()

print('There are {} rows of data from {} users'.format(len(df_rating), len(df_rating['userId'].unique())))
# NOTE(review): dividing by 1e3 treats the raw value as milliseconds; confirm
# the unit of the `timestamp` column in ratings.csv. Relative ordering of the
# timestamps is unaffected either way.
df_rating["timestamp"] = df_rating.timestamp.apply(lambda x: datetime.fromtimestamp(x / 1e3))

In [None]:
# Preview the first five rows of df_rating.
df_rating.head(5)

In [None]:
# Load df_rating into a Surprise dataset via load_from_df(), using a Reader
# that declares the rating scale as 0.5 to 5.

reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df_rating[['userId', 'movieId', 'rating']], reader)
# Hold out 20% of the ratings as the test set.
trainset, testset = train_test_split(data, test_size=0.20)

In [None]:
# Fit SVD on the training split, report RMSE on the test split, and time the
# whole run.
# Local re-import keeps this cell runnable even if the name `time` was rebound
# elsewhere in the session.
import time

start = time.time()
algo = SVD()
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)
end = time.time()
# Store the duration under its own name; assigning it to `time` would replace
# the module with a float.
elapsed = end - start
print(elapsed)

In [None]:
# Time a 5-fold cross-validation (RMSE only) of the current `algo` on the
# full dataset.
# Local re-import keeps this cell runnable even if the name `time` was rebound
# elsewhere in the session.
import time

start1 = time.time()
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
end1 = time.time()
# Store the duration under its own name; assigning it to `time` would replace
# the module with a float.
elapsed = end1 - start1
print(elapsed)

In [None]:
# Fit SVD++ on the training split, report RMSE on the test split, and time the
# whole run.
# Local re-import keeps this cell runnable even if the name `time` was rebound
# elsewhere in the session.
import time

start = time.time()
algo = SVDpp()
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)
end = time.time()
# Store the duration under its own name; assigning it to `time` would replace
# the module with a float.
elapsed = end - start
print(elapsed)

In [None]:
# Time a 5-fold cross-validation (RMSE only) of the current `algo` on the
# full dataset.
# Local re-import keeps this cell runnable even if the name `time` was rebound
# elsewhere in the session.
import time

start1 = time.time()
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
end1 = time.time()
# Store the duration under its own name; assigning it to `time` would replace
# the module with a float.
elapsed = end1 - start1
print(elapsed)

In [None]:
# Rank every rating within its user by timestamp, newest first; ties are
# broken by order of appearance, so each user has exactly one row ranked 1.
recency_rank = df_rating.groupby(['userId'])['timestamp'].rank(
    method='first',
    ascending=False,
)
df_rating['rank_latest'] = recency_rank
df_rating

In [None]:
# Use each user's earlier reviews for training and the latest review for testing.
is_latest = df_rating['rank_latest'] == 1
# .copy() makes both splits independent frames, so in-place edits made to them
# afterwards do not act on a slice of df_rating.
train_ratings = df_rating[~is_latest].copy()
test_ratings = df_rating[is_latest].copy()
train_ratings

In [None]:
# Convert the training data into an implicit-feedback dataset.
# Every row in train_ratings is an observed interaction, so the whole
# `rating` column is overwritten with 1 ("the user interacted with the movie").
train_ratings.loc[:, 'rating'] = 1
# Show ten random rows as a quick check of the result.
train_ratings.sample(10)

In [None]:
# Build training triples (user, item, label) with sampled negatives.

# Candidate pool for negatives: every movie id present in df_rating.
all_movieIds = df_rating['movieId'].unique()

# Parallel output lists: one entry per training example.
users, items, labels = [], [], []

# All (user, movie) pairs observed in the training ratings.
user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

# Each positive pair is followed by this many negatives (4:1 ratio).
num_negatives = 4

for user_id, movie_id in tqdm(user_item_set):
    # The observed pair is a positive example.
    users.append(user_id)
    items.append(movie_id)
    labels.append(1)

    accepted = 0
    while accepted < num_negatives:
        candidate = np.random.choice(all_movieIds)
        if (user_id, candidate) in user_item_set:
            # The user has interacted with this movie; draw again.
            continue
        # A movie the user has not interacted with is a negative example.
        users.append(user_id)
        items.append(candidate)
        labels.append(0)
        accepted += 1

In [None]:
class MovieLensTrainDataset(torch.utils.data.Dataset):
    """MovieLens PyTorch Dataset for Training

    Each observed (user, movie) pair in ``ratings`` becomes one positive
    example (label 1), followed by ``num_negatives`` randomly drawn movies the
    user has no observed pair with (label 0).

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` in this file refers to Surprise's dataset class.

    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings; the
            ``userId`` and ``movieId`` columns are read
        all_movieIds (list): List containing all movieIds
        num_negatives (int): Negative examples drawn per positive pair
            (default 4)

    Raises:
        ValueError: If negatives are requested but some user already has an
            observed pair with every id in ``all_movieIds``, so no negative
            could ever be drawn for that user.
    """

    def __init__(self, ratings, all_movieIds, num_negatives=4):
        self.users, self.items, self.labels = self.get_dataset(
            ratings, all_movieIds, num_negatives)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds, num_negatives=4):
        """Return (users, items, labels) tensors of positives plus sampled negatives."""
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        if num_negatives > 0:
            # Rejection sampling below would spin forever for a user who has
            # already interacted with the whole catalogue; fail loudly instead.
            catalogue = set(np.asarray(all_movieIds).tolist())
            seen_in_catalogue = {}
            for u, i in user_item_set:
                if i in catalogue:
                    seen_in_catalogue[u] = seen_in_catalogue.get(u, 0) + 1
            exhausted = [u for u, count in seen_in_catalogue.items()
                         if count >= len(catalogue)]
            if exhausted:
                raise ValueError(
                    "cannot sample negatives: {} user(s) have interacted with "
                    "every movie id (e.g. user {})".format(len(exhausted), exhausted[0]))

        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                # Redraw until the movie is one this user has no pair with.
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [None]:
class NCF(pl.LightningModule):
    """Neural Collaborative Filtering (NCF) model.

    User and item ids are each embedded into 8 dimensions, the two embeddings
    are concatenated, and the result passes through two ReLU dense layers and
    a sigmoid output unit.

        Args:
            num_users (int): Number of rows in the user embedding table
            num_items (int): Number of rows in the item embedding table
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """

    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        embedding_dim = 8
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # The first dense layer consumes the concatenated user+item embedding.
        self.fc1 = nn.Linear(2 * embedding_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds

    def forward(self, user_input, item_input):
        """Return the sigmoid score for each (user, item) pair."""
        embedded_pair = torch.cat(
            [self.user_embedding(user_input), self.item_embedding(item_input)],
            dim=-1,
        )
        hidden = torch.relu(self.fc1(embedded_pair))
        hidden = torch.relu(self.fc2(hidden))
        return torch.sigmoid(self.output(hidden))

    def training_step(self, batch, batch_idx):
        """Return the binary cross-entropy loss for one batch."""
        user_input, item_input, labels = batch
        # Reshape labels to a float column so they line up with the
        # one-unit output of the network.
        targets = labels.view(-1, 1).float()
        scores = self(user_input, item_input)
        return nn.functional.binary_cross_entropy(scores, targets)

    def configure_optimizers(self):
        """Use Adam with its default settings over all parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build a new training dataset (with newly drawn negatives) and wrap it in a loader."""
        train_dataset = MovieLensTrainDataset(self.ratings, self.all_movieIds)
        return DataLoader(train_dataset, batch_size=512, num_workers=1)

In [None]:
# The embedding tables are indexed directly by id, so each one is sized as
# the largest id plus one.
num_users, num_items = (
    df_rating[id_column].max() + 1 for id_column in ('userId', 'movieId')
)

# Every distinct movie id in df_rating.
all_movieIds = df_rating['movieId'].unique()

model = NCF(
    num_users,
    num_items,
    train_ratings,
    all_movieIds,
)

In [None]:
import time
start = time.time()
trainer = pl.Trainer(max_epochs=3, gpus=1, reload_dataloaders_every_epoch=True,
                     progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)

trainer.fit(model)
end = time.time()
time = end-start

In [None]:
print(time)

In [None]:
# Hit-ratio evaluation: for each held-out (user, item) pair, score the item
# against 99 sampled movies the user has not interacted with, and record
# whether it lands in the ten highest-scoring candidates.

# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = df_rating.groupby('userId')['movieId'].apply(list).to_dict()

# Loop-invariant: build the full movie-id set once instead of once per pair.
all_movie_id_set = set(all_movieIds)

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()

hits = []
with torch.no_grad():
    for (u, i) in tqdm(test_user_item_set):
        interacted_items = user_interacted_items[u]
        not_interacted_items = all_movie_id_set - set(interacted_items)
        # NOTE(review): np.random.choice samples with replacement by default,
        # so the 99 negatives may contain duplicates; confirm this is intended.
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
        test_items = selected_not_interacted + [i]

        # Build the inputs on the model's device and bring the scores back to
        # the CPU before converting to NumPy.
        user_tensor = torch.tensor([u] * len(test_items), device=model.device)
        item_tensor = torch.tensor(test_items, device=model.device)
        predicted_labels = np.squeeze(model(user_tensor, item_tensor).cpu().numpy())

        # `idx` (not `i`) so the held-out item id is not visually shadowed.
        top10_items = [test_items[idx] for idx in np.argsort(predicted_labels)[::-1][0:10].tolist()]

        hits.append(1 if i in top10_items else 0)

print("The accuracy of the recommended top 10 items that the user will interact is {:.2f}".format(np.average(hits)))

In [None]:
# Map each userId to the list of movieIds appearing in that user's rows.
df = (
    df_rating
    .groupby('userId')['movieId']
    .apply(list)
    .to_dict()
)
