In [17]:
#Data processing

import os
import json
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from annoy import AnnoyIndex

#load the raw ratings from animerating.csv into ratings_df and preview it.
#note: head(-1) returns every row except the last one (not just the first few rows),
#so the notebook renders a truncated view of almost the whole frame.
ratings_df = pd.read_csv("animerating.csv")
ratings_df.head(-1)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813731,73515,14345,7
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10


In [15]:

#function to hold out a fraction of data for testing. This is for train/test split.
def holdout_liked_movies(ratings_df, holdout_frac=0.3, random_state=42):
    """Split each user's liked ratings into a train part and a held-out test part.

    A rating counts as "liked" when it is >= 5.0 (out of 10). For every user with
    at least two liked ratings, a fraction ``holdout_frac`` of them is sampled into
    the test split; users with a single liked rating stay entirely in train.
    Afterwards, every movie that has more than one liked rating overall is made to
    appear in both splits by moving one of its rows across when it is missing from
    one side.

    Args:
        ratings_df: DataFrame with "userId", "movieId" and "rating" columns.
        holdout_frac: fraction of each user's liked ratings to hold out.
        random_state: seed passed to every ``DataFrame.sample`` call.

    Returns:
        (train_liked_df, test_liked_df). Both keep the columns and the original
        index labels of ``ratings_df``; either one may be empty.

    Side effects:
        Prints the number of liked ratings.
    """
    #get all data that has a movie rating of 5 or higher, out of 10
    #(movies that a user would find acceptable enough to want to rate)
    liked_df = ratings_df[ratings_df["rating"] >= 5.0]
    print("Total liked ratings:", len(liked_df))

    train_list, test_list = [], []

    #group once instead of re-filtering the whole frame for every user.
    #sort=False keeps the users in order of first appearance.
    for _, user_likes in liked_df.groupby("userId", sort=False):
        if len(user_likes) >= 2:
            test_sample = user_likes.sample(frac=holdout_frac, random_state=random_state)
            train_list.append(user_likes.drop(test_sample.index))
            test_list.append(test_sample)
        else:
            #should not split if only one interaction
            train_list.append(user_likes)

    #an empty split still carries the input columns, so the column lookups below
    #(and in callers) keep working when nothing was held out.
    empty_split = liked_df.iloc[0:0]
    train_liked_df = pd.concat(train_list) if train_list else empty_split.copy()
    test_liked_df = pd.concat(test_list) if test_list else empty_split.copy()

    #overall_counts is a Series: index = movieId, value = number of liked ratings.
    overall_counts = liked_df["movieId"].value_counts()
    #only movies with more than one liked rating can be present in both splits
    movies_to_fix = overall_counts[overall_counts > 1].index

    #moving a row of one movie never changes which split any other movie is in,
    #so presence can be looked up once, up front.
    train_movies = set(train_liked_df["movieId"])
    test_movies = set(test_liked_df["movieId"])

    to_train, to_test = [], []
    for movie in movies_to_fix:
        has_in_train = movie in train_movies
        has_in_test = movie in test_movies

        if has_in_test and not has_in_train:
            #movie only in test: move one instance from test to train.
            to_train.append(
                test_liked_df[test_liked_df["movieId"] == movie].sample(n=1, random_state=random_state)
            )
        elif has_in_train and not has_in_test:
            #movie only in train: move one instance from train to test.
            to_test.append(
                train_liked_df[train_liked_df["movieId"] == movie].sample(n=1, random_state=random_state)
            )

    #apply all moves in one go: drop the moved rows by index label from their old
    #split and append them, in processing order, to the other split.
    if to_test:
        train_liked_df = train_liked_df.drop(pd.concat(to_test).index)
    if to_train:
        test_liked_df = test_liked_df.drop(pd.concat(to_train).index)
    if to_train:
        train_liked_df = pd.concat(([train_liked_df] if len(train_liked_df) else []) + to_train)
    if to_test:
        test_liked_df = pd.concat(([test_liked_df] if len(test_liked_df) else []) + to_test)

    return train_liked_df, test_liked_df

#read the ratings csv and rename the id columns to the names used by the functions in this notebook
ratings_df = pd.read_csv("animerating.csv").rename(
    columns={"anime_id": "movieId", "user_id": "userId"}
)
#keep only the first 8% of the rows for training and testing:
#the full dataset (more than 7 million ratings) is too expensive to train on CPU
ratings_df = ratings_df.iloc[:int(len(ratings_df) * 0.08)]

#size of the reduced dataset
num_unique_users = ratings_df["userId"].nunique()
num_unique_movies = ratings_df["movieId"].nunique()
print(num_unique_users, num_unique_movies, sep="\n")
print("Total ratings after sampling:", len(ratings_df))

#train/test split: hold out 30% of every user's liked movies
train_liked_df, test_liked_df = holdout_liked_movies(ratings_df, holdout_frac=0.3)

#number of distinct movies that ended up in each split
num_train_movies = train_liked_df["movieId"].nunique()
num_test_movies = test_liked_df["movieId"].nunique()
print(f"num_test_movies: {num_test_movies}")
print(f"num_train_movies: {num_train_movies}")


6026
7540
Total ratings after sampling: 625098
Total liked ratings: 490644
num_test_movies: 6068
num_train_movies: 6575


In [19]:
#directed graph construction
#directed edge from user -> movie with a fixed weight of 1.0.
#reverse edge from movie -> user with weight = reverse_weight.
def create_directed_bipartite_graph_from_ratings(ratings_df, reverse_weight=1.0):
    """Build a directed user/movie graph from a ratings table.

    Every distinct userId becomes a node "u_<id>" (bipartite=0, node_type="user")
    and every distinct movieId a node "m_<id>" (bipartite=1, node_type="movie").
    Each row adds a user -> movie edge with weight 1.0 and a movie -> user edge
    with weight ``reverse_weight``. The "rating" column is not used for the edge
    weights.

    Args:
        ratings_df: DataFrame with "userId" and "movieId" columns.
        reverse_weight: weight stored on every movie -> user edge.

    Returns:
        The populated ``nx.DiGraph``.
    """
    graph = nx.DiGraph()
    #add user nodes.
    graph.add_nodes_from(
        (f"u_{user}" for user in ratings_df["userId"].unique()),
        bipartite=0, node_type="user",
    )
    #add movie nodes.
    graph.add_nodes_from(
        (f"m_{movie}" for movie in ratings_df["movieId"].unique()),
        bipartite=1, node_type="movie",
    )
    #add directed edges.
    #iterate the two id columns directly: iterrows() upcasts each row to a common
    #dtype, which can turn integer ids into floats and produce names like "u_1.0"
    #that no longer match the nodes registered above.
    for user, movie in zip(ratings_df["userId"], ratings_df["movieId"]):
        user_node = f"u_{user}"
        movie_node = f"m_{movie}"
        #add_edge updates the attributes of an existing edge, so a repeated
        #(user, movie) pair leaves the same weights in place.
        graph.add_edge(user_node, movie_node, weight=1.0)
        graph.add_edge(movie_node, user_node, weight=reverse_weight)
    return graph

#create annoy file from ratings
def create_node2vec_annoy_from_ratings(ratings_df, base_dir="Annoy", embed_dimensions=256, num_trees=50,
                                       walk_length=40, num_walks=80, p=0.5, q=10, workers=1):
    """Train node2vec embeddings on the ratings graph and index the user vectors with Annoy.

    Builds the directed user/movie graph from ``ratings_df``, fits node2vec on it,
    adds the embedding of every user node to an Annoy index (angular metric), and
    writes the index to ``<base_dir>/global_users.ann`` and the id mappings to
    ``<base_dir>/global_map.json``.

    Args:
        ratings_df: ratings table passed to create_directed_bipartite_graph_from_ratings.
        base_dir: output directory; created if it does not exist.
        embed_dimensions: embedding size, also the dimension of the Annoy index.
        num_trees: number of trees passed to ``AnnoyIndex.build``.
        walk_length: node2vec walk length.
        num_walks: node2vec number of walks.
        p: node2vec ``p`` parameter.
        q: node2vec ``q`` parameter.
        workers: number of workers passed to Node2Vec.

    Returns:
        (annoy_index, user_index_map, index_user_map, model) where user_index_map
        maps "u_<id>" node names to Annoy item indices, index_user_map maps Annoy
        item indices back to integer user ids, and model is the fitted node2vec model.
    """
    os.makedirs(base_dir, exist_ok=True)
    #create a directed bipartite graph
    graph = create_directed_bipartite_graph_from_ratings(ratings_df)

    node2vec = Node2Vec(
        graph,
        dimensions=embed_dimensions,
        walk_length=walk_length,     # Longer walks for richer context.
        num_walks=num_walks,         # More walks for better sampling.
        p=p,
        q=q,
        weight_key="weight",
        workers=workers,
    )

    model = node2vec.fit(window=5, min_count=1, batch_words=4)

    #collect the user nodes (node_type == "user") in sorted name order so that the
    #Annoy item numbering is stable for a given graph
    user_nodes = sorted(
        node for node, data in graph.nodes(data=True)
        if data.get("node_type") == "user"
    )

    annoy_index = AnnoyIndex(embed_dimensions, metric="angular")
    #construct the user_index_map and index_user_map for retrieval with ANN file
    user_index_map = {}
    index_user_map = {}

    for i, user_node in enumerate(user_nodes):
        annoy_index.add_item(i, model.wv[user_node])
        user_index_map[user_node] = i
        #node names look like "u_<id>"; store the numeric id
        index_user_map[i] = int(user_node.split("_")[1])

    annoy_index.build(num_trees)
    annoy_index.save(os.path.join(base_dir, "global_users.ann"))
    #note: JSON object keys are always strings, so the integer keys of
    #index_user_map come back as strings when this file is loaded again.
    map_info = {
        "user_index_map": user_index_map,
        "index_user_map": index_user_map,
        "embed_dimensions": embed_dimensions
    }
    with open(os.path.join(base_dir, "global_map.json"), "w") as f:
        json.dump(map_info, f)

    return annoy_index, user_index_map, index_user_map, model
    
#train embeddings using the training ratings with the directed graph.
#only train_liked_df is used, so the held-out test ratings never enter the graph.
#with the default base_dir this also writes Annoy/global_users.ann and Annoy/global_map.json.
#walk generation alone took about 24 minutes in the recorded run below.
ann_index, user_index_map, index_user_map, model = create_node2vec_annoy_from_ratings(train_liked_df)


Computing transition probabilities: 100%|█| 12258/12258 [02:20<00:00, 86.96it/s]
Generating walks (CPU: 1): 100%|████████████████| 80/80 [23:56<00:00, 17.96s/it]


In [113]:
#evaluate the recommendation system using Precision@k.
#Annoy returns a user's nearest neighbours; from this fixed-size set of neighbours
#we take the movies they liked most frequently as the top-k recommendations.
#because the output is a ranked top-k list, Precision@k is a suitable measure and,
#given the nature of ANN retrieval, is preferred here over the plain precision rate.

def recommend_movies(target_user, neighbour_ids, train_liked_df, top_k=4):
    """Recommend up to top_k movies for target_user from its neighbours' liked movies.

    Each neighbour contributes one vote per distinct movie it liked in
    train_liked_df. Movies the target user already liked in training are skipped.
    The result is ordered by vote count (highest first), ties broken by ascending
    movieId, and cut to at most top_k entries.
    """
    user_col = train_liked_df['userId']
    movie_col = train_liked_df['movieId']

    #movies the target user already liked in training are never recommended
    already_liked = set(movie_col[user_col == target_user])

    #votes[movie] = number of neighbours that liked the movie
    votes = {}
    for neighbour in neighbour_ids:
        for movie in set(movie_col[user_col == neighbour]):
            if movie in already_liked:
                continue
            votes[movie] = votes.get(movie, 0) + 1

    #most votes first; equal votes are ordered by ascending movieId
    ranked = sorted(votes, key=lambda movie: (-votes[movie], movie))

    #a non-positive top_k yields an empty list
    return ranked[:max(top_k, 0)]

#returns overall average precision and a dictionary of per-user precision scores.
def evaluate_precision_at_k(ann_index, user_index_map, index_user_map,
                            train_liked_df, test_liked_df,
                            k_neighbours=3, k_recs=10):
    """Compute Precision@k for every indexed user and the mean over all of them.

    For each user in user_index_map: look up the k_neighbours nearest users in the
    Annoy index (the user itself excluded), build k_recs recommendations from those
    neighbours with recommend_movies, and score them as
    (# recommendations found in the user's test_liked_df movies) / k_recs.

    Returns (overall_precision, user_precision) where user_precision maps the
    integer user id to its score. Scores are 0 when k_recs is not positive, and
    overall_precision is 0 when there are no users.
    """
    per_user = {}

    for node_name, item_idx in user_index_map.items():
        #node names look like "u_<id>"
        uid = int(node_name.split("_")[1])

        #ask for one extra neighbour because the query item itself can be returned,
        #drop it, then keep at most k_neighbours of the rest
        candidates = ann_index.get_nns_by_item(item_idx, k_neighbours + 1)
        kept = [idx for idx in candidates if idx != item_idx][:k_neighbours]
        neighbour_ids = [index_user_map[idx] for idx in kept]

        #recommendations built from the neighbours' training likes
        recs = recommend_movies(uid, neighbour_ids, train_liked_df, top_k=k_recs)

        #the user's held-out liked movies
        held_out = set(test_liked_df[test_liked_df['userId'] == uid]['movieId'])

        #true positives: recommended movies that are in the held-out set
        hits = len(set(recs) & held_out)

        per_user[uid] = hits / float(k_recs) if k_recs > 0 else 0

    if not per_user:
        return 0, per_user

    return sum(per_user.values()) / len(per_user), per_user

#run function to evaluate Precision@K
#k is defined once so the printed label always matches the k that is evaluated
#(the label used to say Precision@10 while k_recs was 1).
eval_k_recs = 1
overall_prec_at_k, per_user_prec_at_k = evaluate_precision_at_k(
    ann_index, user_index_map, index_user_map,
    train_liked_df, test_liked_df,
    k_neighbours=3, k_recs=eval_k_recs
)
print(f"Overall Precision@{eval_k_recs}:", overall_prec_at_k)


Overall Precision@10: 0.2702797818053845
