In [1]:
!pip install surprise



In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [3]:
# Load the datasets
anime_df = pd.read_csv("/Users/harrychang/Desktop/Y3S2/DSA4212/assignment2/assignment_2_data/assignment_2_anime.csv")
ratings_train_df = pd.read_csv("/Users/harrychang/Desktop/Y3S2/DSA4212/assignment2/assignment_2_data/assignment_2_ratings_train.csv")
ratings_test_df = pd.read_csv("/Users/harrychang/Desktop/Y3S2/DSA4212/assignment2/assignment_2_data/assignment_2_ratings_test.csv")

In [4]:
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [5]:
# Preprocess the data
anime_df["genre"] = anime_df["genre"].fillna("").apply(lambda x: " ".join(x.lower().split(", ")))
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,drama romance school supernatural,Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665
2,28977,Gintama°,action comedy historical parody samurai sci-fi...,TV,51,9.25,114262
3,9253,Steins;Gate,sci-fi thriller,TV,24,9.17,673572
4,9969,Gintama&#039;,action comedy historical parody samurai sci-fi...,TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,hentai,OVA,1,4.15,211
12290,5543,Under World,hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,hentai,OVA,1,4.98,175


In [6]:
# Create a TF-IDF matrix for anime genres
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(anime_df["genre"])

In [7]:
# Compute the cosine similarity between anime
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [8]:
# Create a mapping between anime_id and index
anime_id_mapping = {anime_id: index for index, anime_id in enumerate(anime_df["anime_id"])}


In [11]:
def predict_rating(user_id, anime_id):
    try:
        anime_index = anime_id_mapping[anime_id]

        # Calculate the cosine similarity scores for the target anime
        sim_scores = list(enumerate(cosine_sim[anime_index]))

        # Sort the anime based on their similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Find the indices of the top 10 most similar anime
        top_anime_indices = [i[0] for i in sim_scores[1:11]]

        # Calculate the weighted average rating of the top 10 most similar anime
        top_anime_ratings = ratings_train_df[ratings_train_df["anime_id"].isin(top_anime_indices)]
        weighted_rating_sum = np.sum(top_anime_ratings["rating"] * top_anime_ratings["anime_id"].apply(lambda x: cosine_sim[anime_index, anime_id_mapping[x]]))
        similarity_sum = np.sum(top_anime_ratings["anime_id"].apply(lambda x: cosine_sim[anime_index, anime_id_mapping[x]]))

        if similarity_sum == 0:
            # If no similar anime are found, return the mean rating of the target anime as a fallback
            return np.mean(ratings_train_df[ratings_train_df["anime_id"] == anime_id]["rating"])
        else:
            return weighted_rating_sum / similarity_sum

    except (IndexError, KeyError) as e:
        print(f"{type(e).__name__} encountered for user_id: {user_id} and anime_id: {anime_id}")
        return -1  # Return a default rating or any appropriate value

In [12]:
# Calculate the predicted ratings for the test dataset
ratings_test_df['predicted_rating'] = ratings_test_df.apply(
    lambda x: predict_rating(x['user_id'], x['anime_id']), axis=1)


KeyError encountered for user_id: 34240 and anime_id: 30913
KeyError encountered for user_id: 37442 and anime_id: 30913


In [13]:
# Remove rows with invalid predicted ratings (e.g., -1)
clean_ratings_test_df = ratings_test_df[ratings_test_df['predicted_rating'] != -1]

In [15]:
# Fill NaN values in 'rating' column with the mean rating
mean_rating = clean_ratings_test_df['rating'].mean()
clean_ratings_test_df['rating'].fillna(mean_rating, inplace=True)

# Fill NaN values in 'predicted_rating' column with the mean predicted rating
mean_predicted_rating = clean_ratings_test_df['predicted_rating'].mean()
clean_ratings_test_df['predicted_rating'].fillna(mean_predicted_rating, inplace=True)

# Calculate the MSE for content-based filtering
mse = mean_squared_error(clean_ratings_test_df['rating'], clean_ratings_test_df['predicted_rating'])
print("Content-based filtering MSE:", mse)

Content-based filtering MSE: 2.7956033897683574


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
