In [1]:
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.optimizers import Adam
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Data processing & cleaning**

Extract Zip files

In [2]:
# Path to the ZIP file
zip_files = [
    '/content/drive/MyDrive/CS547/_24Fall_CS547_IR-Project/AnimeList.csv.zip',
    '/content/drive/MyDrive/CS547/_24Fall_CS547_IR-Project/UserList.csv.zip',
    '/content/drive/MyDrive/CS547/_24Fall_CS547_IR-Project/UserAnimeList.csv.zip'
]

# Extract location
extract_to_path = '/content/sample_data'

# Extract the ZIP file
for zip_file_path in zip_files:
  with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
      zip_ref.extractall(extract_to_path)
      print(f"Files extracted to {extract_to_path}")

  print(f"All files has been extracted")

Files extracted to /content/sample_data
All files has been extracted
Files extracted to /content/sample_data
All files has been extracted
Files extracted to /content/sample_data
All files has been extracted


Load data

In [3]:
animes_list = pd.read_csv("/content/sample_data/AnimeList.csv")
user_list = pd.read_csv("/content/sample_data/UserList.csv")
user_anime_list = pd.read_csv("/content/sample_data/UserAnimeList.csv", nrows=10000)
print(f'Anime List Shape: {animes_list.shape}')
print(f'User List Shape: {user_list.shape}')
print(f'User Anime List Shape: {user_anime_list.shape}')

Anime List Shape: (14478, 31)
User List Shape: (302675, 17)
User Anime List Shape: (10000, 11)


In [8]:
user_anime_list.head(2)

Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
0,karthiga,21,586,0000-00-00,0000-00-00,9,1,,0,1362307973,
1,karthiga,59,26,0000-00-00,0000-00-00,7,2,,0,1362923691,


In [8]:
anime_list.head(2)

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,background,premiered,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme
0,11013,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,...,Inu x Boku SS was licensed by Sentai Filmworks...,Winter 2012,Fridays at Unknown,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ..."
1,2104,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,The Inland Sea Bride,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,,Spring 2007,Unknown,"{'Adaptation': [{'mal_id': 759, 'type': 'manga...","TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi..."


In [9]:
user_list.head(2)

Unnamed: 0,username,user_id,user_watching,user_completed,user_onhold,user_dropped,user_plantowatch,user_days_spent_watching,gender,location,birth_date,access_rank,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes
0,karthiga,2255153,3,49,1,0,0,55.31,Female,"Chennai, India",1990-04-29,,2013-03-03,2014-02-04 01:32:00,7.43,0.0,3391.0
1,RedvelvetDaisuki,1897606,61,396,39,0,206,118.07,Female,Manila,1995-01-01,,2012-12-13,1900-05-13 02:47:00,6.78,80.0,7094.0


Merge data

In [4]:
data = user_anime_list.merge(user_list, on='username', how='left')
data = data.merge(animes_list, on='anime_id', how='left')
data = pd.DataFrame(data)
# Preview merged data
print(data.shape)
data.head(2)

(10000, 57)


Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,...,background,premiered,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme
0,karthiga,21,586,0000-00-00,0000-00-00,9,1,,0,1362307973,...,Several anime-original arcs have been adapted ...,Fall 1999,Sundays at 09:30 (JST),"{'Adaptation': [{'mal_id': 13, 'type': 'manga'...","Fuji TV, TAP, Shueisha","Funimation, 4Kids Entertainment",Toei Animation,"Action, Adventure, Comedy, Super Power, Drama,...","['#01: ""We Are! (ウィーアー!)"" by Hiroshi Kitadani ...","['#01: ""memories"" by Maki Otsuki (eps 1-30)', ..."
1,karthiga,59,26,0000-00-00,0000-00-00,7,2,,0,1362923691,...,"The original episodes 9 and 18 are ""recap"" epi...",Spring 2002,Wednesdays at 02:20 (JST),"{'Adaptation': [{'mal_id': 107, 'type': 'manga...","TBS, Pioneer LDC","Funimation, Geneon Entertainment USA",Madhouse,"Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen","['""Let Me Be With You"" by ROUND TABLE feat. Ni...","['#1: ""Raison d&#039;Être"" by Rie Tanaka (eps ..."


In [5]:
# Encode users and anime
user_ids = data["user_id"].astype("category").cat.codes
anime_ids = data["anime_id"].astype("category").cat.codes

# Add encoded IDs to the dataset
data["user_id_encoded"] = user_ids
data["anime_id_encoded"] = anime_ids

data[['user_id_encoded', 'anime_id_encoded', 'my_score']].head(2)


Unnamed: 0,user_id_encoded,anime_id_encoded,my_score
0,11,11,9
1,11,37,7


Data cleaning

In [7]:
# Count occurrences of each score
score_counts = data['my_score'].value_counts()

print("Score Counts:")
print(score_counts)


Score Counts:
my_score
0     4049
5     1440
7     1022
8      956
6      892
4      688
9      469
10     296
3      136
2       39
1       13
Name: count, dtype: int64


In [6]:
# data cleaning
data['genre'] = data['genre'].apply(lambda x: str(x) if not isinstance(x, str) else x)
#data['genre'] = data['genre'].fillna(data['title'])
data[data['genre'].isna()]


Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,...,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme,user_id_encoded,anime_id_encoded


In [8]:
print(data['genre'].apply(type).value_counts())
data = data[~data['genre'].apply(lambda x: isinstance(x, float) or pd.isna(x))] # Remove rows where 'genre' is of type float

print(data['genre'].apply(type).value_counts())

genre
<class 'str'>    10000
Name: count, dtype: int64
genre
<class 'str'>    10000
Name: count, dtype: int64


**Ranking**

In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


def preprocess_data(data):
    """
    Preprocess the data by computing genre embeddings once.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    data['genre_embedding'] = data['genre'].apply(
        lambda x: model.encode(x) if isinstance(x, str) else np.zeros(384)
    )
    return data


def rank_anime_warm(userid, anime_list, data):
    """
    Rank anime for a given user based on genre similarity (70%) and collaborative filtering (30%).
    """
    # Step 1: Map anime names to IDs
    anime_name_to_id = data.set_index('title')['anime_id_encoded'].to_dict()
    missing_animes = [anime for anime in anime_list if anime not in anime_name_to_id]
    if missing_animes:
        raise ValueError(f"Anime names not found in the dataset: {', '.join(missing_animes)}")

    anime_ids = [anime_name_to_id[anime_name] for anime_name in anime_list]

    # Step 2: User's watching history
    user_data = data[data['user_id_encoded'] == userid]
    watched_animes = user_data[['anime_id_encoded', 'my_score']]

    # Step 3: User-item matrix and similarity matrix
    user_anime_matrix = data.pivot_table(
        index='user_id_encoded', columns='anime_id_encoded', values='my_score', aggfunc='mean'
    ).fillna(0)
    user_similarity_matrix = cosine_similarity(user_anime_matrix)
    user_similarities = pd.DataFrame(
        user_similarity_matrix, index=user_anime_matrix.index, columns=user_anime_matrix.index
    )

    # Step 4: Compute user's genre preference
    user_genre_embeddings = data[
        data['anime_id_encoded'].isin(watched_animes['anime_id_encoded'])
    ]['genre_embedding'].tolist()
    user_genre_vector = (
        np.mean(user_genre_embeddings, axis=0) if user_genre_embeddings else np.zeros(384)
    )

    # Step 5: Rank anime based on similarity and collaborative filtering
    anime_ranking = []
    for anime_id in anime_ids:
        # User's rating for the anime
        user_rating = user_data[user_data['anime_id_encoded'] == anime_id]['my_score'].values
        user_rating = user_rating[0] if user_rating.size > 0 else 0

        # Collaborative filtering (CF) score
        similar_users = data[data['anime_id_encoded'] == anime_id].copy()
        similar_users['similarity'] = similar_users['user_id_encoded'].apply(
            lambda x: user_similarities.loc[userid, x]
        )
        top_similar_users = similar_users[similar_users['my_score'] > 0].nlargest(3, 'similarity')
        cf_score = (
            np.average(top_similar_users['my_score'], weights=top_similar_users['similarity'])
            if not top_similar_users.empty
            else 0
        )

        # If user's rating is 0, use CF score as a fallback
        if user_rating == 0:
            user_rating = cf_score

        # Genre similarity score
        anime_genre_vector = data.loc[data['anime_id_encoded'] == anime_id, 'genre_embedding'].values[0]
        genre_similarity = cosine_similarity([user_genre_vector], [anime_genre_vector])[0][0]

        # Final score (weighted combination)
        final_score = 0.7 * genre_similarity + 0.3 * user_rating
        anime_title = list(anime_name_to_id.keys())[list(anime_name_to_id.values()).index(anime_id)]
        anime_ranking.append((anime_title, final_score))

    # Sort by final score
    ranked_animes = sorted(anime_ranking, key=lambda x: x[1], reverse=True)
    return ranked_animes

In [20]:
userid = 12  # Example user ID
anime_list = ['Naruto', 'One Piece', 'Dragon Ball']  # List of anime names

df = data[['anime_id_encoded', 'user_id_encoded', 'title', 'genre', 'my_score']]
df = preprocess_data(df)
ranked_animes = rank_anime_warm(userid, anime_list, df)  # Call the function to rank the anime list for the given user

print(ranked_animes)

[('One Piece', 3.5966069877147673), ('Naruto', 2.986481004222897), ('Dragon Ball', 2.974612283706665)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['genre_embedding'] = data['genre'].apply(


In [26]:
data[data["user_id_encoded"]==12][['title','genre','my_score','anime_id_encoded']]

Unnamed: 0,title,genre,my_score,anime_id_encoded
1563,One Piece,"Action, Adventure, Comedy, Super Power, Drama,...",10,11
1564,Full Metal Panic!,"Action, Military, Sci-Fi, Comedy, Mecha",10,48
1565,Full Metal Panic? Fumoffu,"Action, Comedy, School",0,49
1566,Full Metal Panic! The Second Raid,"Action, Military, Mecha",0,50
1567,Mobile Suit Gundam,"Action, Space, Mecha, Military, Sci-Fi",0,56
...,...,...,...,...
1709,Hamatora The Animation,"Mystery, Comedy, Super Power, Drama",0,4307
1710,Seitokai Yakuindomo*,"Slice of Life, Comedy, School, Shounen",0,4318
1711,Sekai Seifuku: Bouryaku no Zvezda,"Action, Comedy, Fantasy",0,4333
1712,Witch Craft Works,"Action, Fantasy, Magic, Seinen, Supernatural",0,4342
