In [7]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [8]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
animes_df = pd.read_csv("data/animes.csv")
ratings_df = pd.read_csv("data/reviews.csv")
profiles_df = pd.read_csv("data/profiles.csv")

In [27]:
# See the first 5 rows
print(animes_df.head())
print(ratings_df.head())
print(profiles_df.head())

     uid  ...                                               link
0  28891  ...  https://myanimelist.net/anime/28891/Haikyuu_Se...
1  23273  ...  https://myanimelist.net/anime/23273/Shigatsu_w...
2  34599  ...  https://myanimelist.net/anime/34599/Made_in_Abyss
3   5114  ...  https://myanimelist.net/anime/5114/Fullmetal_A...
4  31758  ...  https://myanimelist.net/anime/31758/Kizumonoga...

[5 rows x 12 columns]
      uid  ...                                           link
0  255938  ...  https://myanimelist.net/reviews.php?id=255938
1  259117  ...  https://myanimelist.net/reviews.php?id=259117
2  253664  ...  https://myanimelist.net/reviews.php?id=253664
3    8254  ...    https://myanimelist.net/reviews.php?id=8254
4  291149  ...  https://myanimelist.net/reviews.php?id=291149

[5 rows x 7 columns]
           profile  ...                                             link
0   DesolatePsyche  ...   https://myanimelist.net/profile/DesolatePsyche
1        baekbeans  ...        https://myanimel

In [4]:
# Extract start year from aired
animes_df["year"] = animes_df["aired"].str.extract(r'(\d{4})').astype(float)

# Convert stringified lists to actual lists
import ast
animes_df['genre'] = animes_df['genre'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
# One-hot encode genres
genre_dummies = animes_df["genre"].str.join('|').str.get_dummies()

# Select numeric features
num_features = animes_df[["year", "score"]].fillna(0)

# Normalize numeric features
scaler = MinMaxScaler()
num_features = pd.DataFrame(
    scaler.fit_transform(num_features),
    columns=num_features.columns,
    index=animes_df.index
)

# Combine everything
item_train = pd.concat([num_features, genre_dummies], axis=1)
print(item_train.head())

       year     score  Action  Adventure  Cars  Comedy  Dementia  Demons  \
0  0.997031  0.955580       0          0     0       1         0       0   
1  0.996536  0.956663       0          0     0       0         0       0   
2  0.998021  0.956663       0          1     0       0         0       0   
3  0.994062  1.000000       1          1     0       1         0       0   
4  0.998021  0.956663       1          0     0       0         0       0   

   Drama  Ecchi  ...  Shounen Ai  Slice of Life  Space  Sports  Super Power  \
0      1      0  ...           0              0      0       1            0   
1      1      0  ...           0              0      0       0            0   
2      1      0  ...           0              0      0       0            0   
3      1      0  ...           0              0      0       0            0   
4      0      0  ...           0              0      0       0            0   

   Supernatural  Thriller  Vampire  Yaoi  Yuri  
0             0    

In [5]:
## --- User features ---
import ast

# Convert stringified lists to real lists of ints
def parse_favorites(x):
    try:
        return [int(a) for a in ast.literal_eval(x)]
    except:
        return []  # fallback for malformed entries

profiles_df["favorites_anime"] = profiles_df["favorites_anime"].apply(parse_favorites)
# Step 2: Build user_train
user_features = []
# Create a mapping from anime UID to row index in item_train
anime_id_to_idx = {uid: idx for idx, uid in enumerate(animes_df["uid"])}

for _, row in profiles_df.iterrows():
    favs = row["favorites_anime"]
    
    # Get indices of these anime in item_train
    indices = [anime_id_to_idx[uid] for uid in favs if uid in anime_id_to_idx]
    
    if indices:
        # Average features of the anime to get user preference vector
        user_vec = item_train.iloc[indices].mean(axis=0)
    else:
        # If no valid favorites, use zero vector
        user_vec = np.zeros(item_train.shape[1])
    
    user_features.append(user_vec)

# Create user_train DataFrame
user_train = pd.DataFrame(user_features, index=profiles_df["profile"])
print(user_train.iloc[0].to_dict())


{'year': 0.9946066303809996, 'score': 0.8813109425785483, 'Action': 0.4, 'Adventure': 0.4, 'Cars': 0.0, 'Comedy': 0.45, 'Dementia': 0.0, 'Demons': 0.05, 'Drama': 0.45, 'Ecchi': 0.0, 'Fantasy': 0.25, 'Game': 0.05, 'Harem': 0.05, 'Hentai': 0.0, 'Historical': 0.1, 'Horror': 0.1, 'Josei': 0.05, 'Kids': 0.0, 'Magic': 0.0, 'Martial Arts': 0.05, 'Mecha': 0.05, 'Military': 0.05, 'Music': 0.1, 'Mystery': 0.25, 'Parody': 0.0, 'Police': 0.05, 'Psychological': 0.15, 'Romance': 0.45, 'Samurai': 0.0, 'School': 0.1, 'Sci-Fi': 0.1, 'Seinen': 0.15, 'Shoujo': 0.05, 'Shoujo Ai': 0.0, 'Shounen': 0.3, 'Shounen Ai': 0.0, 'Slice of Life': 0.3, 'Space': 0.0, 'Sports': 0.0, 'Super Power': 0.15, 'Supernatural': 0.4, 'Thriller': 0.0, 'Vampire': 0.05, 'Yaoi': 0.0, 'Yuri': 0.0}


In [6]:
user_train = user_train.fillna(0)
print(item_train.isna().sum().sum())   # total NaNs in item_train
print(user_train.isna().sum().sum())   # total NaNs in user_train

0
0


In [None]:


top_n = 10
all_recommendations = {}

for idx, profile in enumerate(user_train.index):
    user_vec = user_train.iloc[idx].values.reshape(1, -1)
    sims = cosine_similarity(user_vec, item_train.values)[0]
    top_idx = sims.argsort()[-top_n:][::-1]
    all_recommendations[profile] = animes_df.iloc[top_idx][['uid','title','score','link']]

# Example: show recommendations for first user
print(all_recommendations[user_train.index[0]])


         uid  ...                                               link
17495  31933  ...  https://myanimelist.net/anime/31933/JoJo_no_Ki...
723    31933  ...  https://myanimelist.net/anime/31933/JoJo_no_Ki...
858      154  ...      https://myanimelist.net/anime/154/Shaman_King
19073    154  ...      https://myanimelist.net/anime/154/Shaman_King
5395     969  ...  https://myanimelist.net/anime/969/Tsubasa_Chro...
14      4181  ...  https://myanimelist.net/anime/4181/Clannad__Af...
3091    4181  ...  https://myanimelist.net/anime/4181/Clannad__Af...
8174   36491  ...  https://myanimelist.net/anime/36491/Doupo_Cang...
16185   2486  ...  https://myanimelist.net/anime/2486/Rumiko_Taka...
853      479  ...  https://myanimelist.net/anime/479/Ueki_no_Housoku

[10 rows x 4 columns]


In [9]:
# Get user vector (1 x num_features)
user_vec = user_train.iloc[0].values.reshape(1, -1)
print(user_vec)

# Compute cosine similarity with all items
similarities = cosine_similarity(user_vec, item_train.values)[0]

# Get indices of top-N most similar items
top_n = 10
top_idx = similarities.argsort()[-top_n:][::-1]

# Show recommended anime
recommended_anime = animes_df.iloc[top_idx][['uid','title','genre','score','link']]
# Print nicely
for i, row in recommended_anime.iterrows():
    print(f"{i+1}. {row['title']} (score: {row['score']}) - {row['link']}")

[[0.99460663 0.88131094 0.4        0.4        0.         0.45
  0.         0.05       0.45       0.         0.25       0.05
  0.05       0.         0.1        0.1        0.05       0.
  0.         0.05       0.05       0.05       0.1        0.25
  0.         0.05       0.15       0.45       0.         0.1
  0.1        0.15       0.05       0.         0.3        0.
  0.3        0.         0.         0.15       0.4        0.
  0.05       0.         0.        ]]
17496. JoJo no Kimyou na Bouken Part 4: Diamond wa Kudakenai (score: 8.6) - https://myanimelist.net/anime/31933/JoJo_no_Kimyou_na_Bouken_Part_4__Diamond_wa_Kudakenai
724. JoJo no Kimyou na Bouken Part 4: Diamond wa Kudakenai (score: 8.6) - https://myanimelist.net/anime/31933/JoJo_no_Kimyou_na_Bouken_Part_4__Diamond_wa_Kudakenai
859. Shaman King (score: 7.82) - https://myanimelist.net/anime/154/Shaman_King
19074. Shaman King (score: 7.82) - https://myanimelist.net/anime/154/Shaman_King
5396. Tsubasa Chronicle 2nd Season (score: 7.6

In [12]:
# Filter only DesolatePsycheâ€™s reviews
user_name = "DesolatePsyche"
user_ratings = ratings_df[ratings_df["profile"] == user_name]
for rating in user_ratings.itertuples():
    anime_id = rating.anime_uid
    score = rating.score
    print(f"Anime ID: {anime_id}, Rating: {score}")

Anime ID: 34096, Rating: 8
Anime ID: 12403, Rating: 6
Anime ID: 35073, Rating: 6
Anime ID: 23225, Rating: 7
Anime ID: 2035, Rating: 8
Anime ID: 24921, Rating: 9
Anime ID: 30279, Rating: 7
Anime ID: 77, Rating: 7
Anime ID: 31173, Rating: 9
Anime ID: 10153, Rating: 7
Anime ID: 31240, Rating: 9
Anime ID: 31812, Rating: 8
Anime ID: 877, Rating: 7
Anime ID: 12365, Rating: 9
Anime ID: 19, Rating: 8
Anime ID: 21939, Rating: 9
Anime ID: 37491, Rating: 10
Anime ID: 32281, Rating: 9
Anime ID: 33352, Rating: 10
Anime ID: 33674, Rating: 10
Anime ID: 205, Rating: 8
Anime ID: 32937, Rating: 8
Anime ID: 31964, Rating: 7
Anime ID: 35062, Rating: 9
Anime ID: 30015, Rating: 9
Anime ID: 32615, Rating: 8
Anime ID: 967, Rating: 7
Anime ID: 34599, Rating: 8
Anime ID: 1520, Rating: 6
Anime ID: 31765, Rating: 9
Anime ID: 35466, Rating: 9
Anime ID: 5074, Rating: 7
Anime ID: 345, Rating: 8
Anime ID: 18245, Rating: 9
Anime ID: 1699, Rating: 6
Anime ID: 477, Rating: 7
Anime ID: 319, Rating: 7
Anime ID: 34096, Rat

In [17]:
# Get user vector (1 x num_features)
user_vec = user_train.iloc[0].values.reshape(1, -1)

# Compute cosine similarity with all items
similarities = cosine_similarity(user_vec, item_train.values)[0]

# Store predictions in a DataFrame
predicted_df = animes_df.copy()
predicted_df["predicted_score"] = similarities

# Get top-N recommended
top_n = 10
top_recommendations = predicted_df.sort_values("predicted_score", ascending=False).head(top_n)

user_name = "DesolatePsyche"
user_ratings = ratings_df[ratings_df["profile"] == user_name][["anime_uid", "score"]]

# Merge user ratings with predictions
comparison_df = user_ratings.merge(predicted_df, left_on="anime_uid", right_on="uid", how="inner")

# Rename columns for clarity
comparison_df = comparison_df.rename(columns={
    "score_x": "actual_score",   # user's given score
    "score_y": "anime_avg_score" # anime's general MAL score
})

# Keep only relevant columns
comparison_df = comparison_df[["title", "actual_score", "predicted_score", "anime_avg_score"]]

# Show the top 15 predictions sorted by predicted score
print(comparison_df.sort_values("predicted_score", ascending=False).head(20))



                         title  actual_score  predicted_score  anime_avg_score
27         Bakuman. 3rd Season             9         0.787132             8.61
26         Bakuman. 3rd Season             9         0.787132             8.61
173        Bakuman. 3rd Season             9         0.787132             8.61
172        Bakuman. 3rd Season             9         0.787132             8.61
171            Sakura Tsuushin             5         0.762602             6.26
149               Touka Gettan             7         0.757531             6.62
133       Imouto sae Ireba Ii.             5         0.750731             7.48
167         Sousei no Onmyouji             7         0.747441             7.40
114  Musaigen no Phantom World             7         0.742626             6.96
199                   Momokuri             7         0.738546             7.19
103          Kimi no Iru Machi             7         0.737707             7.07
112              Black Jack 21             6        