In [29]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [30]:
animes_df = pd.read_csv("data/animes.csv")
ratings_df = pd.read_csv("data/reviews.csv")
profiles_df = pd.read_csv("data/profiles.csv")

In [31]:
profiles_df = profiles_df.drop_duplicates(subset="profile", keep="first").reset_index(drop=True)
ratings_df = ratings_df.drop_duplicates().reset_index(drop=True)
animes_df = animes_df.drop_duplicates(subset="uid", keep="first").reset_index(drop=True)

In [32]:
# Extract start year from aired
animes_df["year"] = animes_df["aired"].str.extract(r'(\d{4})').astype(float)

# Convert stringified lists to actual lists
import ast
animes_df['genre'] = animes_df['genre'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
# One-hot encode genres
genre_dummies = animes_df["genre"].str.join('|').str.get_dummies()

# Select numeric features
num_features = animes_df[["year", "score"]].fillna(0)

# Normalize numeric features
scaler = MinMaxScaler()
num_features = pd.DataFrame(
    scaler.fit_transform(num_features),
    columns=num_features.columns,
    index=animes_df.index
)

# Combine everything
item_train = pd.concat([num_features, genre_dummies], axis=1)
print(item_train.head())

       year     score  Action  Adventure  Cars  Comedy  Dementia  Demons  \
0  0.997031  0.955580       0          0     0       1         0       0   
1  0.996536  0.956663       0          0     0       0         0       0   
2  0.998021  0.956663       0          1     0       0         0       0   
3  0.994062  1.000000       1          1     0       1         0       0   
4  0.998021  0.956663       1          0     0       0         0       0   

   Drama  Ecchi  ...  Shounen Ai  Slice of Life  Space  Sports  Super Power  \
0      1      0  ...           0              0      0       1            0   
1      1      0  ...           0              0      0       0            0   
2      1      0  ...           0              0      0       0            0   
3      1      0  ...           0              0      0       0            0   
4      0      0  ...           0              0      0       0            0   

   Supernatural  Thriller  Vampire  Yaoi  Yuri  
0             0    

In [33]:
## --- User features ---
import ast

# Convert stringified lists to real lists of ints
def parse_favorites(x):
    try:
        return [int(a) for a in ast.literal_eval(x)]
    except:
        return []  # fallback for malformed entries

profiles_df["favorites_anime"] = profiles_df["favorites_anime"].apply(parse_favorites)
# Step 2: Build user_train
user_features = []
# Create a mapping from anime UID to row index in item_train
anime_id_to_idx = {uid: idx for idx, uid in enumerate(animes_df["uid"])}

for _, row in profiles_df.iterrows():
    favs = row["favorites_anime"]
    
    # Get indices of these anime in item_train
    indices = [anime_id_to_idx[uid] for uid in favs if uid in anime_id_to_idx]
    
    if indices:
        # Average features of the anime to get user preference vector
        user_vec = item_train.iloc[indices].mean(axis=0)
    else:
        # If no valid favorites, use zero vector
        user_vec = np.zeros(item_train.shape[1])
    
    user_features.append(user_vec)

# Create user_train DataFrame
user_train = pd.DataFrame(user_features, index=profiles_df["profile"])
print(user_train.iloc[0].to_dict())

{'year': 0.9946066303809996, 'score': 0.8813109425785483, 'Action': 0.4, 'Adventure': 0.4, 'Cars': 0.0, 'Comedy': 0.45, 'Dementia': 0.0, 'Demons': 0.05, 'Drama': 0.45, 'Ecchi': 0.0, 'Fantasy': 0.25, 'Game': 0.05, 'Harem': 0.05, 'Hentai': 0.0, 'Historical': 0.1, 'Horror': 0.1, 'Josei': 0.05, 'Kids': 0.0, 'Magic': 0.0, 'Martial Arts': 0.05, 'Mecha': 0.05, 'Military': 0.05, 'Music': 0.1, 'Mystery': 0.25, 'Parody': 0.0, 'Police': 0.05, 'Psychological': 0.15, 'Romance': 0.45, 'Samurai': 0.0, 'School': 0.1, 'Sci-Fi': 0.1, 'Seinen': 0.15, 'Shoujo': 0.05, 'Shoujo Ai': 0.0, 'Shounen': 0.3, 'Shounen Ai': 0.0, 'Slice of Life': 0.3, 'Space': 0.0, 'Sports': 0.0, 'Super Power': 0.15, 'Supernatural': 0.4, 'Thriller': 0.0, 'Vampire': 0.05, 'Yaoi': 0.0, 'Yuri': 0.0}


In [35]:
user_train = user_train.fillna(0)
print(item_train.isna().sum().sum())   # total NaNs in item_train
print(user_train.isna().sum().sum())   # total NaNs in user_train

0
0


In [36]:
anime_to_idx = {uid: i for i, uid in enumerate(animes_df["uid"])}
user_to_idx  = {name: i for i, name in enumerate(profiles_df["profile"])}


In [43]:
# Create new columns for index lookups
ratings_df["user_idx"] = ratings_df["profile"].map(user_to_idx)
ratings_df["anime_idx"] = ratings_df["anime_uid"].map(anime_to_idx)


In [44]:
# Drop any rows where a mapping failed
ratings_df = ratings_df.dropna(subset=["user_idx", "anime_idx", "score"]).reset_index(drop=True)

# Convert to integer indices
ratings_df["user_idx"] = ratings_df["user_idx"].astype(int)
ratings_df["anime_idx"] = ratings_df["anime_idx"].astype(int)

In [49]:
user_matrix = user_train.to_numpy()
item_matrix = item_train.to_numpy()

u_train = user_matrix[ratings_df["user_idx"].values]
i_train = item_matrix[ratings_df["anime_idx"].values]

# Extract target scores
y_train = ratings_df["score"].values

In [50]:
print("Shapes:")
print("u_train:", u_train.shape)
print("i_train:", i_train.shape)
print("y_train:", y_train.shape)


Shapes:
u_train: (130519, 45)
i_train: (130519, 45)
y_train: (130519,)


In [51]:
# pick a random index
idx = np.random.randint(0, len(y_train))

print("Review index:", idx)
print("Profile:", ratings_df.iloc[idx]["profile"])
print("Anime UID:", ratings_df.iloc[idx]["anime_uid"])
print("Score (y_train):", y_train[idx])

# Compare with original matrices
user_idx = ratings_df.iloc[idx]["user_idx"]
anime_idx = ratings_df.iloc[idx]["anime_idx"]

print("u_train[idx] equals user_matrix[user_idx]?:", np.allclose(u_train[idx], user_matrix[user_idx]))
print("i_train[idx] equals item_matrix[anime_idx]?:", np.allclose(i_train[idx], item_matrix[anime_idx]))

Review index: 125310
Profile: TakamakiJoker
Anime UID: 94
Score (y_train): 3
u_train[idx] equals user_matrix[user_idx]?: True
i_train[idx] equals item_matrix[anime_idx]?: True


In [52]:
for i in range(5):
    print(f"Review {i}: profile={ratings_df.iloc[i]['profile']}, anime_uid={ratings_df.iloc[i]['anime_uid']}, score={y_train[i]}")


Review 0: profile=DesolatePsyche, anime_uid=34096, score=8
Review 1: profile=baekbeans, anime_uid=34599, score=10
Review 2: profile=skrn, anime_uid=28891, score=7
Review 3: profile=edgewalker00, anime_uid=2904, score=9
Review 4: profile=aManOfCulture99, anime_uid=4181, score=10


In [55]:
from sklearn.model_selection import train_test_split

# Split 80% train, 20% test
(u_train_train, u_train_test,
 i_train_train, i_train_test,
 y_train_train, y_train_test) = train_test_split(
    u_train, i_train, y_train,
    test_size=0.2,   # 20% test
    random_state=42, # for reproducibility
    shuffle=True     # shuffle before splitting
)


In [56]:
from keras import layers, Model, Input
import tensorflow as tf

user_dim = u_train.shape[1]
item_dim = i_train.shape[1]
embedding_dim = 32  # size of the learned embedding

# --- User tower ---
user_input = Input(shape=(user_dim,), name="user_input")
user_emb = layers.Dense(64, activation="relu")(user_input)
user_emb = layers.Dense(embedding_dim, activation="relu")(user_emb)

# --- Item tower ---
item_input = Input(shape=(item_dim,), name="item_input")
item_emb = layers.Dense(64, activation="relu")(item_input)
item_emb = layers.Dense(embedding_dim, activation="relu")(item_emb)

# --- Interaction ---
# dot product between embeddings to get a single score
dot = layers.Dot(axes=1)([user_emb, item_emb])

# optional: scale output to [0, 10] since y_train ranges 0–10
output = layers.Activation("linear")(dot)

# --- Build model ---
model = Model(inputs=[user_input, item_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

model.summary()


In [57]:
model.fit(
    [u_train_train, i_train_train],  # train inputs
    y_train_train,                   # train labels
    batch_size=512,
    epochs=10,
    shuffle=True,
    validation_data=([u_train_test, i_train_test], y_train_test)  # test/validation set
)



Epoch 1/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 11.1295 - mae: 2.5279 - val_loss: 4.7703 - val_mae: 1.7430
Epoch 2/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4.5312 - mae: 1.6849 - val_loss: 4.4939 - val_mae: 1.6962
Epoch 3/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4.3293 - mae: 1.6376 - val_loss: 4.3413 - val_mae: 1.6365
Epoch 4/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4.2183 - mae: 1.6092 - val_loss: 4.2840 - val_mae: 1.6014
Epoch 5/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4.1473 - mae: 1.5898 - val_loss: 4.2292 - val_mae: 1.6052
Epoch 6/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4.0881 - mae: 1.5734 - val_loss: 4.2046 - val_mae: 1.6005
Epoch 7/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step -

<keras.src.callbacks.history.History at 0x775a1365faa0>

In [58]:
# pick the first 10 examples from the validation set
num_samples = 10
u_sample = u_train_test[:num_samples]
i_sample = i_train_test[:num_samples]
y_actual = y_train_test[:num_samples]

# get predictions
y_pred = model.predict([u_sample, i_sample]).flatten()

df_compare = pd.DataFrame({
    "Actual Score": y_actual,
    "Predicted Score": y_pred
})

print(df_compare)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
   Actual Score  Predicted Score
0             9         6.119696
1             8         5.805788
2             9         7.343047
3             4         5.925821
4             6         5.369051
5             3         4.991215
6            10         8.527205
7             7         6.908673
8             8         7.736044
9             5         4.971188
