In [1]:
#Libraries
%matplotlib inline
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# # Preprocess original dataset

df = pd.read_csv('data/final_animedataset.csv')
print(f"Shape: {df.shape}")
df.head()

Shape: (35305695, 13)


Unnamed: 0,username,anime_id,my_score,user_id,gender,title,type,source,score,scored_by,rank,popularity,genre
0,karthiga,21,9,2255153,Female,One Piece,TV,Manga,8.54,423868,91.0,35,"Action, Adventure, Comedy, Super Power, Drama,..."
1,karthiga,59,7,2255153,Female,Chobits,TV,Manga,7.53,175388,1546.0,188,"Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen"
2,karthiga,74,7,2255153,Female,Gakuen Alice,TV,Manga,7.77,33244,941.0,1291,"Comedy, School, Shoujo, Super Power"
3,karthiga,120,7,2255153,Female,Fruits Basket,TV,Manga,7.77,167968,939.0,222,"Slice of Life, Comedy, Drama, Romance, Fantasy..."
4,karthiga,178,7,2255153,Female,Ultra Maniac,TV,Manga,7.26,9663,2594.0,2490,"Magic, Comedy, Romance, School, Shoujo"


In [3]:
# # Drop rows with empty username
df.dropna(subset = ['username'], inplace=True)
df.dropna(subset = ['rank'], inplace=True)
df.dropna(subset = ['genre'], inplace=True)

# Check for duplicate rows/null values
null = df.isnull().sum()
dupe = df.duplicated().sum()

print(f"Number of Null Values: {null} \n Number of Duplicate Rows: {dupe}")

Number of Null Values: username      0
anime_id      0
my_score      0
user_id       0
gender        0
title         0
type          0
source        0
score         0
scored_by     0
rank          0
popularity    0
genre         0
dtype: int64 
 Number of Duplicate Rows: 0


In [4]:
df = df[df["my_score"] != 0]
df = df[~df['genre'].str.contains('Hentai', na=False)]
unique_user = df['username'].nunique()
unique_anime = df['anime_id'].nunique()
num_ratings = len(df)
print(f"Total Number of Unique Users: {unique_user}")
print(f"Total Number of Unique Anime Titles: {unique_anime}")
print(f"Total Number of User Ratings: {num_ratings}")


Total Number of Unique Users: 113143
Total Number of Unique Anime Titles: 7633
Total Number of User Ratings: 21124780


In [5]:
# # What is the distribution of user ratings (my_score) across all anime?

# sns.histplot(data=df, x='my_score', bins=10)
# plt.title('Distribution of User Ratings')
# plt.xlabel('User Rating (my_score)')
# plt.ylabel('Count')
# plt.show()

In [6]:
# # Average User Scores Across Different Anime Genres
# # Explode the Genre (Convert the list of genres to its own row)
# # Keep this uncommented if you dont want your computer to explode
# df_reduced = df[['genre', 'my_score']].copy()

# # Split and explode 'genre'
# df_reduced['genre'] = df_reduced['genre'].str.split(', ')
# df_exploded = df_reduced.explode('genre')

# # Group by 'genre' and calculate the mean
# genre_avg_scores = df_exploded.groupby('genre')['my_score'].mean().reset_index()

# # Plot the results
# plt.figure(figsize=(12, 6))
# sns.barplot(data=genre_avg_scores, x='genre', y='my_score')
# plt.title('Average User Score by Genre')
# plt.xlabel('Genre')
# plt.ylabel('Average User Score')
# plt.xticks(rotation=90)
# plt.tight_layout()
# plt.show()

In [7]:
# # Do popular animes have higher scores (Bad Anime, Popular)

# sns.scatterplot(data=df, x='popularity', y='score')
# plt.title('Anime Score vs. Popularity')
# plt.xlabel('Popularity Rank')
# plt.ylabel('Anime Score')
# plt.gca().invert_xaxis()  # Invert x-axis if lower rank means higher popularity
# plt.show()

In [8]:
# # Do highly rated animes have more users rate them?

# sns.scatterplot(data=df, x='rank', y='scored_by')
# plt.title('Number of Users Scoring vs. Anime Rank')
# plt.xlabel('Anime Rank')
# plt.ylabel('Number of Users Who Scored')
# plt.gca().invert_xaxis()  # Invert x-axis if lower rank is better
# plt.show()

In [9]:
# Get descriptive statistics
# print(df['my_score'].describe())

In [10]:
# # Reduce memory usage by selecting only the 'genre' column
# df_genre = df[['genre']].copy()

# # Split and explode the 'genre' column
# df_genre['genre'] = df_genre['genre'].str.split(', ')
# df_exploded = df_genre.explode('genre')

# # Count the genres
# genre_counts = df_exploded['genre'].value_counts().reset_index()
# genre_counts.columns = ['genre', 'count']

# # Optimize data types
# genre_counts['count'] = genre_counts['count'].astype('int32')

# # Sort genres by count
# genre_counts = genre_counts.sort_values(by='count', ascending=False)

# # Select a bright color palette
# num_genres = genre_counts['genre'].nunique()
# palette_colors = sns.color_palette('bright', n_colors=num_genres)

# # Map colors to genres
# genre_color_mapping = dict(zip(genre_counts['genre'], palette_colors))

# # Plot the genre popularity with bright colors
# plt.figure(figsize=(12, 6))
# sns.barplot(
#     data=genre_counts,
#     x='genre',
#     y='count',
#     hue='genre',       # Assign hue to 'genre' to map colors correctly
#     dodge=False,       # Ensure bars are not dodged
#     palette=genre_color_mapping,
#     legend=False       # Hide legend if not needed
# )
# plt.title('Popularity of Genres')
# plt.xlabel('Genre')
# plt.ylabel('Count')
# plt.xticks(rotation=90)
# plt.tight_layout()
# plt.show()


In [11]:
# # Select necessary columns
# df_genre = df[['genre', 'my_score', 'gender']].copy()

# # Filter out rows with missing or zero 'my_score' values
# df_genre = df_genre[df_genre['my_score'] > 0]

# # Optimize data types to reduce memory usage
# df_genre['my_score'] = df_genre['my_score'].astype('float32')
# df_genre['gender'] = df_genre['gender'].astype('category')

# # Split the 'genre' column
# df_genre['genre'] = df_genre['genre'].str.split(', ')

# # Explode the 'genre' column
# df_exploded = df_genre.explode('genre')

# # Plot box plots
# plt.figure(figsize=(24, 8))  # Increase figure size
# sns.boxplot(
#     data=df_exploded,
#     x='genre',
#     y='my_score',
#     hue='gender',
#     width=0.8,       # Adjust the width of the boxes
#     fliersize=2      # Adjust the size of outlier markers
# )
# plt.title('User Ratings by Genres and Gender', fontsize=16)
# plt.xlabel('Genre', fontsize=14)
# plt.ylabel('User Rating (my_score)', fontsize=14)
# plt.xticks(rotation=90, fontsize=12)  # Rotate labels, align right, adjust font size
# plt.yticks(fontsize=12)
# plt.legend(title='Gender', fontsize=12, title_fontsize=14)
# plt.tight_layout()
# plt.show()

In [12]:
df = pd.read_csv('data/anime_dataset_encoded.csv')

In [5]:
df.head()

Unnamed: 0,username,anime_id,my_score,user_id,gender,title,type,source,score,scored_by,rank,popularity,genre
0,karthiga,21,9,2255153,Female,One Piece,TV,Manga,8.54,423868,91.0,35,"Action, Adventure, Comedy, Super Power, Drama,..."
1,karthiga,59,7,2255153,Female,Chobits,TV,Manga,7.53,175388,1546.0,188,"Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen"
2,karthiga,74,7,2255153,Female,Gakuen Alice,TV,Manga,7.77,33244,941.0,1291,"Comedy, School, Shoujo, Super Power"
3,karthiga,120,7,2255153,Female,Fruits Basket,TV,Manga,7.77,167968,939.0,222,"Slice of Life, Comedy, Drama, Romance, Fantasy..."
4,karthiga,178,7,2255153,Female,Ultra Maniac,TV,Manga,7.26,9663,2594.0,2490,"Magic, Comedy, Romance, School, Shoujo"


In [6]:
# Find all unique strings in the 'genre' column
unique_genres = set()
df['genre'].str.split(', ').apply(unique_genres.update)

unique_genres


# {'Action',
#  'Adventure',
#  'Cars',
#  'Comedy',
#  'Dementia',
#  'Demons',
#  'Drama',
#  'Ecchi',
#  'Fantasy',
#  'Game',
#  'Harem',
#  'Historical',
#  'Horror',
#  'Josei',
#  'Kids',
#  'Magic',
#  'Martial Arts',
#  'Mecha',
#  'Military',
#  'Music',
#  'Mystery',
#  'Parody',
#  'Police',
#  'Psychological',
#  'Romance',
#  'Samurai',
#  'School',
#  'Sci-Fi',
#  'Seinen',
#  'Shoujo',
#  'Shoujo Ai',
#  'Shounen',
#  'Shounen Ai',
#  'Slice of Life',
#  'Space',
#  'Sports',
#  'Super Power',
#  'Supernatural',
#  'Thriller',
#  'Vampire'}


# {'Action',
#  'Adventure',
#  'Cars',
#  'Comedy',
#  'Dementia',
#  'Demons',
#  'Drama',
#  'Ecchi',
#  'Fantasy',
#  'Game',
#  'Harem',
#  'Historical',
#  'Horror',
#  'Josei',
#  'Kids',
#  'Magic',
#  'Martial Arts',
#  'Mecha',
#  'Military',
#  'Music',
#  'Mystery',
#  'Parody',
#  'Police',
#  'Psychological',
#  'Romance',
#  'Samurai',
#  'School',
#  'Sci-Fi',
#  'Seinen',
#  'Shoujo',
#  'Shoujo Ai',
#  'Shounen',
#  'Shounen Ai',
#  'Slice of Life',
#  'Space',
#  'Sports',
#  'Super Power',
#  'Supernatural',
#  'Thriller',
#  'Vampire'}

{'Action',
 'Adventure',
 'Cars',
 'Comedy',
 'Dementia',
 'Demons',
 'Drama',
 'Ecchi',
 'Fantasy',
 'Game',
 'Harem',
 'Historical',
 'Horror',
 'Josei',
 'Kids',
 'Magic',
 'Martial Arts',
 'Mecha',
 'Military',
 'Music',
 'Mystery',
 'Parody',
 'Police',
 'Psychological',
 'Romance',
 'Samurai',
 'School',
 'Sci-Fi',
 'Seinen',
 'Shoujo',
 'Shoujo Ai',
 'Shounen',
 'Shounen Ai',
 'Slice of Life',
 'Space',
 'Sports',
 'Super Power',
 'Supernatural',
 'Thriller',
 'Vampire'}

In [7]:
# Create columns for each genre with 1 or 0
for genre in unique_genres:
    df[genre] = df['genre'].apply(lambda x: 1 if genre in x else 0)

gender_values = ['Male', 'Female', 'Non-Binary']
for gender in gender_values:
    df[f'gender_{gender}'] = df['gender'].apply(lambda x: 1 if x == gender else 0)

df.drop('scored_by', axis=1, inplace = True)
df.drop('source', axis=1, inplace = True)
df.drop('rank', axis=1, inplace = True)
df_unique = df.drop_duplicates(subset='anime_id')

anime_dict = dict(zip(df_unique['anime_id'], df_unique['title']))

anime_dict

df_unique_user = df.drop_duplicates(subset='username')

user_dict = dict(zip(df_unique_user['user_id'], df_unique_user['username']))

user_dict

{2255153: 'karthiga',
 1897606: 'RedvelvetDaisuki',
 37326: 'Damonashu',
 228342: 'bskai',
 61677: 'Slimak',
 2485327: 'MistButterfly',
 144049: 'kioniel',
 1: 'Xinil',
 18867: 'ihasabucket',
 340873: 'xTheFallenx',
 14658: 'L-LawlietDN',
 2637159: 'Lithuelle',
 19539: 'scootarski',
 82964: 'Akihara',
 1933206: 'Tomoki-sama',
 158248: 'Sakurei',
 183036: 'Perfection_Freak',
 167812: 'LyannaStark',
 4511507: 'magedgamed',
 4420327: 'ShinyShinigami',
 5285183: 'SenpajBiju',
 5158638: 'WhatsUpWitches',
 317746: 'Wizzaroo',
 109554: 'Seishuku',
 54286: 'PrinceRen',
 6658717: 'ChicoDj_',
 4437072: 'brunomp',
 1425283: 'Mukkashi',
 21970: 'Rukawa',
 1397853: 'Best-of-Anime',
 16514: 'jukugo',
 406417: 'AgehaChan',
 6923415: '-Kenjiro',
 43988: 'Froschfan',
 1084625: 'Tsundora',
 2171033: 'FlashFrozen',
 945191: 'Tujumase',
 6188814: 'The_Lordian',
 1466623: 'VoliMedjed',
 149817: 'namiSWN',
 4543429: 'Jiraiyan',
 83582: 'IoriYagami',
 14013: 'Sirdante',
 5341448: 'lolbridget',
 82645: 'Tofs'

In [None]:
# df.to_csv('anime_dataset_encoded.csv', index=False)

In [8]:
df.drop('title', axis=1, inplace = True)
df.drop('type', axis=1, inplace = True)
df.drop('genre', axis=1, inplace = True)
df.drop('gender', axis=1, inplace = True)
df.drop('username', axis=1, inplace = True)

In [9]:
df.to_csv('anime_dataset_training.csv', index=False)

In [None]:
pd.set_option('display.max_columns', None) 
df.head()

Unnamed: 0,anime_id,my_score,user_id,score,popularity,Dementia,Vampire,Shoujo,Romance,Music,Game,Comedy,Ecchi,Josei,Sci-Fi,Shounen Ai,School,Sports,Shounen,Harem,Samurai,Kids,Fantasy,Cars,Magic,Mystery,Historical,Super Power,Police,Slice of Life,Action,Parody,Seinen,Space,Shoujo Ai,Horror,Mecha,Supernatural,Demons,Military,Drama,Adventure,Thriller,Martial Arts,Psychological,gender_Male,gender_Female,gender_Non-Binary
0,21,9,2255153,8.54,35,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0
1,59,7,2255153,7.53,188,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
2,74,7,2255153,7.77,1291,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,120,7,2255153,7.77,222,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
4,178,7,2255153,7.26,2490,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [10]:
unique_user = df['user_id'].nunique()
unique_anime = df['anime_id'].nunique()
num_ratings = len(df)
print(f"Total Number of Unique Users: {unique_user}")
print(f"Total Number of Unique Anime Titles: {unique_anime}")
print(f"Total Number of User Ratings: {num_ratings}")

Total Number of Unique Users: 113143
Total Number of Unique Anime Titles: 7633
Total Number of User Ratings: 21124780


In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Feature Selection
# Include new gender columns, score, rank, popularity, and genres
gender_columns = ['gender_Male', 'gender_Female', 'gender_Non-Binary']
genre_columns = list(unique_genres)  # Use the set of unique genres
features = gender_columns + ['score', 'popularity', 'anime_id', 'user_id'] + genre_columns
target = 'my_score'

X = df[features]
y = df[target]

# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate the Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

# Recommender function
def recommend_anime(user_id, num_recommendations=5):
    # Get the user's gender columns
    user_data = df[df['user_id'] == user_id]
    if user_data.empty:
        print(f"No data found for user_id: {user_id}")
        return pd.DataFrame()
    
    # Extract gender column values
    user_gender = user_data[gender_columns].iloc[0]
    
    # Get all anime rated by user
    rated_anime = user_data['anime_id'].unique()
    
    # Get all unique anime not yet rated by user
    unrated_anime = df[~df['anime_id'].isin(rated_anime)].drop_duplicates(subset=['anime_id'])
    
    # Check if there are any unrated anime
    if unrated_anime.empty:
        print("No unrated anime available for recommendation.")
        return pd.DataFrame()
    
    # Set gender values for predictions
    X_unrated = unrated_anime[features].copy()
    for col in gender_columns:
        X_unrated[col] = user_gender[col]
    
    print(X_unrated)
    # Predict scores
    predicted_scores = model.predict(X_unrated)
    
    # Add predictions to the dataframe
    unrated_anime = unrated_anime.copy()
    unrated_anime['predicted_score'] = predicted_scores
    
    # Map anime_id to titles using the anime_dict
    unrated_anime['title'] = unrated_anime['anime_id'].map(anime_dict)
    
    # Sort by predicted_score in descending order and select top N
    recommendations = unrated_anime.sort_values(
        by='predicted_score', ascending=False
    ).head(num_recommendations)
    
    return recommendations[['title', 'predicted_score']]

# Example recommendation
user_id_example = 2255153
recommendations = recommend_anime(user_id_example)
print("Recommended Anime:")
print(recommendations)


Mean Squared Error: 2.43
          gender_Male  gender_Female  gender_Non-Binary  score  popularity  \
80                  0              1                  0   8.81          38   
82                  0              1                  0   8.14         500   
83                  0              1                  0   8.69         152   
84                  0              1                  0   7.88          10   
87                  0              1                  0   7.78         661   
...               ...            ...                ...    ...         ...   
2732391             0              1                  0   6.68        7262   
3045235             0              1                  0   6.77        8327   
3178736             0              1                  0   7.41        8479   
4316838             0              1                  0   7.46        8673   
16725137            0              1                  0   7.27        9237   

          anime_id  user_id  Slice of 

In [28]:
print(df.head().to_string())

   anime_id  my_score  user_id  score  popularity  Horror  Kids  Police  Historical  Magic  Martial Arts  Super Power  Drama  Sci-Fi  Sports  Psychological  Space  Music  Mecha  Seinen  Thriller  Demons  Vampire  Shoujo Ai  Comedy  Ecchi  Shounen Ai  Cars  Shoujo  Harem  Supernatural  Shounen  Game  Action  Parody  Military  Romance  Fantasy  Adventure  Samurai  Dementia  School  Josei  Mystery  Slice of Life  gender_Male  gender_Female  gender_Non-Binary
0        21         9  2255153   8.54          35       0     0       0           0      0             0            1      1       0       0              0      0      0      0       0         0       0        0          0       1      0           0     0       0      0             0        1     0       1       0         0        0        1          1        0         0       0      0        0              0            0              1                  0
1        59         7  2255153   7.53         188       0     0       0         

In [1]:

import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, GridSearchCV, KFold

df = pd.read_csv('data/anime_dataset_training.csv')

# # 2. Data Preparation
df = df.dropna(subset=['user_id', 'anime_id', 'score'])

# 3. Define the Rating Scale
reader = Reader(rating_scale=(1, 10))

# 4. Load Data into Surprise
data = Dataset.load_from_df(df[['user_id', 'anime_id', 'my_score']], reader)

In [None]:
# param_grid = {
#     "n_factors": [10, 50, 100],
#     "n_epochs": [10, 20, 30],
#     "lr_all": [0.005, 0.01, 0.1],
#     "reg_all": [0.02, 0.1, 0.5, 1],
# }

param_grid = {
    "n_factors": [100],
    "n_epochs": [10, 20],
    "lr_all": [0.005],
    "reg_all": [0.02],
}

grid_search = GridSearchCV(SVD, param_grid, measures=["mse"], cv=5)
grid_search.fit(data)

best_params = grid_search.best_params["mse"]
best_score = grid_search.best_score["mse"]
print(f"Best MSE: {best_score}")
print(f"Best Parameters: {best_params}")

best_model = grid_search.best_estimator["mse"]

In [2]:
# 5. Train-Test Split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# 6. Initialize and Train the SVD Model
svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x14b5d8e10>

In [18]:
# 7. Evaluate the Model
predictions = svd.test(testset)
mse = accuracy.mse(predictions)
mae = accuracy.mae(predictions)
print(f"MSE: {mse}")
print(f"MAE: {mae}")

# 8. Function to Get Top-N Recommendations
def get_top_n_recommendations(svd_model, user_id, df, n=10):
    all_anime_ids = df['anime_id'].unique()
    rated_anime_ids = df[df['user_id'] == user_id]['anime_id'].tolist()
    unrated_anime_ids = [anime for anime in all_anime_ids if anime not in rated_anime_ids]
    predictions = [svd_model.predict(user_id, anime_id) for anime_id in unrated_anime_ids]
    predictions.sort(key=lambda x: x.est, reverse=True)
    top_n = predictions[:n]
    recommendations = [(pred.iid, pred.est) for pred in top_n]
    return recommendations


MSE: 1.3960
MAE:  0.8699
MSE: 1.3959918370522095
MAE: 0.8698947386038589

Top 5 recommendations for user 2255153:
Anime ID: 32281, Predicted Rating: 9.21
Anime ID: 5114, Predicted Rating: 9.12
Anime ID: 9253, Predicted Rating: 9.09
Anime ID: 11061, Predicted Rating: 8.97
Anime ID: 28851, Predicted Rating: 8.97


In [26]:
# 9. Generate Recommendations for a User
user_id = '2255153'
top_n = 5
recommendations = get_top_n_recommendations(svd, user_id, df, n=top_n)

print(f"\nTop {top_n} recommendations for user {user_dict[int(user_id)]} (id: {user_id}):")
for anime, rating in recommendations:
    print(f"Anime: {anime_dict[int(anime)]}, Predicted Rating: {rating:.2f}")


Top 5 recommendations for user karthiga (id: 2255153):
Anime: Kimi no Na wa., Predicted Rating: 9.21
Anime: Fullmetal Alchemist: Brotherhood, Predicted Rating: 9.12
Anime: Steins;Gate, Predicted Rating: 9.09
Anime: Hunter x Hunter (2011), Predicted Rating: 8.97
Anime: Koe no Katachi, Predicted Rating: 8.97


In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load your dataset
df = pd.read_csv('anime_tv_dataset_training.csv')

# Define feature columns
genres = ['Horror', 'Kids', 'Police', 'Historical', 'Magic', 
          'Martial Arts', 'Super Power', 'Drama', 'Sci-Fi', 
          'Sports', 'Psychological', 'Space', 'Music', 'Mecha', 
          'Seinen', 'Thriller', 'Demons', 'Vampire', 'Shoujo Ai', 
          'Comedy', 'Ecchi', 'Shounen Ai', 'Cars', 'Shoujo', 
          'Harem', 'Supernatural', 'Shounen', 'Game', 'Action', 
          'Parody', 'Military', 'Romance', 'Fantasy', 'Adventure', 
          'Samurai', 'Dementia', 'School', 'Josei', 'Mystery', 
          'Slice of Life']

# Handle missing values
numerical_features = ['score', 'popularity']
for feature in numerical_features:
    df[feature].fillna(df[feature].median(), inplace=True)

categorical_features = ['anime_id', 'user_id'] + genres
for feature in categorical_features:
    df[feature].fillna(df[feature].mode()[0], inplace=True)

# Encode 'anime_id' and 'user_id'
anime_encoder = LabelEncoder()
user_encoder = LabelEncoder()

df['anime_id_enc'] = anime_encoder.fit_transform(df['anime_id'])
df['user_id_enc'] = user_encoder.fit_transform(df['user_id'])



feature_columns = ['anime_id_enc', 'user_id_enc', 'score', 'popularity'] + genres + ['gender_Male', 'gender_Female', 'gender_Non-Binary']
X = df[feature_columns]
y = df['my_score']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Define the model
# Input layers
anime_input = Input(shape=(1,), name='anime_input')
anime_embedding = Embedding(input_dim=df['anime_id_enc'].nunique(), output_dim=50, name='anime_embedding')(anime_input)
anime_vec = Flatten()(anime_embedding)

user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=df['user_id_enc'].nunique(), output_dim=50, name='user_embedding')(user_input)
user_vec = Flatten()(user_embedding)

other_features = Input(shape=(len(feature_columns) - 2,), name='other_features')

# Concatenate all features
concat = Concatenate()([anime_vec, user_vec, other_features])

# Dense layers
dense = Dense(128, activation='relu')(concat)
dense = Dropout(0.5)(dense)
dense = Dense(64, activation='relu')(dense)
dense = Dropout(0.3)(dense)
dense = Dense(32, activation='relu')(dense)

# Output layer
output = Dense(1, activation='linear', name='output')(dense)

# Build and compile the model
model = Model(inputs=[anime_input, user_input, other_features], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

# Prepare input data
train_inputs = {
    'anime_input': X_train['anime_id_enc'].values,
    'user_input': X_train['user_id_enc'].values,
    'other_features': X_train.drop(['anime_id_enc', 'user_id_enc'], axis=1).values
}

test_inputs = {
    'anime_input': X_test['anime_id_enc'].values,
    'user_input': X_test['user_id_enc'].values,
    'other_features': X_test.drop(['anime_id_enc', 'user_id_enc'], axis=1).values
}

# Train the model
history = model.fit(
    train_inputs,
    y_train,
    epochs=20,
    batch_size=64,
    validation_data=(test_inputs, y_test),
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    ]
)

# Evaluate the model
loss, mae = model.evaluate(test_inputs, y_test)
print(f"Test MAE: {mae}")

# Make predictions
predictions = model.predict({
    'anime_input': X_test['anime_id_enc'].values[:5],
    'user_input': X_test['user_id_enc'].values[:5],
    'other_features': X_test.drop(['anime_id_enc', 'user_id_enc'], axis=1).values[:5]
})

print("Predicted Ratings:", predictions.flatten())
print("Actual Ratings:", y_test.values[:5])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feature].fillna(df[feature].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feature].fillna(df[feature].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

Epoch 1/50
[1m170762/264060[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m53:07[0m 34ms/step - loss: 1.9965 - mae: 1.0607

KeyboardInterrupt: 

In [1]:
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


PyTorch version: 2.5.1+cu124
CUDA available: True


In [2]:
if torch.cuda.is_available():
    print("Device count:", torch.cuda.device_count())
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("CUDA is not available.")


Device count: 1
Current device: 0
Device name: NVIDIA GeForce GTX 1660 SUPER


In [3]:
if torch.cuda.is_available():
    # Create a tensor and move it to the GPU
    device = torch.device("cuda")  # or "cuda:0"
    tensor = torch.rand(3, 3).to(device)
    print("Tensor on GPU:", tensor)

    # Perform a GPU computation
    result = tensor + tensor
    print("Computation result on GPU:", result)
else:
    print("CUDA is not available, computation will not run on GPU.")


Tensor on GPU: tensor([[0.0804, 0.1408, 0.1979],
        [0.8817, 0.1324, 0.3284],
        [0.5931, 0.5009, 0.7541]], device='cuda:0')
Computation result on GPU: tensor([[0.1608, 0.2815, 0.3957],
        [1.7633, 0.2648, 0.6568],
        [1.1862, 1.0018, 1.5081]], device='cuda:0')


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Check if CUDA is available and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# 1. Data Preprocessing

# Load your dataset
df = pd.read_csv('anime_tv_dataset_training.csv')

# Define feature columns
genres = ['Horror', 'Kids', 'Police', 'Historical', 'Magic', 
          'Martial Arts', 'Super Power', 'Drama', 'Sci-Fi', 
          'Sports', 'Psychological', 'Space', 'Music', 'Mecha', 
          'Seinen', 'Thriller', 'Demons', 'Vampire', 'Shoujo Ai', 
          'Comedy', 'Ecchi', 'Shounen Ai', 'Cars', 'Shoujo', 
          'Harem', 'Supernatural', 'Shounen', 'Game', 'Action', 
          'Parody', 'Military', 'Romance', 'Fantasy', 'Adventure', 
          'Samurai', 'Dementia', 'School', 'Josei', 'Mystery', 
          'Slice of Life']

# Handle missing values
numerical_features = ['score', 'popularity']
for feature in numerical_features:
    df[feature].fillna(df[feature].median(), inplace=True)

categorical_features = ['anime_id', 'user_id'] + genres + ['gender_Male', 'gender_Female', 'gender_Non-Binary']
for feature in categorical_features:
    df[feature].fillna(df[feature].mode()[0], inplace=True)

# Encode 'anime_id' and 'user_id'
anime_encoder = LabelEncoder()
user_encoder = LabelEncoder()

df['anime_id_enc'] = anime_encoder.fit_transform(df['anime_id'])
df['user_id_enc'] = user_encoder.fit_transform(df['user_id'])

# Define feature columns
feature_columns = ['anime_id_enc', 'user_id_enc', 'score', 'popularity'] + genres + ['gender_Male', 'gender_Female', 'gender_Non-Binary']
X = df[feature_columns]
y = df['my_score'].values.astype(np.float32)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale numerical features
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# 2. Create PyTorch Dataset

class AnimeDataset(Dataset):
    def __init__(self, X, y):
        self.anime_ids = torch.tensor(X['anime_id_enc'].values, dtype=torch.long)
        self.user_ids = torch.tensor(X['user_id_enc'].values, dtype=torch.long)
        self.other_features = torch.tensor(X.drop(['anime_id_enc', 'user_id_enc'], axis=1).values, dtype=torch.float32)
        self.targets = torch.tensor(y, dtype=torch.float32).unsqueeze(1)  # Make it (N, 1)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return {
            'anime_input': self.anime_ids[idx],
            'user_input': self.user_ids[idx],
            'other_features': self.other_features[idx],
            'target': self.targets[idx]
        }

# Create datasets
train_dataset = AnimeDataset(X_train, y_train)
test_dataset = AnimeDataset(X_test, y_test)

# Create dataloaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 3. Define the Model

class AnimeRatingModel(nn.Module):
    def __init__(self, 
                 num_anime, 
                 num_users, 
                 other_features_dim, 
                 embedding_dim=50):
        super(AnimeRatingModel, self).__init__()
        # Embedding layers
        self.anime_embedding = nn.Embedding(num_anime, embedding_dim)
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        
        # Fully connected layers
        input_dim = embedding_dim * 2 + other_features_dim
        self.fc1 = nn.Linear(input_dim, 128)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        
        self.relu = nn.ReLU()
        
    def forward(self, anime_input, user_input, other_features):
        anime_vec = self.anime_embedding(anime_input)
        user_vec = self.user_embedding(user_input)
        x = torch.cat([anime_vec, user_vec, other_features], dim=1)
        x = self.relu(self.fc1(x))
        x = self.dropout1(x)
        x = self.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.relu(self.fc3(x))
        out = self.output(x)
        return out

# Initialize the model
num_anime = df['anime_id_enc'].nunique()
num_users = df['user_id_enc'].nunique()
other_features_dim = len(feature_columns) - 2  # Exclude 'anime_id_enc' and 'user_id_enc'

model = AnimeRatingModel(num_anime, num_users, other_features_dim).to(device)
print(model)

# 4. Define Loss and Optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 5. Training Loop with Early Stopping

import time

def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=20, patience=5):
    best_val_loss = np.inf
    patience_counter = 0
    history = {'train_loss': [], 'val_loss': []}
    
    for epoch in range(1, epochs + 1):
        model.train()
        train_losses = []
        for batch in train_loader:
            optimizer.zero_grad()
            anime_input = batch['anime_input'].to(device)
            user_input = batch['user_input'].to(device)
            other_features = batch['other_features'].to(device)
            targets = batch['target'].to(device)
            
            outputs = model(anime_input, user_input, other_features)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        
        avg_train_loss = np.mean(train_losses)
        
        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batch in test_loader:
                anime_input = batch['anime_input'].to(device)
                user_input = batch['user_input'].to(device)
                other_features = batch['other_features'].to(device)
                targets = batch['target'].to(device)
                
                outputs = model(anime_input, user_input, other_features)
                loss = criterion(outputs, targets)
                val_losses.append(loss.item())
        
        avg_val_loss = np.mean(val_losses)
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(avg_val_loss)
        
        print(f'Epoch {epoch}/{epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}')
        
        # Check for improvement
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break
    
    # Load the best model
    model.load_state_dict(torch.load('best_model.pth'))
    return history

# Train the model
start_time = time.time()
history = train_model(model, train_loader, test_loader, criterion, optimizer, epochs=20, patience=5)
end_time = time.time()
print(f"Training completed in {end_time - start_time:.2f} seconds.")

# 6. Evaluate the Model

def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    total_mae = 0
    total_samples = 0
    with torch.no_grad():
        for batch in test_loader:
            anime_input = batch['anime_input'].to(device)
            user_input = batch['user_input'].to(device)
            other_features = batch['other_features'].to(device)
            targets = batch['target'].to(device)
            
            outputs = model(anime_input, user_input, other_features)
            loss = criterion(outputs, targets)
            total_loss += loss.item() * targets.size(0)
            total_mae += torch.abs(outputs - targets).sum().item()
            total_samples += targets.size(0)
    
    avg_loss = total_loss / total_samples
    avg_mae = total_mae / total_samples
    print(f"Test MSE: {avg_loss:.4f}, Test MAE: {avg_mae:.4f}")
    return avg_loss, avg_mae

# Evaluate
test_mse, test_mae = evaluate_model(model, test_loader, criterion)

# 7. Make Predictions

def make_predictions(model, X, num_samples=5):
    model.eval()
    with torch.no_grad():
        anime_ids = torch.tensor(X['anime_id_enc'].values[:num_samples], dtype=torch.long).to(device)
        user_ids = torch.tensor(X['user_id_enc'].values[:num_samples], dtype=torch.long).to(device)
        other_features = torch.tensor(X.drop(['anime_id_enc', 'user_id_enc'], axis=1).values[:num_samples], dtype=torch.float32).to(device)
        
        outputs = model(anime_ids, user_ids, other_features)
        predictions = outputs.cpu().numpy().flatten()
    return predictions

# Get first 5 predictions
predictions = make_predictions(model, X_test, num_samples=5)
actual = y_test[:5]

print("Predicted Ratings:", predictions)
print("Actual Ratings:", actual)

# 8. Save the Model

torch.save(model.state_dict(), 'anime_rating_model.pth')
print("Model saved to 'anime_rating_model.pth'")


Using device: cuda


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feature].fillna(df[feature].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feature].fillna(df[feature].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

AnimeRatingModel(
  (anime_embedding): Embedding(7633, 50)
  (user_embedding): Embedding(113143, 50)
  (fc1): Linear(in_features=145, out_features=128, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (dropout2): Dropout(p=0.3, inplace=False)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
)


In [5]:
print('Wait')

Wait
