In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Load dataset
df = pd.read_csv("/content/anime.csv")
print("Initial dataset shape:", df.shape)
print(df.head())

Initial dataset shape: (12294, 7)
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [8]:
df = df.dropna(subset=['genre']).reset_index(drop=True)# Drop rows where genre is missing
print("After dropping missing genres:", df.shape)


After dropping missing genres: (12232, 7)


In [24]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
df = df.dropna(subset=['genre', 'type', 'rating']).reset_index(drop=True)

# TF-IDF for genre
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(", "))
genre_tfidf = vectorizer.fit_transform(df['genre'])

# One-hot encode 'type'
ohe = OneHotEncoder()
type_encoded = ohe.fit_transform(df[['type']]).toarray()

# Normalize 'rating'
scaler = MinMaxScaler()
rating_scaled = scaler.fit_transform(df[['rating']])

# Combine all features
import numpy as np
features_combined = np.hstack([genre_tfidf.toarray(), type_encoded, rating_scaled])

# Compute cosine similarity
cosine_sim = cosine_similarity(features_combined, features_combined)

print("\nFeatures selected: 'genre', 'type', 'rating'")
print("Categorical 'type' encoded with OneHot, 'rating' normalized.")
print("Combined into single feature matrix for similarity.")


Features selected: 'genre', 'type', 'rating'
Categorical 'type' encoded with OneHot, 'rating' normalized.
Combined into single feature matrix for similarity.


In [25]:
def recommend_anime(title, top_n=5):
    if title not in df['name'].values:
        return []
    idx = df[df['name'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # Skip itself
    anime_indices = [i[0] for i in sim_scores]
    return df['name'].iloc[anime_indices].tolist()

# Example usage
print("\nRecommendations for Naruto:")
print(recommend_anime("Naruto", top_n=5))



Recommendations for Naruto:
['Naruto: Shippuuden', 'Rekka no Honoo', 'Dragon Ball Kai (2014)', 'Dragon Ball Z', 'Dragon Ball Kai']


In [28]:
def recommend_anime(title, threshold=0.3):
    if title not in df['name'].values:
        return []
    idx = df[df['name'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = [s for s in sim_scores if s[0] != idx and s[1] >= threshold]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    anime_indices = [i[0] for i in sim_scores]
    return df['name'].iloc[anime_indices].tolist()

# Example usage with different thresholds
print("\nRecommendations for Naruto with threshold 0.3:")
print(recommend_anime("Naruto", threshold=0.3))

print("\nRecommendations for Naruto with threshold 0.5:")
print(recommend_anime("Naruto", threshold=0.5))

print("\nRecommendations for Naruto with threshold 0.7:")
print(recommend_anime("Naruto", threshold=0.7))



Recommendations for Naruto with threshold 0.3:
['Naruto: Shippuuden', 'Rekka no Honoo', 'Dragon Ball Kai (2014)', 'Dragon Ball Z', 'Dragon Ball Kai', 'Dragon Ball Super', 'Kurokami The Animation', 'Project ARMS', 'Wolverine', 'Dragon Ball', 'Medaka Box Abnormal', 'Medaka Box', 'Tenjou Tenge', 'Hokuto no Ken 2', 'Katekyo Hitman Reborn!', 'Virtua Fighter', 'Kenyuu Densetsu Yaiba', 'Tatakae!! Ramenman', 'Project ARMS: The 2nd Chapter', 'Shijou Saikyou no Deshi Kenichi', 'Sakigake!! Otokojuku', 'Ben-To', 'Kakutou Bijin Wulong: Rebirth', 'Taboo Tattoo', 'Mushibugyou', 'Ikkitousen: Dragon Destiny', 'Ikkitousen: Great Guardians', 'Ikkitousen: Xtreme Xecutor', 'Big Order (TV)', 'Boku no Hero Academia', 'Shadow Skill: Eigi', 'Yuusha Shirei Dagwon', 'Bleach', 'Muteki Kanban Musume', 'Hunter x Hunter (2011)', 'Hunter x Hunter', 'Sexy Commando Gaiden: Sugoiyo!! Masaru-san', 'Hokuto no Ken', 'Sasuke', 'Nano Invaders', 'Pyun Pyun Maru', 'Grappler Baki: Saidai Tournament-hen', 'Kinnikuman II Sei: Ul

In [30]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("Training set size:", train_df.shape)
print("Testing set size:", test_df.shape)

Training set size: (9613, 7)
Testing set size: (2404, 7)


In [32]:
true_positives = 0
possible_positives = 0
predicted_positives = 0

for _, row in test_df.iterrows():
    title = row['name']
    if title not in train_df['name'].values:
        continue
    true_genre = set(row['genre'].split(", "))
    recommendations = recommend_anime(title, threshold=0.3)
    for rec in recommendations:
        rec_genre = set(train_df[train_df['name'] == rec]['genre'].values[0].split(", "))
        if true_genre.intersection(rec_genre):
            true_positives += 1
    possible_positives += len(true_genre)
    predicted_positives += len(recommendations)

precision = true_positives / predicted_positives if predicted_positives else 0
recall = true_positives / possible_positives if possible_positives else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0

print("\nEvaluation Metrics:")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")



Evaluation Metrics:
Precision: 0.00
Recall: 0.00
F1-score: 0.00


Performance Analysis:


1. This split allows fair evaluation of the recommender system.
2. Precision and recall should be monitored: genre overlap only provides basic similarity.
3. To improve: combine more features like user ratings, reviews, or embeddings.
4. Experiment with different thresholds and similarity measures.
5. Consider hybrid or collaborative filtering for better personalization.