In [35]:
# importing libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
# Read the anime catalogue from the CSV file (path is relative to the working directory)
anime = pd.read_csv(filepath_or_buffer='anime.csv')
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [9]:
# Only the genre column is used as a feature, because the recommendation
# system is built on genre similarity.
# Missing genres are replaced by an empty string so the column holds a string in every row.
genre_filled = anime['genre'].fillna(value='')
anime['genre'] = genre_filled
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [13]:
# The genre column is a comma-separated string, so it is split into a list of
# individual genre names for each anime.
def _split_genres(raw):
    """Split a comma-separated genre string into trimmed, lower-cased names.

    Pieces that are empty after trimming are dropped, so an empty string
    yields an empty list.
    """
    trimmed = (piece.strip() for piece in raw.split(','))
    return [name.lower() for name in trimmed if name]


anime['genre_list'] = anime['genre'].apply(_split_genres)
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,genre_list
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,"[drama, romance, school, supernatural]"
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,"[action, adventure, drama, fantasy, magic, mil..."
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,"[action, comedy, historical, parody, samurai, ..."
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,"[sci-fi, thriller]"
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,"[action, comedy, historical, parody, samurai, ..."
...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211,[hentai]
12290,5543,Under World,Hentai,OVA,1,4.28,183,[hentai]
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219,[hentai]
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175,[hentai]


In [17]:
# Collect every distinct genre name across all anime, in alphabetical order
unique_genres = sorted({genre for genres in anime['genre_list'] for genre in genres})
unique_genres

['action',
 'adventure',
 'cars',
 'comedy',
 'dementia',
 'demons',
 'drama',
 'ecchi',
 'fantasy',
 'game',
 'harem',
 'hentai',
 'historical',
 'horror',
 'josei',
 'kids',
 'magic',
 'martial arts',
 'mecha',
 'military',
 'music',
 'mystery',
 'parody',
 'police',
 'psychological',
 'romance',
 'samurai',
 'school',
 'sci-fi',
 'seinen',
 'shoujo',
 'shoujo ai',
 'shounen',
 'shounen ai',
 'slice of life',
 'space',
 'sports',
 'super power',
 'supernatural',
 'thriller',
 'vampire',
 'yaoi',
 'yuri']

In [19]:
# Give each genre its position in the sorted genre list
genre_to_idx = dict(zip(unique_genres, range(len(unique_genres))))

In [21]:
# Build the multi-hot encoding by hand: one row per anime, one column per genre
genre_matrix = np.zeros(shape=(len(anime), len(unique_genres)), dtype=int)

# Gather every (row, column) cell that has to be switched on, then set them
# all with a single indexed assignment.
hot_cells = [
    (row, genre_to_idx[name])
    for row, names in enumerate(anime['genre_list'])
    for name in names
    if name in genre_to_idx
]
if hot_cells:
    hot_rows, hot_cols = map(list, zip(*hot_cells))
    genre_matrix[hot_rows, hot_cols] = 1

In [23]:
# Pairwise cosine similarity between the genre vectors of all anime.
# Called with a single matrix, cosine_similarity compares its rows with each other.
cosine_sim = cosine_similarity(genre_matrix)

In [25]:
# Map each anime_id to the index label of its row in the DataFrame.
# Series.drop_duplicates() compares *values* (here the row labels, which are
# already unique), so it never removed a repeated anime_id; a repeated id
# would make the lookup return a Series instead of a single row label.
# Duplicates are therefore filtered on the index (the anime_id), keeping the
# first occurrence.
anime_id_to_index = pd.Series(anime.index, index=anime['anime_id'])
anime_id_to_index = anime_id_to_index[~anime_id_to_index.index.duplicated(keep='first')]

In [27]:
# Function that returns recommendations using cosine similarity
def get_similar_anime(anime_id, top_n=5, similarity_threshold=0.1):
    """Return the anime whose genre vectors are most similar to a given title.

    Reads the notebook-level objects ``anime``, ``anime_id_to_index`` and
    ``cosine_sim``.

    Parameters
    ----------
    anime_id
        The ``anime_id`` of the reference title.
    top_n : int, default 5
        Maximum number of recommendations to return.
    similarity_threshold : float, default 0.1
        Minimum cosine similarity (inclusive) a title needs to be returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``anime_id``, ``name``, ``genre`` and ``similarity``, ordered
        by decreasing similarity. The frame is empty (with the same columns)
        when ``anime_id`` is unknown or no title reaches the threshold; in the
        unknown case a message is also printed.
    """
    result_columns = ['anime_id', 'name', 'genre', 'similarity']

    if anime_id not in anime_id_to_index:
        print("Anime ID not found.")
        # Keep the column layout so callers can select these columns from the
        # result without hitting a KeyError.
        return pd.DataFrame(columns=result_columns)

    idx = anime_id_to_index[anime_id]
    if isinstance(idx, pd.Series):
        # A repeated anime_id makes the lookup return several rows; use the first.
        idx = idx.iloc[0]

    # Every other title that reaches the threshold, as (row, score) pairs
    sim_scores = [
        (i, score)
        for i, score in enumerate(cosine_sim[idx])
        if i != idx and score >= similarity_threshold
    ]
    # sorted() is stable, so titles with equal scores keep ascending row order
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[:top_n]

    similar_indices = [i for i, _ in sim_scores]
    results = anime.loc[similar_indices, result_columns[:3]].copy()
    results['similarity'] = [score for _, score in sim_scores]
    return results


In [29]:
# Experimenting with different threshold values for the similarity score
anime_id_input = int(input("Enter Anime ID: "))  # user input
thresholds = [0.0, 0.1, 0.2, 0.3, 0.4]
separator = "-" * 60
display_columns = ['anime_id', 'name', 'genre', 'similarity']

for threshold in thresholds:
    print(f"\n=== Similar Anime with Threshold ≥ {threshold:.1f} ===")
    matches = get_similar_anime(anime_id_input, top_n=5, similarity_threshold=threshold)
    print(f"Found {len(matches)} recommendations.")
    if matches.empty:
        print("No similar anime found at this threshold.")
    else:
        print(matches[display_columns].to_string(index=False))
    print(separator)

Enter Anime ID:  5114



=== Similar Anime with Threshold ≥ 0.0 ===
Found 5 recommendations.
 anime_id                                          name                                                               genre  similarity
      121                           Fullmetal Alchemist Action, Adventure, Comedy, Drama, Fantasy, Magic, Military, Shounen    0.935414
     9135 Fullmetal Alchemist: The Sacred Star of Milos Action, Adventure, Comedy, Drama, Fantasy, Magic, Military, Shounen    0.935414
     6421     Fullmetal Alchemist: Brotherhood Specials                 Adventure, Drama, Fantasy, Magic, Military, Shounen    0.925820
    18321                               Kkomaeosa Ttori       Action, Adventure, Drama, Fantasy, Historical, Magic, Shounen    0.857143
    18115                    Magi: The Kingdom of Magic                          Action, Adventure, Fantasy, Magic, Shounen    0.845154
------------------------------------------------------------

=== Similar Anime with Threshold ≥ 0.1 ===
Found 5 re

In [31]:
# Final run of the recommender with a similarity threshold of 0.3
similarity_threshold = 0.3  # You can change this threshold
model = get_similar_anime(anime_id_input, top_n=5, similarity_threshold=similarity_threshold)
print(f"\nRecommended anime for Anime ID {anime_id_input} (Threshold ≥ {similarity_threshold}):")
if model.empty:
    # An empty result may not carry the columns selected below (an unknown
    # anime_id can yield a frame with no columns), so guard before selecting.
    print("No similar anime found at this threshold.")
else:
    print(model[['anime_id', 'name', 'genre', 'similarity']].to_string(index=False))


Recommended anime for Anime ID 5114 (Threshold ≥ 0.3):
 anime_id                                          name                                                               genre  similarity
      121                           Fullmetal Alchemist Action, Adventure, Comedy, Drama, Fantasy, Magic, Military, Shounen    0.935414
     9135 Fullmetal Alchemist: The Sacred Star of Milos Action, Adventure, Comedy, Drama, Fantasy, Magic, Military, Shounen    0.935414
     6421     Fullmetal Alchemist: Brotherhood Specials                 Adventure, Drama, Fantasy, Magic, Military, Shounen    0.925820
    18321                               Kkomaeosa Ttori       Action, Adventure, Drama, Fantasy, Historical, Magic, Shounen    0.857143
    18115                    Magi: The Kingdom of Magic                          Action, Adventure, Fantasy, Magic, Shounen    0.845154


In [37]:
# Hold out 20% of the anime as a test set; the fixed random_state makes the split repeatable
splits = train_test_split(anime, test_size=0.2, random_state=42)
train_df, test_df = splits
print(f"\nTrain set size: {len(train_df)} anime")
print(f"Test set size: {len(test_df)} anime")


Train set size: 9835 anime
Test set size: 2459 anime


In [39]:
# Evaluation
# We cannot evaluate the model because the dataset has no true labels (i.e. no ground-truth recommendations column) to evaluate against.
# The dataset (anime.csv) contains anime information and genres, but no user behaviour data such as who watched or rated what. Without that, we have no real ground truth (true labels) from which to compute precision, recall, or F1 score.

In [None]:
# interview questions

# 1. Can you explain the difference between user-based and item-based collaborative filtering?
# User-Based Collaborative Filtering: This method suggests products to a user by considering the preferences of users who are similar to them. It calculates the similarity between users, using measures such as cosine similarity.
# Item-Based Collaborative Filtering: This method suggests items based on the similarity between items rather than between users. It calculates the similarity between items, typically based on user interactions or ratings, using metrics such as cosine similarity.


#2. What is collaborative filtering, and how does it work?
# Collaborative Filtering is a recommendation technique that recommends things (like movies, songs, or anime) by looking at what other people with similar tastes liked.