In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
df = pd.read_csv("anime.csv")
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
df.shape

(12294, 7)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [6]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [7]:
# Remove extra spaces from column names
df.columns = df.columns.str.strip()

df.replace('Unknown', np.nan, inplace=True)


In [8]:
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['members'] = pd.to_numeric(df['members'], errors='coerce')

In [9]:
df['rating'] = df['rating'].fillna(df['rating'].median())
df['episodes'] = df['episodes'].fillna(df['episodes'].median())
df['members'] = df['members'].fillna(df['members'].median())

In [10]:
features_df = df[['genre', 'rating', 'members', 'episodes']]
features_df.head()

Unnamed: 0,genre,rating,members,episodes
0,"Drama, Romance, School, Supernatural",9.37,200630,1.0
1,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26,793665,64.0
2,"Action, Comedy, Historical, Parody, Samurai, S...",9.25,114262,51.0
3,"Sci-Fi, Thriller",9.17,673572,24.0
4,"Action, Comedy, Historical, Parody, Samurai, S...",9.16,151266,51.0


In [11]:
genre_dummies = features_df['genre'].str.get_dummies(sep=',')
genre_dummies.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
num_features = features_df[['rating', 'members', 'episodes']]

final_features = pd.concat([genre_dummies, num_features], axis=1)
final_features.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9.37,200630,1.0
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,9.26,793665,64.0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9.25,114262,51.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9.17,673572,24.0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9.16,151266,51.0


In [13]:
scaler = MinMaxScaler()

final_features[num_features.columns] = scaler.fit_transform(
    final_features[num_features.columns]
)

In [14]:
cosine_sim = cosine_similarity(final_features)
cosine_sim.shape

(12294, 12294)

In [15]:
anime_index = pd.Series(df.index, index=df['name']).drop_duplicates()
anime_index.head()

name
Kimi no Na wa.                      0
Fullmetal Alchemist: Brotherhood    1
Gintama°                            2
Steins;Gate                         3
Gintama&#039;                       4
dtype: int64

In [16]:
def recommend_anime(anime_name, top_n=5, similarity_threshold=0.3):
    if anime_name not in anime_index:
        return "Anime not found in dataset."
    
    idx = anime_index[anime_name]
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_scores = [
        (i, score) for i, score in sim_scores
        if score >= similarity_threshold and i != idx
    ]
    
    top_indices = [i for i, _ in sim_scores[:top_n]]
    
    return df.loc[top_indices, ['name', 'genre', 'rating']]

In [17]:
recommend_anime("Naruto", top_n=5)

Unnamed: 0,name,genre,rating
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",7.94
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",7.53
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.5
486,Boruto: Naruto the Movie,"Action, Comedy, Martial Arts, Shounen, Super P...",8.03
1343,Naruto x UT,"Action, Comedy, Martial Arts, Shounen, Super P...",7.58


In [18]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.shape, test_df.shape

((9835, 7), (2459, 7))

In [19]:
train_features = final_features.loc[train_df.index]

from sklearn.metrics.pairwise import cosine_similarity
cosine_sim_train = cosine_similarity(train_features)

In [20]:
anime_index_train = pd.Series(
    train_df.index, index=train_df['name']
).drop_duplicates()

In [21]:
def recommend_anime_train(anime_name, top_n=5):
    if anime_name not in anime_index_train:
        return []
    
    idx = anime_index_train[anime_name]
    sim_scores = list(enumerate(cosine_sim_train[list(train_df.index).index(idx)]))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [train_df.index[i] for i, _ in sim_scores[1:top_n+1]]
    
    return train_df.loc[top_indices, ['name', 'genre']]

In [22]:
def is_relevant(target_genre, recommended_genre):
    target_set = set(target_genre.split(','))
    rec_set = set(recommended_genre.split(','))
    return len(target_set.intersection(rec_set)) > 0

In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_anime(anime_name, top_n=5):
    recs = recommend_anime_train(anime_name, top_n)
    
    if len(recs) == 0:
        return None
    
    target_genre = train_df[train_df['name'] == anime_name]['genre'].values[0]
    
    y_true = []
    y_pred = []
    
    for _, row in recs.iterrows():
        y_true.append(1)  # recommended item
        y_pred.append(
            1 if is_relevant(target_genre, row['genre']) else 0
        )
    
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    
    return precision, recall, f1

In [24]:
results = []

sample_anime = train_df['name'].sample(10, random_state=42)

for anime in sample_anime:
    metrics = evaluate_anime(anime)
    if metrics:
        results.append(metrics)

results_df = pd.DataFrame(
    results, columns=['Precision', 'Recall', 'F1-score']
)

results_df.mean()

Precision    1.0
Recall       1.0
F1-score     1.0
dtype: float64

In [25]:
results_df

Unnamed: 0,Precision,Recall,F1-score
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,1.0,1.0,1.0
3,1.0,1.0,1.0
4,1.0,1.0,1.0
5,1.0,1.0,1.0
6,1.0,1.0,1.0
7,1.0,1.0,1.0
8,1.0,1.0,1.0
9,1.0,1.0,1.0


## Interview Questions

1. Can you explain the difference between user-based and item-based collaborative filtering?

User-Based Collaborative Filtering:
Recommends items by finding users with similar preferences and suggesting what they liked.

Item-Based Collaborative Filtering:
Recommends items similar to what a user has already liked, based on item similarity.

2. What is collaborative filtering, and how does it work?

Collaborative Filtering:
A recommendation technique that predicts user preferences based on past interactions of users and items.

How it works (in short):
It finds patterns in user–item interactions and recommends items liked by similar users or similar items.