# Recommendation System

## Data Preprocessing

In [104]:
import pandas as pd
import numpy as np

In [105]:
# load csv file
df = pd.read_csv('anime.csv')
pd.set_option('display.max_rows', 10)
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [106]:
# structure
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


Given large dataset, removing all rows having null values.

In [107]:
# drop null
df = df.dropna()
# df

In [108]:
# episodes have Unknown values
# remove Unknowns from episodes
df = df[df['episodes'] != 'Unknown']
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [109]:
# df.duplicated().sum()
# no duplicates

# sort by anime_Id
df = df.sort_values('anime_id')
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,8.40,137636
2,6,Trigun,"Action, Comedy, Sci-Fi",TV,26,8.32,283069
3,7,Witch Hunter Robin,"Action, Drama, Magic, Mystery, Police, Superna...",TV,26,7.36,64905
4,8,Beet the Vandel Buster,"Adventure, Fantasy, Shounen, Supernatural",TV,52,7.06,9848
...,...,...,...,...,...,...,...
11825,34476,Platonic Chain: Ansatsu Jikkouchuu,"Sci-Fi, Slice of Life",Special,1,1.67,51
11826,34490,Sushi Azarashi,Comedy,TV,30,3.00,12
11827,34503,Kochinpa! Dainiki,Comedy,TV,24,3.40,75
11828,34514,Pokemon Generations,"Action, Adventure, Fantasy, Game, Kids",ONA,18,7.21,295


In [110]:
# removing unknown from episodes
df = df[df['episodes'] != 'Unknown']
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,8.40,137636
2,6,Trigun,"Action, Comedy, Sci-Fi",TV,26,8.32,283069
3,7,Witch Hunter Robin,"Action, Drama, Magic, Mystery, Police, Superna...",TV,26,7.36,64905
4,8,Beet the Vandel Buster,"Adventure, Fantasy, Shounen, Supernatural",TV,52,7.06,9848
...,...,...,...,...,...,...,...
11825,34476,Platonic Chain: Ansatsu Jikkouchuu,"Sci-Fi, Slice of Life",Special,1,1.67,51
11826,34490,Sushi Azarashi,Comedy,TV,30,3.00,12
11827,34503,Kochinpa! Dainiki,Comedy,TV,24,3.40,75
11828,34514,Pokemon Generations,"Action, Adventure, Fantasy, Game, Kids",ONA,18,7.21,295


## Feature Extraction

In [111]:
# split genre
df['genre'] = df['genre'].str.split(',')

# binary columns for each genre
genre_df = df.genre.str.join('|').str.get_dummies()
df = pd.concat([df, genre_df], axis=1)
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Adventure,Cars,Comedy,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,1,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, Sci-Fi,...",TV,26,8.82,486824,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,5,Cowboy Bebop: Tengoku no Tobira,"[Action, Drama, Mystery, Sci-Fi, Space]",Movie,1,8.40,137636,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,Trigun,"[Action, Comedy, Sci-Fi]",TV,26,8.32,283069,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,7,Witch Hunter Robin,"[Action, Drama, Magic, Mystery, Police, S...",TV,26,7.36,64905,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,Beet the Vandel Buster,"[Adventure, Fantasy, Shounen, Supernatural]",TV,52,7.06,9848,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11825,34476,Platonic Chain: Ansatsu Jikkouchuu,"[Sci-Fi, Slice of Life]",Special,1,1.67,51,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11826,34490,Sushi Azarashi,[Comedy],TV,30,3.00,12,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11827,34503,Kochinpa! Dainiki,[Comedy],TV,24,3.40,75,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11828,34514,Pokemon Generations,"[Action, Adventure, Fantasy, Game, Kids]",ONA,18,7.21,295,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
# Normalization
df['rating'] = (df['rating'] - df['rating'].min()) / (df['rating'].max() - df['rating'].min())
df['members'] = (df['members'] - df['members'].min()) / (df['members'].max() - df['members'].min())

In [113]:
# one hot encoding
df = pd.get_dummies(df, columns=['type'], dtype=int)
df

Unnamed: 0,anime_id,name,genre,episodes,rating,members,Adventure,Cars,Comedy,Dementia,...,Supernatural,Thriller,Vampire,Yaoi,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,1,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, Sci-Fi,...",26,0.858343,0.480136,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,5,Cowboy Bebop: Tengoku no Tobira,"[Action, Drama, Mystery, Sci-Fi, Space]",1,0.807923,0.135737,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,6,Trigun,"[Action, Comedy, Sci-Fi]",26,0.798319,0.279175,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,7,Witch Hunter Robin,"[Action, Drama, Magic, Mystery, Police, S...",26,0.683073,0.064003,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,8,Beet the Vandel Buster,"[Adventure, Fantasy, Shounen, Supernatural]",52,0.647059,0.009701,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11825,34476,Platonic Chain: Ansatsu Jikkouchuu,"[Sci-Fi, Slice of Life]",1,0.000000,0.000038,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11826,34490,Sushi Azarashi,[Comedy],30,0.159664,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11827,34503,Kochinpa! Dainiki,[Comedy],24,0.207683,0.000062,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11828,34514,Pokemon Generations,"[Action, Adventure, Fantasy, Game, Kids]",18,0.665066,0.000279,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [114]:
# features
features_df = ['rating', 'members']
features_df.extend(genre_df.columns)
features_df.extend([col for col in df.columns if col.startswith('type_')])
anime_features = df[features_df]
anime_features

Unnamed: 0,rating,members,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Supernatural,Thriller,Vampire,Yaoi,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,0.858343,0.480136,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0.807923,0.135737,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.798319,0.279175,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.683073,0.064003,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0.647059,0.009701,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11825,0.000000,0.000038,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11826,0.159664,0.000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11827,0.207683,0.000062,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11828,0.665066,0.000279,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


## Recommendation System

In [115]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_anime(target_anime, threshold, size):
    try:
        target_index = df[df['name'] == target_anime].index[0]
    except IndexError:
        print(f"Anime '{target_anime}' not found in the dataset.")
        return []

    # Calculate cosine similarity between the target anime and all other anime
    cosine_similarities = cosine_similarity(anime_features.iloc[[target_index]], anime_features)
    similar_anime_indices = cosine_similarities.argsort()[0][::-1][1:]  # Exclude the target anime itself

    recommended_anime = []
    for index in similar_anime_indices:
        similarity_score = cosine_similarities[0][index]
        if similarity_score >= threshold:
            recommended_anime.append(
                (df['name'].iloc[index], similarity_score)
            )

    return recommended_anime[0:size]

In [120]:
target_anime = 'Death Note'
recommendations = recommend_anime(target_anime, threshold=0.8, size=10)  # Adjust threshold and size

if recommendations:
    print(f"Recommendations for {target_anime}:")
    for anime, score in recommendations:
        print(f"- {anime} (Similarity: {score:.2f})")
else:
    print(f"No recommendations found for {target_anime} above the specified threshold.")

Recommendations for Death Note:
- Higurashi no Naku Koro ni Kai (Similarity: 0.89)
- Death Note Rewrite (Similarity: 0.80)


## Evaluation

In [121]:
# splitting dataset
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [128]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendations(recommendations, ground_truth, k):
    # Extract recommended anime names
    recommended_anime_names = [anime[0] for anime in recommendations]

    recommended_set = set(recommended_anime_names[:k])
    ground_truth_set = set(ground_truth)

    true_positives = len(recommended_set.intersection(ground_truth_set))

    precision = true_positives / k if k > 0 else 0.0
    recall = true_positives / len(ground_truth_set) if ground_truth_set else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1

In [None]:
k = 5  # Consider top-k recommendations
results = []

for index, row in test_data.iterrows():
    anime_name = row['name']
    recommendations = recommend_anime(anime_name, threshold=0.1, size=k)
    ground_truth = ["Fullmetal Alchemist", "Hunter x Hunter (2011)", "Steins;Gate", "Code Geass: Lelouch of the Rebellion", "Death Note"]
    precision, recall, f1 = evaluate_recommendations(recommendations, ground_truth, k)
    results.append({'anime': anime_name, 'precision': precision, 'recall': recall, 'f1': f1})


eval_df = pd.DataFrame(results)
avg_precision = eval_df['precision'].mean()
avg_recall = eval_df['recall'].mean()
avg_f1 = eval_df['f1'].mean()

print(f"\nAverage Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-score: {avg_f1:.4f}")

It is more of memory based on given data, Adding new Anime will make our model struggle.

## Interview Questions


**1.**   **Can you explain the difference between user-based and item-based collaborative filtering?**


**User-based**
*   Focuses on finding similar users to target users.
*   Recommend items that similar users have rated highly.

**Item-based**
*   Focuses on finding similar items to those user has liked / rated highly.
*   Recommend items similar to user's previously liked items.

**2.**   **What is collaborative filtering, and how does it work?**

Collaborative filtering is a technique used in recommendation systems to make automatic predictions about a user's interests by collecting preferences from many users.
