In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

In [3]:
# 1.1 Load the dataset
df = pd.read_csv("anime.csv")

In [4]:
# 1.2 Handle missing values
df.fillna("", inplace=True)  # Fill NaN values with an empty string for text-based features
df.dropna(subset=["rating"], inplace=True)  # Remove rows where rating is missing

In [5]:
# 1.3 Explore dataset structure
print(df.head())
print('*'*50)
print(df.info())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes rating  \
0               Drama, Romance, School, Supernatural  Movie        1   9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64   9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51   9.25   
3                                   Sci-Fi, Thriller     TV       24   9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51   9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
**************************************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12294 entries, 0 to 12293
Data columns (total 7 col

In [6]:
# 2.1 Feature Selection - Using 'genre' and 'rating' for similarity calculation
def combine_features(row):
    return row["genre"] + " " + str(row["rating"])

df["combined_features"] = df.apply(combine_features, axis=1)

In [7]:
# 2.2 Convert categorical features (genres) into numerical representations using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
feature_matrix = vectorizer.fit_transform(df["combined_features"])

In [8]:
# 3.1 Compute cosine similarity matrix
cosine_sim = cosine_similarity(feature_matrix)

In [9]:
# 3.2 Recommendation function with similarity threshold
def recommend_anime(title, num_recommendations=5, similarity_threshold=0.2):
    if title not in df["name"].values:
        return "Anime not found in dataset"
    
    idx = df[df["name"] == title].index[0]
    
    # Get similarity scores for all anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Filter based on threshold
    sim_scores = [score for score in sim_scores if score[1] >= similarity_threshold]
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_scores = sim_scores[1:num_recommendations+1]
    
    # Fetch recommended anime titles
    anime_indices = [i[0] for i in sim_scores]
    return df.iloc[anime_indices]["name"].tolist()

In [10]:
# 4.1 Evaluation - Splitting dataset into training and testing
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [11]:
# 4.2 Evaluating the Recommendation System
def evaluate_recommendation_system():
    y_true = []  # Actual (1 if similar, 0 if not)
    y_pred = []  # Predicted (1 if recommended, 0 if not)
    
    for title in test_df["name"].sample(50, random_state=42):  # Evaluating on a sample of 50 titles
        recommended_anime = recommend_anime(title, num_recommendations=5)
        actual_anime = train_df[train_df["genre"] == test_df[test_df["name"] == title]["genre"].values[0]]["name"].tolist()
        
        for anime in recommended_anime:
            if anime in actual_anime:
                y_true.append(1)
                y_pred.append(1)
            else:
                y_true.append(0)
                y_pred.append(1)
        
        for anime in actual_anime:
            if anime not in recommended_anime:
                y_true.append(1)
                y_pred.append(0)
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")

In [12]:
# Run evaluation
evaluate_recommendation_system()

Precision: 0.26, Recall: 0.01, F1-Score: 0.02


In [13]:
# Example usage
anime_name = "Naruto"  # Replace with an actual anime title from dataset
recommendations = recommend_anime(anime_name, similarity_threshold=0.3)
print(f"Recommended anime for '{anime_name}':", recommendations)

Recommended anime for 'Naruto': ['Iron Virgin Jun', 'Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono', 'Dragon Ball Super', 'Ikkitousen: Extravaganza Epoch', 'Tenjou Tenge']


# 1. Difference Between User-Based and Item-Based Collaborative Filtering

User-Based Collaborative Filtering: This method recommends items by finding users with similar tastes. If User A and User B both liked the same anime in the past, then an anime liked by User A might be recommended to User B.

Item-Based Collaborative Filtering: This method recommends items by finding similarities between items. If many users who watched Anime X also watched Anime Y, then Anime Y is recommended to a user who liked Anime X.

# 2. What is Collaborative Filtering and How Does It Work?

Collaborative Filtering is a technique used in recommendation systems that suggests items based on past user behavior. It works by analyzing patterns in user interactions, such as ratings, views, or purchases, and then predicting what a user might like based on similar users or items.