# Recommendation system 



In [2]:
#Data preprcoessing 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data = pd.read_csv("C:\\Users\\anwes\\OneDrive\\Desktop\\assignment\\Recommendation System\\Recommendation System\\anime.csv")

# Display first rows
print(data.head())


   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          GintamaÂ°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [4]:
# Handle missing values

data['genre'] = data['genre'].fillna('')
data['rating'] = data['rating'].fillna(data['rating'].mean())
data['type'] = data['type'].fillna('Unknown')
data['episodes'] = data['episodes'].replace('Unknown', 0)
data['episodes'] = pd.to_numeric(data['episodes'], errors='coerce').fillna(0)

In [5]:
# Text Feature: Genre + Type Together
data['combined_features'] = data['genre'] + " " + data['type']

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['combined_features'])

print("TF-IDF shape:", tfidf_matrix.shape)

TF-IDF shape: (12294, 52)


In [6]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(data[['rating', 'episodes', 'members']])

# Combine all features
final_features = np.hstack((tfidf_matrix.toarray(), numerical_features))

In [7]:
# Compute cosine similarity
cosine_sim = cosine_similarity(final_features, final_features)

# Function to recommend anime
def recommend_anime(title, num_recommendations=5):
    if title not in data['name'].values:
        return "Anime not found in dataset."
    
    # Get index of the anime
    idx = data[data['name'] == title].index[0]
    
    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by highest similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Exclude the anime itself
    sim_scores = sim_scores[1:num_recommendations+1]
    
    recommendations = []
    for i, score in sim_scores:
        recommendations.append((data.iloc[i]['name'], score))
    
    return recommendations


In [8]:
# Evalution
from sklearn.preprocessing import MultiLabelBinarizer

data['genre_list'] = data['genre'].apply(lambda x: x.split(', '))

mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(data['genre_list'])

In [9]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.3, random_state=42)


In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_system():
    precisions = []
    recalls = []
    f1s = []

    for title in test['name'].sample(20):  # sample 20 to reduce runtime
        recs = recommend_anime(title, 5)
        if isinstance(recs, str): continue
        
        idx = data[data['name'] == title].index[0]
        true_genres = genre_matrix[idx]
        
        predicted_genres = [genre_matrix[data[data['name']==r[0]].index[0]] for r in recs]
        
        if len(predicted_genres) == 0: continue
        predicted_genres = sum(predicted_genres) > 0
        
        precisions.append(precision_score(true_genres, predicted_genres, zero_division=0))
        recalls.append(recall_score(true_genres, predicted_genres, zero_division=0))
        f1s.append(f1_score(true_genres, predicted_genres, zero_division=0))
    
    return {
        "Precision": sum(precisions)/len(precisions),
        "Recall": sum(recalls)/len(recalls),
        "F1-score": sum(f1s)/len(f1s)
    }

evaluate_system()


{'Precision': 0.8719444444444445,
 'Recall': 1.0,
 'F1-score': 0.9220634920634921}


#### Can you explain the difference between user-based and item-based collaborative filtering?
user-based finds similar users and recommends items they liked, while item-based finds similar items and recommends them based on the target user's history


#### What is Collaborative Filtering, and how does it work?
Collaborative Filtering (CF) is a recommendation technique that predicts what a user will like based on the preferences of similar users or similar items.

How it Works

Collaborative filtering uses the idea:

People who behave similarly will like similar things.

It analyzes:

User past ratings

User interactions (views, clicks, purchases)
Similarity between users or items
Then it gives recommendations like:
Users similar to you liked this movie.
Items similar to the ones you liked.