In [1]:
#recommendation system

In [5]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Load the anime catalogue from 'anime.csv' (path is relative to the working directory)
df=pd.read_csv('anime.csv')

In [7]:
df.shape

(12294, 7)

In [8]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [9]:
# Number of distinct anime ids (missing values, if any, are counted as a value too)
df['anime_id'].nunique(dropna=False)

12294

In [10]:
#missing values
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [11]:
# Fill missing genres with the lowercase placeholder 'unknown'
df['genre']=df['genre'].fillna('unknown')

In [12]:
# Impute missing ratings with the mean of the observed ratings
df = df.assign(rating=df['rating'].fillna(df['rating'].mean()))

In [13]:
# Impute 'type' with a placeholder label; blank out 'episodes' entries equal to
# the string 'Unknown' and cast the column to float
df = df.assign(
    type=df['type'].fillna('Unknown'),
    episodes=df['episodes'].mask(df['episodes'] == 'Unknown').astype(float),
)
# Fill the blanked-out episode counts with the median of the remaining values
df = df.assign(episodes=df['episodes'].fillna(df['episodes'].median()))

In [14]:
# Checking duplicates
df.duplicated().sum()

np.int64(0)

In [15]:
#Explore the dataset to understand its structure and attributes.

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12294 non-null  object 
 4   episodes  12294 non-null  float64
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 672.5+ KB


In [17]:
df.describe()

Unnamed: 0,anime_id,episodes,rating,members
count,12294.0,12294.0,12294.0,12294.0
mean,14058.221653,12.095412,6.473902,18071.34
std,11455.294701,46.244062,1.017096,54820.68
min,1.0,1.0,1.67,5.0
25%,3484.25,1.0,5.9,225.0
50%,10260.5,2.0,6.55,1550.0
75%,24794.5,12.0,7.17,9437.0
max,34527.0,1818.0,10.0,1013917.0


In [18]:
#feature_extraction

In [19]:
# Convert genres (text) into numerical TF-IDF vectors (one row per anime)
# NOTE(review): with the vectorizer's default token pattern the genre string is
# split on word boundaries rather than on commas, so hyphenated or multi-word
# genres (e.g. 'Sci-Fi') are presumably broken into separate tokens, and
# stop_words='english' drops common English words — confirm this is intended.
tfidf=TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genre'])

In [20]:
# Normalize numerical features (rating, members) column-wise with min-max scaling
# so they can be stacked next to the TF-IDF columns.
# NOTE(review): 'members' is heavily right-skewed (median 1550 vs. max 1013917 in
# df.describe()), so most scaled values will sit close to 0 — consider whether a
# log transform before scaling is wanted.
scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(df[['rating', 'members']])

In [21]:
#recommendation system

In [22]:
# Combine genre TF-IDF and numerical features column-wise into a single matrix.
# `hstack` comes from the notebook's import cell; CSR output keeps the result
# row-indexable (the default COO format does not support indexing).
combined_features = hstack([tfidf_matrix, numerical_features], format='csr')

In [23]:
# Pairwise cosine similarity between all rows of the combined feature matrix
# (with a single argument the rows are compared against themselves).
# The result is a dense n x n array, so memory grows quadratically with the row count.
similarity = cosine_similarity(combined_features)
similarity

array([[1.        , 0.53235245, 0.46247873, ..., 0.24157166, 0.24807781,
        0.27820617],
       [0.53235245, 1.        , 0.51682949, ..., 0.20971947, 0.2153503 ,
        0.24148453],
       [0.46247873, 0.51682949, 1.        , ..., 0.24118659, 0.24768518,
        0.27776899],
       ...,
       [0.24157166, 0.20971947, 0.24118659, ..., 1.        , 0.99994581,
        0.99824985],
       [0.24807781, 0.2153503 , 0.24768518, ..., 0.99994581, 1.        ,
        0.99881138],
       [0.27820617, 0.24148453, 0.27776899, ..., 0.99824985, 0.99881138,
        1.        ]])

In [24]:
similarity.shape

(12294, 12294)

In [25]:
similarity[1]

array([0.53235245, 1.        , 0.51682949, ..., 0.20971947, 0.2153503 ,
       0.24148453])

In [26]:
#Design a function to recommend anime based on cosine similarity.

In [27]:
def recommend_anime(title, top_n=10, threshold=0.3):
    """Recommend the anime most similar to ``title`` by cosine similarity.

    Reads the notebook-level ``df`` (one row per anime, with a ``name`` column)
    and ``similarity`` (square matrix whose row/column ``i`` corresponds to the
    ``i``-th row of ``df``).

    Parameters
    ----------
    title : str
        Exact name of the query anime as it appears in ``df['name']``. If the
        name occurs more than once, the first occurrence is used.
    top_n : int, default 10
        Maximum number of candidates taken from the top of the ranking.
        Values below zero are treated as zero.
    threshold : float, default 0.3
        Candidates must have a similarity strictly greater than this value.
        The filter is applied *after* the ``top_n`` cut, so it can only
        shorten the list, never pull in lower-ranked titles.

    Returns
    -------
    list[tuple[str, float]] or str
        ``(name, similarity)`` pairs in descending similarity order, or the
        string ``"Anime not found in dataset."`` when ``title`` is unknown
        (kept as a string for backward compatibility with existing callers).
    """
    # Look the title up by row *position*, not index label, because the
    # similarity matrix is addressed positionally.
    positions = np.flatnonzero(df['name'].to_numpy() == title)
    if positions.size == 0:
        return "Anime not found in dataset."
    idx = int(positions[0])

    scores = np.asarray(similarity[idx])
    # Stable descending order: rows with equal scores keep ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly. Dropping "the first sorted element"
    # is wrong when another row ties with the query at the top score.
    ranked = ranked[ranked != idx][:max(int(top_n), 0)]

    return [(df['name'].iloc[i], scores[i]) for i in ranked if scores[i] > threshold]

In [28]:
#Given a target anime, recommend a list of similar anime based on cosine similarity scores.

In [35]:
# Use the first title in the dataset as the query anime
kimi_no_na_wa = df['name'].iloc[0]
print(f"\nRecommendations for '{kimi_no_na_wa}':\n")
# Show up to 5 recommendations using the function's default threshold
print(recommend_anime(kimi_no_na_wa, top_n=5))


Recommendations for 'Kimi no Na wa.':

[('Wind: A Breath of Heart OVA', np.float64(0.9628328816787185)), ('Wind: A Breath of Heart (TV)', np.float64(0.9589029479221463)), ('Aura: Maryuuin Kouga Saigo no Tatakai', np.float64(0.958359297252002)), ('Shakugan no Shana II (Second)', np.float64(0.9178070215979311)), ('Angel Beats!: Another Epilogue', np.float64(0.9161215438910484))]


In [36]:
#Experiment with different threshold values for similarity scores to adjust the recommendation list size.

In [37]:
# Similarity cut-offs to compare; a higher value can only shorten the list
threshold_values = [0.2, 0.4, 0.6, 0.8]

for t in threshold_values:
    print(f"\n--- Recommendations for '{kimi_no_na_wa}' (threshold={t}) ---")
    recs = recommend_anime(kimi_no_na_wa, top_n=10, threshold=t)
    if not recs:  # nothing cleared the threshold
        print("No recommendations found above the threshold.")
    for anime, score in recs:
        print(f"- {anime}: {score:.4f}")
    # The count is reported in both the empty and non-empty case
    print(f"Number of recommendations: {len(recs)}")


--- Recommendations for 'Kimi no Na wa.' (threshold=0.2) ---
- Wind: A Breath of Heart OVA: 0.9628
- Wind: A Breath of Heart (TV): 0.9589
- Aura: Maryuuin Kouga Saigo no Tatakai: 0.9584
- Shakugan no Shana II (Second): 0.9178
- Angel Beats!: Another Epilogue: 0.9161
- Shakugan no Shana: 0.9146
- Shakugan no Shana S: 0.9076
- Harmonie: 0.9075
- Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen: 0.9023
- Kokoro ga Sakebitagatterunda.: 0.9011
Number of recommendations: 10

--- Recommendations for 'Kimi no Na wa.' (threshold=0.4) ---
- Wind: A Breath of Heart OVA: 0.9628
- Wind: A Breath of Heart (TV): 0.9589
- Aura: Maryuuin Kouga Saigo no Tatakai: 0.9584
- Shakugan no Shana II (Second): 0.9178
- Angel Beats!: Another Epilogue: 0.9161
- Shakugan no Shana: 0.9146
- Shakugan no Shana S: 0.9076
- Harmonie: 0.9075
- Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen: 0.9023
- Kokoro ga Sakebitagatterunda.: 0.9011
Number of recommendations: 10

--- Recommendations for 'Kimi no Na wa.' 

**Analyze the performance of the recommendation system and identify areas of improvement.**


This approach effectively captures direct word-level similarities and provides relevant recommendations based on content attributes. However, the system lacks personalization: all users receive the same recommendations for a given input, because it does not consider user preferences or historical viewing behavior.

In [38]:
#interview questions:

**Can you explain the difference between user-based and item-based collaborative filtering?**

**User-Based Collaborative Filtering**

Concept:
Recommends items to a user based on the preferences of similar users.

How it works:

Identify users who have similar tastes to the target user by comparing their past ratings or interactions (using metrics like cosine similarity or Pearson correlation).

Recommend items that these similar users liked but the target user hasn’t interacted with yet.

**Item-Based Collaborative Filtering**

Concept:
Recommends items similar to those the user has already liked, based on item-to-item similarities.

How it works:

Compute similarity between items using user ratings or interactions.

For a given user, recommend items that are most similar to the ones they have rated highly or interacted with.

**What is collaborative filtering, and how does it work?**

Collaborative filtering is a recommendation technique that suggests items to users based on the preferences or behaviors of other users. The key idea is that people who have shown similar interests in the past are likely to share similar preferences in the future. In other words, it “collaborates” across a community of users to filter and predict what an individual might like.