In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Load the anime catalogue from 'anime.csv' (path is relative to the working directory).
df=pd.read_csv('anime.csv')

In [3]:
# Preview the first five rows to check column names and value formats.
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(12294, 7)

In [5]:
# Count missing values per column to decide how each column is handled.
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [6]:
# Impute the numeric gap only: a missing rating becomes 0; the text columns
# (genre, type) are left as NaN here.
# NOTE(review): 0 is far below the ratings visible in the preview and will
# shift the mean/variance the StandardScaler learns later — confirm this is
# intended rather than, e.g., a median fill.
df = df.fillna({'rating': 0})

In [7]:
# The episode count uses the literal string 'Unknown' as a placeholder;
# swap it for 0 so the column can be cast to int afterwards.
df['episodes']=df['episodes'].replace('Unknown',0)

In [8]:
# Cast episodes to int; this relies on the 'Unknown' placeholder already
# having been replaced (presumably the only non-numeric value — the cast
# raises otherwise).
df['episodes']=df['episodes'].astype(int)

In [9]:
# Re-check missing counts after imputing rating.
df.isnull().sum()

anime_id     0
name         0
genre       62
type        25
episodes     0
rating       0
members      0
dtype: int64

In [10]:
# Discard every row that still has a missing value in any column.
# The original index labels are kept (no reset), so the index has gaps.
df = df.dropna()

In [11]:
# Confirm how many rows remain after dropping incomplete ones.
df.shape

(12210, 7)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Turn each comma-separated genre string into a space-separated one so the
# vectorizer can tokenize it. Each piece is stripped and the pieces are joined
# with a single space, so adjacent genres stay separate even when the source
# has no space after the comma ("Action,Comedy" must not become "ActionComedy").
new_genre=[
    ' '.join(part.strip() for part in x.split(','))
    for x in df['genre']
]
    

In [14]:
# Attach the cleaned genre strings as a new column; the list is assigned by
# position, so it must have been built from this same df.
df['new_genre']=new_genre

In [15]:
# TF-IDF vectorizer with default settings.
# NOTE(review): the default tokenizer splits on non-word characters, so a
# hyphenated genre such as "Sci-Fi" becomes the tokens "sci" and "fi" rather
# than a single feature — confirm that is acceptable.
tfidf=TfidfVectorizer()

In [16]:
# Learn the genre vocabulary and build one TF-IDF row per title.
genre_matrix=tfidf.fit_transform(df['new_genre'])

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Scaler used to standardize the numeric features.
scale=StandardScaler()

In [19]:
# Standardize episodes and rating column-wise. The statistics include the
# rows whose rating / episode count was imputed as 0 earlier.
scaled_values=scale.fit_transform(df[['episodes','rating']])

In [20]:
# Inspect the scaled array: column 0 is episodes, column 1 is rating.
scaled_values

array([[-0.23949859,  2.30692766],
       [ 1.11812483,  2.22217465],
       [ 0.83798032,  2.21446983],
       ...,
       [-0.17484985, -1.15253622],
       [-0.23949859, -1.07548803],
       [-0.23949859, -0.7056567 ]])

In [21]:
from scipy.sparse import hstack 

In [22]:
# Append the two scaled numeric columns to the TF-IDF genre columns. Rows line
# up because both inputs were built from the same df in the same order.
# NOTE(review): TF-IDF rows are unit-normalized by default while the scaled
# columns are not bounded, so episodes/rating can dominate the cosine
# similarity computed from this matrix — confirm the weighting is intended.
combined_features=hstack([genre_matrix,scaled_values])

In [23]:
# Show the stacked matrix's repr to confirm its shape and sparsity.
combined_features

<COOrdinate sparse matrix of dtype 'float64'
	with 65988 stored elements and shape (12210, 49)>

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Pairwise cosine similarity between every pair of titles. With a single
# argument the matrix is compared against itself. The result is a dense
# square array with one row and one column per title.
cosine_sim=cosine_similarity(combined_features)

In [26]:
# Zero the diagonal in place so a title's similarity to itself never ranks
# as its own top match.
np.fill_diagonal(cosine_sim,0)

In [27]:
# Label rows and columns with anime_id so similarities can be looked up by id
# instead of by row position.
cosine_df=pd.DataFrame(cosine_sim,index=df['anime_id'],columns=df['anime_id'])

In [61]:
def recommend(anime_id,threshold, n=5):
    """Return the names of the titles most similar to ``anime_id``.

    Reads the similarity row for ``anime_id`` from the module-level
    ``cosine_df``, keeps the entries scoring at least ``threshold`` and
    takes the ``n`` highest of them.

    Parameters
    ----------
    anime_id
        Label of the query title in ``cosine_df``'s index.
    threshold : float
        Minimum similarity (inclusive) a title needs to be recommended.
    n : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame
        One column, ``name``, with rows taken from ``df`` (original index
        labels kept) and ordered from most to least similar. Contains fewer
        than ``n`` rows, possibly none, when few titles reach ``threshold``.

    Raises
    ------
    KeyError
        If ``anime_id`` is not a label of ``cosine_df``.
    """
    sims = cosine_df.loc[anime_id]

    # The query title's own score is 0 (zeroed diagonal), which would still
    # pass a threshold <= 0; never recommend a title to itself.
    sims = sims.drop(labels=anime_id, errors='ignore')

    sims = sims[sims >= threshold]
    top_ids = sims.nlargest(n).index.tolist()

    # isin() yields rows in df's storage order, which discards the ranking
    # computed above; put the rows back into similarity order.
    rank_of = {aid: pos for pos, aid in enumerate(top_ids)}
    matches = df[df['anime_id'].isin(top_ids)]
    order = np.argsort(matches['anime_id'].map(rank_of).to_numpy(), kind='stable')

    return matches.iloc[order][['name']]


In [72]:
# Sweep the similarity cut-off for one query title (anime_id 32281) and record
# the recommended names at each level.
thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
results = [
    {
        'Threshold': cutoff,
        'Recommendations': recommend(32281, cutoff)['name'].tolist(),
    }
    for cutoff in thresholds
]

results_df = pd.DataFrame(results)
print(results_df)

   Threshold                                    Recommendations
0        0.2  [Sen to Chihiro no Kamikakushi, Hotarubi no Mo...
1        0.3  [Sen to Chihiro no Kamikakushi, Hotarubi no Mo...
2        0.4  [Sen to Chihiro no Kamikakushi, Hotarubi no Mo...
3        0.5  [Sen to Chihiro no Kamikakushi, Hotarubi no Mo...
4        0.6  [Sen to Chihiro no Kamikakushi, Hotarubi no Mo...
5        0.7  [Sen to Chihiro no Kamikakushi, Hotarubi no Mo...


In [78]:
# Lift the column-width limit (a session-wide display option) so the full
# recommendation list is shown, then select the row for threshold 0.2.
pd.set_option('display.max_colwidth', None)
results_df.loc[results_df['Threshold']==0.2, 'Recommendations']

0    [Sen to Chihiro no Kamikakushi, Hotarubi no Mori e, Angel Beats!, Kokoro ga Sakebitagatterunda., Yahari Ore no Seishun Love Comedy wa Machigatteiru. Zoku]
Name: Recommendations, dtype: object

1. Can you explain the difference between user-based and item-based collaborative filtering?

User-based collaborative filtering recommends items to a user based on the preferences of other users who are similar to them. The idea is that if two users have shown similar behavior in the past (such as rating or liking similar items), then they are likely to have similar preferences in the future. To generate recommendations, the system first finds the users who are most similar to the target user and then suggests items that those similar users have liked but the target user has not yet interacted with. This approach works well when user behavior is stable, but it scales poorly as the number of users grows, because user-to-user similarities are expensive to compute and must be refreshed whenever preferences change.

Item-based collaborative filtering, on the other hand, focuses on the similarity between items rather than users. It recommends items that are similar to those the user has already liked or interacted with. The system computes item-to-item similarity based on how users have rated or interacted with those items. Since item similarities change less frequently than user preferences, this approach is more scalable and stable, which is why it is commonly used in real-world systems like Amazon and Netflix.

2. What is collaborative filtering, and how does it work?

Collaborative filtering is a recommendation technique that predicts a user’s interests by using the collective behavior of many users. Instead of relying on item features or content descriptions (as a content-based recommender does, for example one that compares genres), it assumes that users who behaved similarly in the past will have similar preferences in the future. It uses interactions such as ratings, likes, purchases, or views to make recommendations.

Collaborative filtering works by first collecting user–item interaction data and then identifying patterns in this data. The system finds either similar users (user-based) or similar items (item-based) using similarity measures such as cosine similarity or Pearson correlation. Based on these similarities, the system recommends items that similar users liked or items similar to those the user has already interacted with.