In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('anime.csv')

In [2]:
# Display first few rows
print(df.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [3]:

# Check for missing values
print(df.isnull().sum())


anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [4]:

# Get basic info
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


We can see:

genre has 62 missing values.

type has 25 missing values.

rating has 230 missing values.

episodes column is of type object (string), probably because it contains 'Unknown' or similar.



In [5]:
# Fill missing genre and type
df['genre'].fillna('Unknown', inplace=True)
df['type'].fillna('Unknown', inplace=True)

# Fill missing ratings with mean rating
df['rating'].fillna(df['rating'].mean(), inplace=True)

# Replace 'Unknown' in episodes with 0, then convert to numeric
df['episodes'] = df['episodes'].replace('Unknown', 0)
df['episodes'] = pd.to_numeric(df['episodes'])

# Check if any missing values left
print(df.isnull().sum())


anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['genre'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['type'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

# Split genres into lists
df['genre_list'] = df['genre'].apply(lambda x: x.split(', '))

# Multi-hot encode genres
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df['genre_list'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

# Select numerical features
numerical_features = df[['episodes', 'rating', 'members']].copy()

# Normalize numerical features
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(numerical_features)
numerical_df = pd.DataFrame(numerical_scaled, columns=numerical_features.columns)

# Combine genre and numerical features
final_features = pd.concat([genre_df, numerical_df], axis=1)

print(final_features.head())


   Action  Adventure  Cars  Comedy  Dementia  Demons  Drama  Ecchi  Fantasy  \
0       0          0     0       0         0       0      1      0        0   
1       1          1     0       0         0       0      1      0        1   
2       1          0     0       1         0       0      0      0        0   
3       0          0     0       0         0       0      0      0        0   
4       1          0     0       1         0       0      0      0        0   

   Game  ...  Super Power  Supernatural  Thriller  Unknown  Vampire  Yaoi  \
0     0  ...            0             1         0        0        0     0   
1     0  ...            0             0         0        0        0     0   
2     0  ...            0             0         0        0        0     0   
3     0  ...            0             0         1        0        0     0   
4     0  ...            0             0         0        0        0     0   

   Yuri  episodes    rating   members  
0     0  0.000550  0.9

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix
cosine_sim_matrix = cosine_similarity(final_features)

# Build a reverse mapping from anime name to index
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

def recommend_anime(title, top_n=5):
    if title not in indices:
        return "Anime not found in the dataset."
    
    idx = indices[title]
    
    # Get pairwise similarity scores for this anime
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    
    # Sort by similarity score in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the top_n most similar anime (excluding itself)
    sim_scores = sim_scores[1:top_n+1]
    
    # Get the indices of these anime
    anime_indices = [i[0] for i in sim_scores]
    
    # Return the titles
    return df['name'].iloc[anime_indices].tolist()

# Example usage
print(recommend_anime('Naruto', top_n=5))


['Naruto: Shippuuden', 'Naruto: Shippuuden Movie 4 - The Lost Tower', 'Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono', 'Boruto: Naruto the Movie', 'Naruto x UT']


In [11]:
def evaluate_recommendations(df_eval, top_n=5):
    match_counts = 0
    total = 0

    for title in df_eval['name'].head(20):  # Evaluate on first 20 as example
        recommended = recommend_anime(title, top_n=top_n)
        if isinstance(recommended, str):
            continue

        # Get genres of original anime
        original_row = df[df['name'] == title]
        if original_row.empty:
            continue
        original_genres = set(original_row['genre'].iloc[0].split(', '))

        for rec_title in recommended:
            rec_row = df[df['name'] == rec_title]
            if rec_row.empty:
                continue
            rec_genres = set(rec_row['genre'].iloc[0].split(', '))

            # Count if they share at least one genre
            if original_genres & rec_genres:
                match_counts += 1

        total += top_n

    precision_at_k = match_counts / total if total > 0 else 0
    return precision_at_k

# Example evaluation
print(f"Approximate Precision@5: {evaluate_recommendations(test_df, top_n=5):.2f}")


Approximate Precision@5: 1.00


# Collaborative Filtering – Key Interview Points  

---

## 1. Difference Between **User‑Based** and **Item‑Based** Collaborative Filtering  

| Feature | User‑Based CF | Item‑Based CF |
|---------|---------------|---------------|
| **What it searches for** | Users whose historical interactions are similar to the target user | Items that receive similar interaction patterns to the items the target user already likes |
| **Core idea** | “People who behave like you also liked … ” | “Items similar to what you liked … ” |
| **Similarity computed between** | Users (rows in the user‑item matrix) | Items (columns in the user‑item matrix) |
| **Prediction step** | Aggregate ratings from nearest‑neighbor users for an unseen item | Aggregate ratings of similar items already rated by the user |
| **Typical pros** | Can capture complex taste profiles | More stable when user base is large; easier to cache similarity matrix |
| **Typical cons** | Scalability drops as users ↑; cold‑start for new users | Struggles with brand‑new items; assumes item similarity is stable |

---

## 2. What Is Collaborative Filtering, and How Does It Work?  

1. **Collect interaction data** – e.g. ratings, purchases, clicks.  
2. **Construct a user‑item matrix** – rows = users, columns = items, cells = interactions.  
3. **Measure similarity** – cosine, Pearson, Jaccard, etc., either **between users** or **between items**.  
4. **Predict unknown interactions** – use weighted average from nearest neighbors.  
5. **Recommend** – rank the top‑N highest predicted items the user has not yet seen.  

*Variants*  
- **Memory‑based** (nearest‑neighbor as above).  
- **Model‑based** (matrix factorization, neural CF, factorization machines).  

---
