# Data Preprocessing:

In [1]:
import pandas as pd

anime_df = pd.read_csv('anime.csv')
display(anime_df.head())

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [2]:
print("Missing values count per column:")
print(anime_df.isnull().sum())

print("\nMissing values percentage per column:")
print((anime_df.isnull().sum() / len(anime_df)) * 100)

Missing values count per column:
anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

Missing values percentage per column:
anime_id    0.000000
name        0.000000
genre       0.504311
type        0.203351
episodes    0.000000
rating      1.870831
members     0.000000
dtype: float64


In [3]:
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)
print("Missing values in 'rating' column after imputation:")
print(anime_df['rating'].isnull().sum())

Missing values in 'rating' column after imputation:
0


In [4]:
anime_df['genre'].fillna('Unknown', inplace=True)
print("Missing values in 'genre' column after imputation:")
print(anime_df['genre'].isnull().sum())

Missing values in 'genre' column after imputation:
0


In [5]:
anime_df['type'] = anime_df['type'].fillna(anime_df['type'].mode()[0])
print("Missing values in 'type' column after imputation:")
print(anime_df['type'].isnull().sum())

Missing values in 'type' column after imputation:
0


In [6]:
anime_df['episodes'] = anime_df['episodes'].fillna(anime_df['episodes'].mode()[0])
print("Missing values in 'episodes' column after imputation:")
print(anime_df['episodes'].isnull().sum())

Missing values in 'episodes' column after imputation:
0


In [7]:
print("Missing values after handling all columns:")
print(anime_df.isnull().sum())

Missing values after handling all columns:
anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [8]:
print("First 5 rows of the DataFrame:")
display(anime_df.head())

First 5 rows of the DataFrame:


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [9]:
print("DataFrame Information:")
anime_df.info()

DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12294 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [10]:
print("Descriptive statistics for numerical columns:")
display(anime_df.describe())

Descriptive statistics for numerical columns:


Unnamed: 0,anime_id,rating,members
count,12294.0,12294.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.017096,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.9,225.0
50%,10260.5,6.55,1550.0
75%,24794.5,7.17,9437.0
max,34527.0,10.0,1013917.0


In [11]:
print("Unique values and their counts for 'type' column:")
display(anime_df['type'].value_counts())

Unique values and their counts for 'type' column:


TV         3812
OVA        3311
Movie      2348
Special    1676
ONA         659
Music       488
Name: type, dtype: int64

# Feature Extraction:

In [12]:
features_df = anime_df[['genre', 'rating']]
print("First 5 rows of the features_df DataFrame:")
display(features_df.head())

First 5 rows of the features_df DataFrame:


Unnamed: 0,genre,rating
0,"Drama, Romance, School, Supernatural",9.37
1,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26
2,"Action, Comedy, Historical, Parody, Samurai, S...",9.25
3,"Sci-Fi, Thriller",9.17
4,"Action, Comedy, Historical, Parody, Samurai, S...",9.16


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
print("CountVectorizer imported successfully.")

CountVectorizer imported successfully.


In [14]:
vectorizer = CountVectorizer()
genre_matrix = vectorizer.fit_transform(features_df['genre'].fillna(''))

genre_df = pd.DataFrame(genre_matrix.toarray(), columns=vectorizer.get_feature_names_out())

features_df = pd.concat([features_df.drop('genre', axis=1), genre_df], axis=1)
print("First 5 rows of features_df after genre encoding:")
display(features_df.head())

First 5 rows of features_df after genre encoding:


Unnamed: 0,rating,action,adventure,ai,arts,cars,comedy,dementia,demons,drama,...,slice,space,sports,super,supernatural,thriller,unknown,vampire,yaoi,yuri
0,9.37,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,9.26,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,9.25,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9.17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,9.16,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
type_dummies = pd.get_dummies(anime_df['type'], prefix='type', dtype=int)
features_df = pd.concat([features_df, type_dummies], axis=1)

print("First 5 rows of features_df after type encoding:")
display(features_df.head())

First 5 rows of features_df after type encoding:


Unnamed: 0,rating,action,adventure,ai,arts,cars,comedy,dementia,demons,drama,...,unknown,vampire,yaoi,yuri,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,9.37,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,9.26,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,9.25,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,9.17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,9.16,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
features_df['rating'] = scaler.fit_transform(features_df[['rating']])

print("First 5 rows of features_df after scaling 'rating' column:")
display(features_df.head())

First 5 rows of features_df after scaling 'rating' column:


Unnamed: 0,rating,action,adventure,ai,arts,cars,comedy,dementia,demons,drama,...,unknown,vampire,yaoi,yuri,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,0.92437,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,0.911164,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0.909964,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.90036,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0.89916,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Recommendation System:

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
print("cosine_similarity imported successfully.")

cosine_similarity imported successfully.


In [18]:
cosine_sim = cosine_similarity(features_df)
print("Cosine similarity matrix calculated successfully.")

Cosine similarity matrix calculated successfully.


In [19]:
print("Shape of the cosine similarity matrix:", cosine_sim.shape)

Shape of the cosine similarity matrix: (12294, 12294)


In [20]:
def recommend_anime(anime_id, cosine_sim, anime_df, N=10):
    # Get the index of the anime that matches the anime_id
    idx = anime_df[anime_df['anime_id'] == anime_id].index[0]

    # Get the similarity scores for that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the N most similar anime (excluding itself)
    sim_scores = sim_scores[1:N+1]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top N most similar anime
    return anime_df.iloc[anime_indices]

print("recommend_anime function defined successfully.")

recommend_anime function defined successfully.


In [21]:
sample_anime_id = anime_df['anime_id'].iloc[0] # Using the first anime in the dataframe as an example
print(f"Recommendations for anime_id {sample_anime_id} ({anime_df[anime_df['anime_id'] == sample_anime_id]['name'].iloc[0]}):")
recommendations = recommend_anime(sample_anime_id, cosine_sim, anime_df)
display(recommendations)

Recommendations for anime_id 32281 (Kimi no Na wa.):


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1111,14669,Aura: Maryuuin Kouga Saigo no Tatakai,"Comedy, Drama, Romance, School, Supernatural",Movie,1,7.67,22599
208,28725,Kokoro ga Sakebitagatterunda.,"Drama, Romance, School",Movie,1,8.32,59652
1494,20903,Harmonie,"Drama, School, Supernatural",Movie,1,7.52,29029
1959,713,Air Movie,"Drama, Romance, Supernatural",Movie,1,7.39,44179
60,10408,Hotarubi no Mori e,"Drama, Romance, Shoujo, Supernatural",Movie,1,8.61,197439
1199,6408,&quot;Bungaku Shoujo&quot; Movie,"Drama, Mystery, Romance, School",Movie,1,7.63,40984
2103,1723,Clannad Movie,"Drama, Fantasy, Romance, School",Movie,1,7.35,99506
11082,33036,Suki ni Naru Sono Shunkan wo.: Kokuhaku Jikkou...,"Comedy, Drama, Romance, School",Movie,1,6.473902,10668
5805,547,Wind: A Breath of Heart OVA,"Drama, Romance, School, Supernatural",OVA,3,6.35,2043
894,10389,Momo e no Tegami,"Drama, Supernatural",Movie,1,7.78,30519


In [22]:
target_anime_id = anime_df['anime_id'].iloc[0] # Example: Kimi no Na wa.
target_anime_idx = anime_df[anime_df['anime_id'] == target_anime_id].index[0]

print(f"Target Anime ID: {target_anime_id}")
print(f"Target Anime Index: {target_anime_idx}")

Target Anime ID: 32281
Target Anime Index: 0


In [23]:
target_sim_scores = list(enumerate(cosine_sim[target_anime_idx]))
print("Extracted similarity scores for the target anime.")

Extracted similarity scores for the target anime.


In [24]:
sorted_sim_scores = sorted(target_sim_scores, key=lambda x: x[1], reverse=True)

# Experiment with a threshold (e.g., 0.7)
threshold = 0.7

# Filter recommendations based on the threshold, excluding the target anime itself
filtered_recommendations = [
    (idx, score) for idx, score in sorted_sim_scores 
    if score >= threshold and idx != target_anime_idx
]

print(f"Number of recommendations with similarity score >= {threshold}: {len(filtered_recommendations)}")
print("First 5 filtered similarity scores:")
print(filtered_recommendations[:5])

Number of recommendations with similarity score >= 0.7: 55
First 5 filtered similarity scores:
[(1111, 0.9171375481038241), (208, 0.9093124159019615), (1494, 0.9064711070578456), (1959, 0.9058460727108747), (60, 0.8261772427617735)]


In [25]:
recommended_anime_indices = [idx for idx, _ in filtered_recommendations]
final_recommendations = anime_df.iloc[recommended_anime_indices]

print(f"Recommendations for '{anime_df.loc[target_anime_idx, 'name']}' with similarity score >= {threshold}:")
display(final_recommendations)

Recommendations for 'Kimi no Na wa.' with similarity score >= 0.7:


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1111,14669,Aura: Maryuuin Kouga Saigo no Tatakai,"Comedy, Drama, Romance, School, Supernatural",Movie,1,7.67,22599
208,28725,Kokoro ga Sakebitagatterunda.,"Drama, Romance, School",Movie,1,8.32,59652
1494,20903,Harmonie,"Drama, School, Supernatural",Movie,1,7.52,29029
1959,713,Air Movie,"Drama, Romance, Supernatural",Movie,1,7.39,44179
60,10408,Hotarubi no Mori e,"Drama, Romance, Shoujo, Supernatural",Movie,1,8.61,197439
1199,6408,&quot;Bungaku Shoujo&quot; Movie,"Drama, Mystery, Romance, School",Movie,1,7.63,40984
2103,1723,Clannad Movie,"Drama, Fantasy, Romance, School",Movie,1,7.35,99506
11082,33036,Suki ni Naru Sono Shunkan wo.: Kokuhaku Jikkou...,"Comedy, Drama, Romance, School",Movie,1,6.473902,10668
5805,547,Wind: A Breath of Heart OVA,"Drama, Romance, School, Supernatural",OVA,3,6.35,2043
894,10389,Momo e no Tegami,"Drama, Supernatural",Movie,1,7.78,30519


# Interview Questions:

In [26]:
#  1. What is Collaborative Filtering and How Does It Work?

#Collaborative Filtering (CF) is a technique used by recommendation engines (like Netflix, Spotify, or Amazon) to predict what a user might like based on the preferences of many users.

#The core philosophy is: "If Person A and Person B agree on one issue, they are likely to agree on others."
#How It Works:

#Unlike Content-Based Filtering, which looks at the features of an item (e.g., "this movie is a Western"), Collaborative Filtering doesn't need to know anything about the items themselves. It only looks at the user-item interactions, such as:

 #   Ratings (1-5 stars)

  #  Clicks or Views

   # Purchases

#It creates a User-Item Matrix where rows represent users and columns represent items. Most cells are empty (since no one buys everything), and the algorithm’s job is to fill in those blanks using mathematical similarity.

In [27]:
#2. User-Based vs. Item-Based Collaborative Filtering

#While both methods use the same underlying data, they approach the "similarity" calculation from different angles.
#User-Based Collaborative Filtering

#This method finds users who are similar to you and recommends items they liked that you haven't seen yet.

#    The Logic: "Users who are like you also bought..."
#
#    The Process: 1. Look at User A’s history.
 #   2. Find "neighbors" (User B, C, D) who have high correlation in their ratings.
  #  3. Recommend items that those neighbors rated highly but User A hasn't interacted with.

#Item-Based Collaborative Filtering

#Instead of looking for similar people, this looks for items that are frequently liked by the same group of people.

 #   The Logic: "Users who liked this item also liked..."

  #  The Process:

   #     Look at Item X (which the user just liked).

    #    Find other items (Item Y, Z) that have similar rating patterns across the entire database.

     #   Recommend those similar items