In [38]:
# import necessary libraries
import pandas as pd
import numpy as np
from google.colab import files
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [39]:
df = pd.read_csv('anime.csv')
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [40]:
#Drop rows with missing values in 'genre', 'type', and 'rating'
df_cleaned = df.dropna(subset=['genre', 'type', 'rating'])

In [41]:
#Display basic information about the cleaned dataset
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12017 non-null  int64  
 1   name      12017 non-null  object 
 2   genre     12017 non-null  object 
 3   type      12017 non-null  object 
 4   episodes  12017 non-null  object 
 5   rating    12017 non-null  float64
 6   members   12017 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 751.1+ KB


In [42]:
df_cleaned.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [43]:
#feature Extraction

In [44]:
 #One-Hot Encode the 'genre' column
encoder = OneHotEncoder()
encoded_genres = encoder.fit_transform(df_cleaned[['genre']]).toarray()
encoded_genres_df = pd.DataFrame(encoded_genres, columns=encoder.get_feature_names_out(['genre']))

In [45]:
#Normalize the 'rating' and 'members' columns
scaler = StandardScaler()
df_cleaned[['rating', 'members']] = scaler.fit_transform(df_cleaned[['rating', 'members']])

In [46]:
#Combine the processed features into a single DataFrame
features_df = pd.concat([df_cleaned[['rating', 'members']], encoded_genres_df], axis=1)

In [47]:
#recomendation System

In [48]:
#Impute missing values (if any) in the features DataFrame
imputer = SimpleImputer(strategy='mean') #You can choose a different strategy if needed
features_df_imputed = pd.DataFrame(imputer.fit_transform(features_df), columns=features_df.columns)

#Compute Cosine Similarity
similarity_matrix = cosine_similarity(features_df_imputed)

In [49]:
#  Function to recommend anime based on cosine similarity
def recommend_anime(anime_id, similarity_matrix, df, threshold=0.5, num_recommendations=10): # Added num_recommendations argument with default value 10
    sim_scores = list(enumerate(similarity_matrix[anime_id]))
    sim_scores = [i for i in sim_scores if i[1] >= threshold]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[:num_recommendations] #Added this line to only include top num_recommendations
    return [df['name'].iloc[i[0]] for i in sim_scores]

In [50]:
# Example usage: Recommend anime similar to the first anime in the dataset
recommendations = recommend_anime(0, similarity_matrix, df_cleaned)
print("Recommendations:")
recommendations

Recommendations:


['Kimi no Na wa.',
 'Haikyuu!! Second Season',
 'Monogatari Series: Second Season',
 'Ookami Kodomo no Ame to Yuki',
 'Ansatsu Kyoushitsu (TV) 2nd Season',
 'Kuroko no Basket 3rd Season',
 'Suzumiya Haruhi no Shoushitsu',
 'Steins;Gate Movie: Fuka Ryouiki no Déjà vu',
 'Hajime no Ippo',
 'Hotarubi no Mori e']

In [55]:
#Evaluation

import warnings
warnings.filterwarnings("ignore")

#  Split the dataset into training and testing sets
train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)

# Evaluate the recommendation system with a sample of name
test_name = test_df['name'].sample(10) # Select a sample of name for testing

precision = []
recall = []
f1 = []

for target_name in test_name:
    # Added similarity_matrix and df_cleaned as arguments
    recommended_anime = recommend_anime(test_df[test_df['name'] == target_name].index[0], similarity_matrix, df_cleaned, threshold=0.7, num_recommendations=10)

    # Get the relevant anime from test_df based on target_name
    relevant_anime = test_df[test_df['name'] == target_name]['name'].tolist() # Convert relevant anime to a list

    # Create a boolean array indicating which anime in recommended_anime are in relevant_anime
    relevant_anime_bool = np.isin(recommended_anime, relevant_anime) # Check if recommended anime are in relevant anime

    # Calculate metrics if there are recommendations
    if relevant_anime_bool.any(): # Check if any recommended anime is relevant
        # Compare recommended_anime against relevant_anime
        p, r, f, _ = precision_recall_fscore_support(
            relevant_anime_bool, relevant_anime_bool, average='weighted' #Use the same array for y_true and y_pred
        )
        precision.append(p)
        recall.append(r)
        f1.append(f)

# Calculate average metrics
if precision and recall and f1:
    avg_precision = sum(precision) / len(precision)
    avg_recall = sum(recall) / len(recall)
    avg_f1 = sum(f1) / len(f1)
    print(f'Average Precision: {avg_precision:.3f}, Average Recall: {avg_recall:.3f}, Average F1-score: {avg_f1:.3f}')
else:
    print("No recommendations to evaluate.")

Average Precision: 1.000, Average Recall: 1.000, Average F1-score: 1.000
