In [22]:
import pandas as pd
# Loading the dataset
data = pd.read_csv('anime.csv')
#information about the dataset
print(data.info())
# Handle missing values
data = data.dropna(subset=['name', 'genre', 'rating', 'members'])
#dataset
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Advent

In [23]:
#Feature Extraction
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
data['genre'] = data['genre'].apply(lambda x: x.split(', '))
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(data['genre'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)
features = pd.concat([data[['rating', 'members']], genre_df], axis=1)
# Normalize numerical features
scaler = StandardScaler()
features[['rating', 'members']] = scaler.fit_transform(features[['rating', 'members']])

print(features.head())

     rating    members  Action  Adventure  Cars  Comedy  Dementia  Demons  \
0  2.824474   3.292044     0.0        0.0   0.0     0.0       0.0     0.0   
1  2.717032  14.002410     1.0        1.0   0.0     0.0       0.0     0.0   
2  2.707265   1.732216     1.0        0.0   0.0     1.0       0.0     0.0   
3  2.629126  11.833499     0.0        0.0   0.0     0.0       0.0     0.0   
4  2.619358   2.400518     1.0        0.0   0.0     1.0       0.0     0.0   

   Drama  Ecchi  ...  Shounen Ai  Slice of Life  Space  Sports  Super Power  \
0    1.0    0.0  ...         0.0            0.0    0.0     0.0          0.0   
1    1.0    0.0  ...         0.0            0.0    0.0     0.0          0.0   
2    0.0    0.0  ...         0.0            0.0    0.0     0.0          0.0   
3    0.0    0.0  ...         0.0            0.0    0.0     0.0          0.0   
4    0.0    0.0  ...         0.0            0.0    0.0     0.0          0.0   

   Supernatural  Thriller  Vampire  Yaoi  Yuri  
0           1

In [24]:
#Recommendation System
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
# Create a DataFrame for encoded genres
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)
# Combine with the original data
features = pd.concat([data[['rating', 'members']], genre_df], axis=1)
# Check for NaN values in features
print("Checking for NaN values in features:")
print(features.isnull().sum())
features = features.dropna()
# Normalize numerical features
scaler = StandardScaler()
features[['rating', 'members']] = scaler.fit_transform(features[['rating', 'members']])
cosine_sim = cosine_similarity(features)

def recommend_anime(target_anime, cosine_sim, data, top_n=5):
    # Get the index of the target anime
    idx = data[data['name'] == target_anime].index
    if len(idx) == 0:
        print(f"Anime '{target_anime}' not found in the dataset.")
        return []
    idx = idx[0]

    # Get the pairwise similarity scores for all anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top_n most similar anime
    sim_scores = sim_scores[1:top_n + 1]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top_n most similar anime
    return data['name'].iloc[anime_indices]
recommended_anime = recommend_anime('Naruto', cosine_sim, data, top_n=5)
print(recommended_anime)


Checking for NaN values in features:
rating           267
members          267
Action           267
Adventure        267
Cars             267
Comedy           267
Dementia         267
Demons           267
Drama            267
Ecchi            267
Fantasy          267
Game             267
Harem            267
Hentai           267
Historical       267
Horror           267
Josei            267
Kids             267
Magic            267
Martial Arts     267
Mecha            267
Military         267
Music            267
Mystery          267
Parody           267
Police           267
Psychological    267
Romance          267
Samurai          267
School           267
Sci-Fi           267
Seinen           267
Shoujo           267
Shoujo Ai        267
Shounen          267
Shounen Ai       267
Slice of Life    267
Space            267
Sports           267
Super Power      267
Supernatural     267
Thriller         267
Vampire          267
Yaoi             267
Yuri             267
dtype: int64
615  

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Sample true labels (these should be representative of what you expect for evaluation)
true_labels = ['One Piece', 'Bleach', 'Death Note']  # Replace with appropriate ground truth values

# Get predicted recommendations for a specific anime (ensure it has enough recommendations)
predicted_labels = recommend_anime('Naruto', cosine_sim, data, top_n=5).tolist()

# To evaluate, we need to align the lengths of true_labels and predicted_labels
# If the lengths are inconsistent, you might want to adjust the number of true labels or predicted labels

# Example: Ensure true_labels have the same number of elements as predicted_labels
# If you have less true labels than predicted, you can either slice predicted_labels or vice versa
min_length = min(len(true_labels), len(predicted_labels))
true_labels = true_labels[:min_length]
predicted_labels = predicted_labels[:min_length]

# Assuming the labels are binary (1 for match, 0 for no match)
y_true = [1 if label in true_labels else 0 for label in predicted_labels]
y_pred = [1] * len(predicted_labels)  # All predicted are considered as positive

# Compute evaluation metrics
precision = precision_score(y_true, y_pred, average='binary')
recall = recall_score(y_true, y_pred, average='binary')
f1 = f1_score(y_true, y_pred, average='binary')

# Print evaluation metrics
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')


Precision: 0.33, Recall: 1.00, F1 Score: 0.50


In [28]:
1.User-based Collaborative Filtering:Recommends items to users based on the preferences of similar users. It identifies users who are similar to the target user and recommends items they likes
Item-based collaborative Filtering:Recommends items based on the similarity between items. It identifies items that are similar to what the user has liked in the past and suggests thos
2.Collaborative Filtering is a technique used in recommendation systems that makes predictions about users interests by collecting preferences from many users
It works by analyzing the patterns of users and items, levaraging similarities among users or items to suggest new ones

SyntaxError: invalid decimal literal (818687401.py, line 1)