In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv(r'D:\AI assignment\anime.csv')

# Preview dataset
print("Original shape:", df.shape)
print(df.head())

# Drop rows with missing name or genre
df.dropna(subset=['name', 'genre'], inplace=True)

# Fill missing 'rating' and 'episodes' with median
df['rating'] = df['rating'].fillna(df['rating'].median())
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['episodes'] = df['episodes'].fillna(df['episodes'].median())

# Genre preprocessing
df['genre'] = df['genre'].apply(lambda x: [i.strip().lower() for i in x.split(',')])

# Use MultiLabelBinarizer to convert genre to one-hot
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(df['genre']), columns=mlb.classes_)

# One-hot encode the 'type' column (TV, Movie, etc.)
type_encoded = pd.get_dummies(df['type'].fillna('Unknown'))

# Normalize numeric features
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df[['rating', 'members', 'episodes']])
scaled_df = pd.DataFrame(scaled_features, columns=['rating_scaled', 'members_scaled', 'episodes_scaled'])

# Combine all features
features_df = pd.concat([genre_encoded, type_encoded, scaled_df], axis=1)

# Compute cosine similarity
cosine_sim = cosine_similarity(features_df)

# Reset index mapping for lookup
df = df.reset_index(drop=True)

# Function to recommend anime based on a given anime name
def recommend_anime(title, top_n=10):
    if title not in df['name'].values:
        print(f"Anime '{title}' not found in the dataset.")
        return
    idx = df[df['name'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    anime_indices = [i[0] for i in sim_scores]
    print(f"\nTop {top_n} recommendations for '{title}':\n")
    print(df[['name', 'genre', 'type', 'rating']].iloc[anime_indices])

# Example usage
recommend_anime('Naruto')  # Replace with any anime name in your dataset


Original shape: (12294, 7)
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


ValueError: Input contains NaN.