# Data Preprocessing

In [1]:
import pandas as pd

# Load the dataset
anime_df = pd.read_csv('anime.csv')

# Display the first few rows of the dataframe
print(anime_df.head())

# Check for missing values
missing_values = anime_df.isnull().sum()
print("Missing values:\n", missing_values)

# Fill missing values
anime_df['genre'].fillna('Unknown', inplace=True)
anime_df['type'].fillna('Unknown', inplace=True)
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)

# Verify if all missing values have been handled
missing_values_after_cleaning = anime_df.isnull().sum()
print("Missing values after cleaning:\n", missing_values_after_cleaning)


   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
Missing values:
 anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64
Miss

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['genre'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['type'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

# Feature Extraction

In [2]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# One-hot encode the 'genre' column
onehot_encoder = OneHotEncoder()
genre_encoded = onehot_encoder.fit_transform(anime_df[['genre']]).toarray()
genre_df = pd.DataFrame(genre_encoded, columns=onehot_encoder.get_feature_names_out(['genre']))

# Normalize the 'members' column
scaler = MinMaxScaler()
members_normalized = scaler.fit_transform(anime_df[['members']])
members_df = pd.DataFrame(members_normalized, columns=['members_normalized'])

# Combine the original dataframe with the encoded and normalized features
anime_features_df = pd.concat([anime_df[['anime_id', 'name', 'rating']], genre_df, members_df], axis=1)

# Display the first few rows of the processed dataframe
print(anime_features_df.head())


   anime_id                              name  rating  genre_Action  \
0     32281                    Kimi no Na wa.    9.37           0.0   
1      5114  Fullmetal Alchemist: Brotherhood    9.26           0.0   
2     28977                          Gintama°    9.25           0.0   
3      9253                       Steins;Gate    9.17           0.0   
4      9969                     Gintama&#039;    9.16           0.0   

   genre_Action, Adventure  \
0                      0.0   
1                      0.0   
2                      0.0   
3                      0.0   
4                      0.0   

   genre_Action, Adventure, Cars, Comedy, Sci-Fi, Shounen  \
0                                                0.0        
1                                                0.0        
2                                                0.0        
3                                                0.0        
4                                                0.0        

   genre_Action, Adventur

# Recommendation System

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_anime(anime_name, num_recommendations=5):
    # Find the index of the target anime
    target_anime = anime_features_df[anime_features_df['name'] == anime_name].index[0]
    
    # Compute the cosine similarity matrix
    similarity_matrix = cosine_similarity(anime_features_df.drop(['anime_id', 'name'], axis=1))
    
    # Get the indices of the most similar animes
    similar_indices = similarity_matrix[target_anime].argsort()[::-1][1:num_recommendations + 1]
    
    # Return the recommended animes
    return anime_df.iloc[similar_indices][['name', 'rating', 'genre', 'type', 'episodes', 'members']]

# Example usage
recommendations = recommend_anime('Kimi no Na wa.')
print(recommendations)


                                          name  rating  \
5805               Wind: A Breath of Heart OVA    6.35   
6394              Wind: A Breath of Heart (TV)    6.14   
10464  Taka no Tsume 8: Yoshida-kun no X-Files   10.00   
10400              Spoon-hime no Swing Kitchen    9.60   
9595                          Mogura no Motoro    9.50   

                                      genre   type episodes  members  
5805   Drama, Romance, School, Supernatural    OVA        3     2043  
6394   Drama, Romance, School, Supernatural     TV       13     7778  
10464                        Comedy, Parody  Movie        1       13  
10400                       Adventure, Kids     TV  Unknown       47  
9595                          Slice of Life  Movie        1       62  


#  Evaluation

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)

# Assuming we have a function to get recommendations
# and a way to measure true positives, false positives, and false negatives


def evaluate_recommendation_system(test_anime_name):
    recommendations = recommend_anime(test_anime_name)
    recommended_anime_names = recommendations['name'].values
    
    # Calculate precision, recall, and F1-score
    true_positives = sum(test_df['name'].isin(recommended_anime_names))
    false_positives = len(recommended_anime_names) - true_positives
    false_negatives = len(test_df) - true_positives
    
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2 * (precision * recall) / (precision + recall)
    
    return precision, recall, f1

# evaluation
precision, recall, f1 = evaluate_recommendation_system('Kimi no Na wa.')
print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1}")


Precision: 0.2, Recall: 0.00040666937779585197, F1-score: 0.0008116883116883117
