# snehal Jagtap

### Recommendation System

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

###  Step 1: Data Preprocessing

In [None]:
# Read the anime catalogue from the CSV file into a DataFrame
anime_df = pd.read_csv(filepath_or_buffer='anime.csv')

In [None]:
# Preview the first five rows of the dataframe
print(anime_df.head(n=5))

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [None]:
# Count the missing values in every column
missing_per_column = anime_df.isna().sum()
print(missing_per_column)

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [None]:
# Show the dtype pandas inferred for each column
column_types = anime_df.dtypes
print(column_types)

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object


In [None]:
# List the distinct raw values found in the 'episodes' column
print("Unique values in 'episodes' column:", anime_df['episodes'].unique(), sep="\n")

Unique values in 'episodes' column:
['1' '64' '51' '24' '10' '148' '110' '13' '201' '25' '22' '75' '4' '26'
 '12' '27' '43' '74' '37' '2' '11' '99' 'Unknown' '39' '101' '47' '50'
 '62' '33' '112' '23' '3' '94' '6' '8' '14' '7' '40' '15' '203' '77' '291'
 '120' '102' '96' '38' '79' '175' '103' '70' '153' '45' '5' '21' '63' '52'
 '28' '145' '36' '69' '60' '178' '114' '35' '61' '34' '109' '20' '9' '49'
 '366' '97' '48' '78' '358' '155' '104' '113' '54' '167' '161' '42' '142'
 '31' '373' '220' '46' '195' '17' '1787' '73' '147' '127' '16' '19' '98'
 '150' '76' '53' '124' '29' '115' '224' '44' '58' '93' '154' '92' '67'
 '172' '86' '30' '276' '59' '72' '330' '41' '105' '128' '137' '56' '55'
 '65' '243' '193' '18' '191' '180' '91' '192' '66' '182' '32' '164' '100'
 '296' '694' '95' '68' '117' '151' '130' '87' '170' '119' '84' '108' '156'
 '140' '331' '305' '300' '510' '200' '88' '1471' '526' '143' '726' '136'
 '1818' '237' '1428' '365' '163' '283' '71' '260' '199' '225' '312' '240'
 '1306' '15

In [None]:
# List the distinct values found in the 'rating' column
rating_values = anime_df['rating'].unique()
print("Unique values in 'rating' column:")
print(rating_values)

Unique values in 'rating' column:
[ 9.37        9.26        9.25        9.17        9.16        9.15
  9.13        9.11        9.1         9.06        9.05        9.04
  8.98        8.93        8.92        8.88        8.84        8.83
  8.82        8.81        8.8         8.78        8.77        8.76
  8.75        8.74        8.73        8.72        8.71        8.69
  8.68        8.67        8.66        8.65        8.64        8.62
  8.61        8.6         8.59        8.58        8.57        8.56
  8.55        8.54        8.53        8.52        8.51        8.5
  8.49        8.48        8.47        8.46        8.45        8.44
  8.43        8.42        8.41        8.4         8.39        8.38
  8.37        8.36        8.35        8.34        8.33        8.32
  8.31        8.3         8.29        8.28        8.27        8.26
  8.25        8.24        8.23        8.22        8.21        8.2
  8.19        8.18        8.17        8.16        8.15        8.14
  8.13        8.12        8.11

In [None]:
# Parse 'episodes' as numbers; entries that cannot be parsed become NaN
episodes_numeric = pd.to_numeric(anime_df['episodes'], errors='coerce')
anime_df['episodes'] = episodes_numeric

In [None]:
# Parse 'rating' as numbers; entries that cannot be parsed become NaN
rating_numeric = pd.to_numeric(anime_df['rating'], errors='coerce')
anime_df['rating'] = rating_numeric

In [None]:
# Replace the NaNs left by the numeric coercion with each column's mean.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves the DataFrame unchanged once
# Copy-on-Write is active.
anime_df['episodes'] = anime_df['episodes'].fillna(anime_df['episodes'].mean())
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())

In [None]:
# Confirm how many missing values remain in each column
remaining_missing = anime_df.isna().sum()
print(remaining_missing)

anime_id     0
name         0
genre        0
type        25
episodes     0
rating       0
members      0
dtype: int64


In [None]:
# Min-max scale the ratings (MinMaxScaler's default output range) into a new column
scaler = MinMaxScaler()
rating_scaled = scaler.fit_transform(anime_df[['rating']])
anime_df['rating_normalized'] = rating_scaled

In [None]:
# Preview the first rows to confirm the converted and added columns
preview = anime_df.head()
print(preview)

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type  episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie       1.0    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV      64.0    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV      51.0    9.25   
3                                   Sci-Fi, Thriller     TV      24.0    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV      51.0    9.16   

   members  rating_normalized  
0   200630           0.924370  
1   793665           0.911164  
2   114262           0.909964  
3   673572           0.900360  
4   151266           0.899160  


In [None]:
# Fill missing values with simple defaults (customize as needed):
# a placeholder label for genre, the column mean for the numeric columns.
# Each result is assigned back to its column; fillna(inplace=True) on a column
# selection is a chained in-place update that pandas 2.x deprecates and that
# does not modify the DataFrame under Copy-on-Write.
anime_df['genre'] = anime_df['genre'].fillna('Unknown')
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())
anime_df['episodes'] = anime_df['episodes'].fillna(anime_df['episodes'].mean())

In [None]:
# Re-count missing values per column after the fills
print(anime_df.isna().sum())

anime_id              0
name                  0
genre                 0
type                 25
episodes              0
rating                0
members               0
rating_normalized     0
dtype: int64


### Step 2: Feature Extraction

In [None]:
# Combine the title and genre text into a single string per anime for vectorization.
# The title lives in 'name' in this dataset; 'anime_title' is accepted as a fallback.
title_column = next(
    (col for col in ('name', 'anime_title') if col in anime_df.columns),
    None,
)
if title_column is None:
    # Stop here instead of printing a message and letting a later cell fail
    # on the missing 'combined_features' column.
    raise KeyError("Title column not found. Please verify the dataset structure.")

# fillna('') prevents one missing title or genre from turning the whole
# combined string into NaN, which TfidfVectorizer rejects as a document.
anime_df['combined_features'] = (
    anime_df[title_column].fillna('') + ' ' + anime_df['genre'].fillna('')
)

In [None]:
# Preview the dataframe, now including the combined text column
print(anime_df.head(n=5))

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type  episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie       1.0    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV      64.0    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV      51.0    9.25   
3                                   Sci-Fi, Thriller     TV      24.0    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV      51.0    9.16   

   members  rating_normalized  \
0   200630           0.924370   
1   793665           0.911164   
2   114262           0.909964   
3   673572           0.900360   
4   151266           0.899160   

                   

In [None]:
# Vectorize the combined text into a TF-IDF weighted matrix,
# dropping common English stop words
tfidf = TfidfVectorizer(stop_words='english')
combined_text = anime_df['combined_features']
tfidf_matrix = tfidf.fit_transform(combined_text)

In [None]:
# Min-max scale the ratings into 'rating_normalized' with a freshly created scaler
scaler = MinMaxScaler()
scaler.fit(anime_df[['rating']])
anime_df['rating_normalized'] = scaler.transform(anime_df[['rating']])

### Step 3: Recommendation System

In [None]:
# Pairwise cosine similarity between every pair of anime;
# with a single argument the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Function to recommend anime based on cosine similarity
def recommend_anime(title, cosine_sim=None, df=None):
    """Return the titles of up to 10 anime most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in the title column of ``df``.
    cosine_sim : array-like, optional
        Square similarity matrix whose rows/columns follow the row order of
        ``df``. When omitted, the notebook-level ``cosine_sim`` is used.
    df : pandas.DataFrame, optional
        Frame holding the titles. When omitted, the notebook-level
        ``anime_df`` is used.

    Returns
    -------
    pandas.Series or str
        The recommended titles ordered from most to least similar, or a
        not-found message string when ``title`` is absent from ``df``.
    """
    # Resolve the defaults at call time so that re-running an upstream cell is
    # picked up without having to re-run this definition.
    if cosine_sim is None:
        cosine_sim = globals()['cosine_sim']
    if df is None:
        df = anime_df

    # Pick the column that holds the title ('title' is the last-resort name).
    if 'name' in df.columns:
        title_column = 'name'
    elif 'anime_title' in df.columns:
        title_column = 'anime_title'
    else:
        title_column = 'title'

    # Locate the row by *position*: the similarity matrix is positional, while
    # df.index may carry arbitrary labels.
    positions = np.flatnonzero((df[title_column] == title).to_numpy())
    if positions.size == 0:
        return f"Anime titled '{title}' not found in the dataset."
    idx = int(positions[0])

    # Rank every row by similarity, highest first; the stable sort keeps ties
    # in their original row order.
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly instead of assuming it sorted first
    # (rows with identical text tie with it at the top).
    anime_indices = ranked[ranked != idx][:10]

    return df[title_column].iloc[anime_indices]

In [None]:
# Try the recommender on a well-known title
naruto_recommendations = recommend_anime('Naruto')
print(naruto_recommendations)

719                            The Last: Naruto the Movie
615                                    Naruto: Shippuuden
1343                                          Naruto x UT
486                              Boruto: Naruto the Movie
2458                 Naruto Shippuuden: Sunny Side Battle
784            Naruto: Shippuuden Movie 6 - Road to Ninja
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
2416    Naruto: Honoo no Chuunin Shiken! Naruto vs. Ko...
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
Name: name, dtype: object


### Step 4: Evaluation (dummy implementation for structure)

1. Create User-Anime Interaction Data

In [None]:
# Create a dummy user-interaction dataset because no real one is available.
# A seeded generator makes the simulated user ids identical on every run.
rng = np.random.default_rng(42)
anime_df['user_id'] = rng.integers(1, 50, size=len(anime_df))  # simulated ids in 1..49 (upper bound exclusive)
# NOTE: this replaces the values loaded into 'anime_id' with the row index for this example.
anime_df['anime_id'] = anime_df.index
anime_df['rating'] = pd.to_numeric(anime_df['rating'], errors='coerce')  # Ensure ratings are numeric

# Display a few rows of this user interaction data
print(anime_df[['user_id', 'anime_id', 'rating']].head())

   user_id  anime_id  rating
0        8         0    9.37
1        1         1    9.26
2       47         2    9.25
3       32         3    9.17
4       12         4    9.16


2. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)

# Report how many rows landed in each part
for label, part in (("Training set size:", train_df), ("Testing set size:", test_df)):
    print(label, len(part))

Training set size: 9835
Testing set size: 2459


3. Build the Recommendation System

In [None]:
# Rebuild the cosine similarity matrix using only the training set.
# A dedicated vectorizer is fitted here so the shared `tfidf` object keeps the
# vocabulary it learned on the full dataset and stays consistent with
# `tfidf_matrix` / `cosine_sim`.
tfidf_train = TfidfVectorizer(stop_words='english')
tfidf_matrix_train = tfidf_train.fit_transform(train_df['combined_features'])
cosine_sim_train = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)

4. Generate Recommendations

In [None]:
# Recommendation function backed by the training-set cosine similarity
def recommend_anime_from_train(title, df=None, cosine_sim=None):
    """Return up to 10 training-set titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in the title column of ``df``.
    df : pandas.DataFrame, optional
        Frame holding the titles. When omitted, the notebook-level
        ``train_df`` is used.
    cosine_sim : array-like, optional
        Square similarity matrix whose rows/columns follow the row order of
        ``df``. When omitted, the notebook-level ``cosine_sim_train`` is used.

    Returns
    -------
    list
        Recommended titles ordered from most to least similar; an empty list
        when ``title`` is not present in ``df``.
    """
    # Resolve the defaults at call time so that re-running the split or the
    # similarity cell is picked up without re-running this definition.
    if df is None:
        df = train_df
    if cosine_sim is None:
        cosine_sim = cosine_sim_train

    title_column = 'name'  # Replace with correct column name if needed
    if 'name' not in df.columns and 'anime_title' in df.columns:
        title_column = 'anime_title'

    # Locate the row by *position*. After train_test_split the frame keeps the
    # original index labels, but the similarity matrix is indexed 0..len(df)-1,
    # so using the label as a row number reads the wrong row or goes out of bounds.
    positions = np.flatnonzero((df[title_column] == title).to_numpy())
    if positions.size == 0:
        # Return empty list if title not found
        return []
    idx = int(positions[0])

    # Rank every row by similarity, highest first; the stable sort keeps ties
    # in their original row order.
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly instead of assuming it sorted first.
    anime_indices = ranked[ranked != idx][:10]

    # Return the most similar titles as a list
    return df[title_column].iloc[anime_indices].tolist()

In [None]:
# Try the training-set recommender on a sample title
sample_recommendations = recommend_anime_from_train('Naruto')
print(sample_recommendations)

['Ginga Hyouryuu Vifam', 'Ginga Shounen Tai', 'Ginga Shippuu Sasuraiger', 'Ginga Reppuu Baxingar', 'Ginga Hyouryuu Vifam 13', 'Ginga Nagareboshi Gin', 'Macross 7 Movie: Ginga ga Ore wo Yondeiru!', 'Ginga Densetsu Weed', 'Ginga Hyouryuu Vifam: Kieta 12-nin', 'Ginga Tetsudou 999']


5. Evaluate the System

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Empty containers for the evaluation labels:
#   y_true - actual liked/disliked flag derived from a rating threshold (e.g., 3.5)
#   y_pred - whether the anime was recommended or not
y_true, y_pred = [], []

In [None]:
# For each anime in the test set, record whether it counts as liked and whether
# its title appears in the recommendations produced from the training set.
# The lists are rebuilt from scratch so that re-running this cell never stacks
# new labels on top of the ones from a previous run.
y_true = []
y_pred = []

for name, rating in zip(test_df['name'], test_df['rating']):
    recommended_anime = recommend_anime_from_train(name)  # List of recommended anime titles

    # 1 if the anime is liked, 0 otherwise (3.5 is the assumed like threshold)
    y_true.append(1 if rating >= 3.5 else 0)

    # 1 if the anime's own title is in the recommended list, 0 otherwise
    y_pred.append(1 if name in recommended_anime else 0)

In [None]:
# Evaluate on the test set: label each anime as liked (rating >= 3.5) and record
# whether its own title shows up among the recommendations built from the training set.
# Both lists are rebuilt from scratch here, so running this cell again (or after
# another evaluation cell) cannot leave them with accumulated or unequal contents.
y_true = []
y_pred = []

for index, row in test_df.iterrows():
    # Get recommended anime for the current row using the training set
    recommended_anime = recommend_anime_from_train(row['name'])  # List of recommended anime titles

    # Actual value: 1 if rating >= 3.5 (liked), 0 otherwise
    y_true.append(1 if row['rating'] >= 3.5 else 0)

    # Predicted value: 1 (recommended) or 0 (not recommended)
    y_pred.append(1 if row['name'] in recommended_anime else 0)

# Every test row must have contributed exactly one label and one prediction.
assert len(y_true) == len(y_pred) == len(test_df), (
    f"Length mismatch: y_true = {len(y_true)}, y_pred = {len(y_pred)}, test rows = {len(test_df)}"
)

Length mismatch at index 6329: y_true = 2461, y_pred = 2460


In [None]:
# Report the sizes of both label lists before computing any metric
length_report = f"Final lengths: y_true = {len(y_true)}, y_pred = {len(y_pred)}"
print(length_report)

Final lengths: y_true = 2461, y_pred = 2460


In [None]:
# Compute the metrics only when every true label has a matching prediction.
if len(y_true) != len(y_pred):
    print("Error: Mismatch in the lengths of y_true and y_pred. Please check the data.")
else:
    # zero_division=0 makes the score an explicit 0 (without a warning) when a
    # metric's denominator is zero, e.g. when nothing was predicted positive.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

Error: Mismatch in the lengths of y_true and y_pred. Please check the data.
