In [1]:
import os
import re
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from difflib import get_close_matches

# Put your CSV in same folder or change this path:
DATA_PATH = 'anime-dataset-2023.csv'
print('Looking for dataset at:', DATA_PATH)
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Put the CSV in the notebook folder or update DATA_PATH.")

df = pd.read_csv(DATA_PATH)
print('Loaded dataset with shape:', df.shape)
df.head()


Looking for dataset at: anime-dataset-2023.csv
Loaded dataset with shape: (24905, 24)


Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,"Sep 1, 2001",...,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189.0,602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",...,Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328.0,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,"Jul 3, 2002 to Dec 25, 2002",...,Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,2764.0,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,"Sep 30, 2004 to Sep 29, 2005",...,Toei Animation,Manga,23 min per ep,PG - Children,4240.0,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...


In [2]:
# Preprocessing helpers
def simple_clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ''
    s = s.strip()
    s = s.replace('/', ' ').replace('|', ' ')
    s = re.sub(r'\s+', ' ', s)
    return s

def preprocess_genres(val) -> str:
    if pd.isna(val):
        return ''
    if isinstance(val, (list, tuple)):
        return ' '.join([str(x).strip().replace(' ', '_') for x in val if x])
    s = str(val)
    parts = re.split(r'[,/|;]+', s)
    parts = [p.strip().replace(' ', '_') for p in parts if p.strip()]
    return ' '.join(parts)

def build_combined_text(df, text_cols):
    def join_row(row):
        parts = []
        for c in text_cols:
            if c not in row or pd.isna(row[c]):
                continue
            if c.lower() in ('genre', 'genres'):
                parts.append(preprocess_genres(row[c]))
            else:
                parts.append(simple_clean_text(str(row[c])))
        return ' '.join(parts).strip()
    return df.apply(join_row, axis=1)

# Detect useful columns
cols_lower = {c.lower(): c for c in df.columns}
title_col = None
for cand in ['name','title','english name','anime','anime title','Name']:
    if cand in cols_lower:
        title_col = cols_lower[cand]
        break
if not title_col:
    for c in df.columns:
        if df[c].dtype == object:
            title_col = c
            break

text_cols = [title_col] if title_col else []
for c in ['english name','other name','genres','genre','type','synopsis','plot','description','overview']:
    if c in cols_lower:
        text_cols.append(cols_lower[c])
if len(text_cols) <= 1:
    text_cols = df.select_dtypes(include=['object']).columns.tolist()

print('Using title col:', title_col)
print('Text columns:', text_cols)

df['combined_text'] = build_combined_text(df, text_cols)
df['combined_text'].str[:400].head()


Using title col: Name
Text columns: ['Name', 'English name', 'Other name', 'Genres', 'Type', 'Synopsis']


0    Cowboy Bebop Cowboy Bebop カウボーイビバップ Action Awa...
1    Cowboy Bebop: Tengoku no Tobira Cowboy Bebop: ...
2    Trigun Trigun トライガン Action Adventure Sci-Fi TV...
3    Witch Hunter Robin Witch Hunter Robin Witch Hu...
4    Bouken Ou Beet Beet the Vandel Buster 冒険王ビィト A...
Name: combined_text, dtype: object

In [3]:
# Build TF-IDF and NearestNeighbors
MAX_FEATURES = 20000
NGRAM_RANGE = (1,2)

tfidf = TfidfVectorizer(stop_words='english', max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE)
tfidf_matrix = tfidf.fit_transform(df['combined_text'].fillna('').astype(str).tolist())
print('TF-IDF matrix shape:', tfidf_matrix.shape)

nn = NearestNeighbors(n_neighbors=11, metric='cosine', algorithm='brute', n_jobs=-1)
nn.fit(tfidf_matrix)
print('NearestNeighbors fitted')


TF-IDF matrix shape: (24905, 20000)
NearestNeighbors fitted


In [4]:
# Title mapping and recommendation function
if title_col and title_col in df.columns:
    titles = df[title_col].astype(str)
else:
    titles = df.index.astype(str)
title_to_index = pd.Series(df.index.values, index=titles).drop_duplicates()

def find_best_title_match(query_title, cutoff=0.6):
    if query_title in title_to_index.index:
        return query_title
    for t in title_to_index.index:
        if t.lower() == query_title.lower():
            return t
    matches = get_close_matches(query_title, list(title_to_index.index), n=5, cutoff=cutoff)
    if matches:
        return matches[0]
    contains = [t for t in title_to_index.index if query_title.lower() in t.lower()]
    if contains:
        return contains[0]
    raise ValueError(f"Title '{query_title}' not found")

def get_recommendations(title, topn=10):
    best = find_best_title_match(title)
    idx = int(title_to_index[best])
    distances, indices = nn.kneighbors(tfidf_matrix[idx], n_neighbors=topn+1)
    distances = distances.flatten()[1:]
    indices = indices.flatten()[1:]
    sims = 1 - distances
    res = df.iloc[indices].copy().reset_index(drop=True)
    res['similarity'] = sims
    cols = []
    if title_col and title_col in df.columns:
        cols.append(title_col)
    for c in ['genres','genre','type','synopsis','plot','description','English name','Other name']:
        if c in res.columns and c not in cols:
            cols.append(c)
    cols = cols[:6] + ['similarity']
    cols = [c for c in cols if c in res.columns]
    return res[cols]

print('Ready to query. Example: get_recommendations("Cowboy Bebop", topn=5)')


Ready to query. Example: get_recommendations("Cowboy Bebop", topn=5)


In [5]:
# Example queries (modify titles to ones in your dataset)
examples = ['Cowboy Bebop']
for t in examples:
    try:
        print('\n=== Recs for:', t)
        print(get_recommendations(t, topn=5).to_string(index=False))
    except Exception as e:
        print('Error for', t, ':', e)



=== Recs for: Cowboy Bebop
                                Name                               English name          Other name  similarity
     Cowboy Bebop: Tengoku no Tobira                    Cowboy Bebop: The Movie      カウボーイビバップ 天国の扉    0.447988
    Cowboy Bebop: Ein no Natsuyasumi        Cowboy Bebop: Ein's Summer Vacation           アインのなつやすみ    0.352951
     Cowboy Bebop: Yose Atsume Blues Cowboy Bebop: Session XX - Mish-Mash Blues カウボーイビバップ よせあつめブルース    0.324989
Mutant Turtles: Choujin Densetsu-hen                                    UNKNOWN  ミュータント タートルズ 超人伝説編    0.164634
                   Waga Na wa Cowboy                                    UNKNOWN           わが名はカウボーイ    0.147143


In [7]:
# Save artifacts
ARTIFACT_PATH = 'anime_recommender_artifacts.pkl'
with open(ARTIFACT_PATH, 'wb') as f:
    pickle.dump({'tfidf': tfidf, 'nn': nn, 'title_col': title_col}, f)
print('Saved artifacts to', ARTIFACT_PATH)


Saved artifacts to anime_recommender_artifacts.pkl
