In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
import ast
import re

In [2]:
HOME = os.getcwd()
print(HOME)
ANIME_DATA = os.path.join(HOME, r'Preprocessing\Preprocessed Data\raw_anime_processed.csv')
print(ANIME_DATA)

d:\Data Science Introduction\Final_Project\models\anime_models
d:\Data Science Introduction\Final_Project\models\anime_models\Preprocessing\Preprocessed Data\raw_anime_processed.csv


In [3]:
df = pd.read_csv(r'D:\Data Science Introduction\Final_Project\Preprocessing\Preprocessed Data\raw_anime_processed.csv')
df.head()

Unnamed: 0,Title,Score,Vote,Ranked,Popularity,Members,Favorite,Types,Volumes,Chapters,...,Demographic,Serialization,Author,Total Review,Type Review,Realeased date,Completed date,Recommended,Mixed Feelings,Not Recommended
0,Berserk,9.47,363720,1,1,725079,130489,Manga,3,15,...,Seinen,Young Animal,"[""'Miura,Kentarou'"", ""'Studio Gaga'""]",289,"[252, 17, 20]",1989-08-25,Updating,252,17,20
1,JoJo no Kimyou na Bouken Part 7: Steel Ball Ru...,9.31,172219,2,23,280428,46269,Manga,24,96,...,Seinen,Ultra Jump,"[""'Araki,Hirohiko'""]",131,"[123, 7, 1]",2004-01-19,2011-04-19,123,7,1
2,Vagabond,9.26,154583,3,13,406082,44258,Manga,37,327,...,Seinen,Morning,"[""'Inoue,Takehiko'"", ""'Yoshikawa,Eiji'""]",104,"[93, 9, 2]",1998-09-03,2015-05-21,93,9,2
3,One Piece,9.22,392811,4,4,642620,119974,Manga,3,15,...,Shounen,Shounen Jump (Weekly),"[""'Oda,Eiichiro'""]",231,"[190, 21, 20]",1997-07-22,Updating,190,21,20
4,Monster,9.16,104327,5,29,258581,22008,Manga,18,162,...,Seinen,Big Comic Original,"[""'Urasawa,Naoki'""]",86,"[69, 11, 6]",1994-12-05,2001-12-20,69,11,6


Normalize attributes

In [4]:
df['Genres'] = df['Genres'].fillna(str(['''''']))
df['Themes'] = df['Themes'].fillna(str(['''''']))

# Convert string representation of lists into actual lists
df['Genres'] = df['Genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
df['Themes'] = df['Themes'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
df['Author'] = df['Author'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# Clean genres and themes
df['Genres'] = df['Genres'].apply(lambda x: [genre.strip("'") for genre in x])
df['Themes'] = df['Themes'].apply(lambda x: [theme.strip("'") for theme in x])
df['Author'] = df['Author'].apply(lambda x: [author.strip("'") for author in x])

# # Chuyển các thuộc tính dạng list thành chuỗi văn bản
# df['Genres_str'] = df['Genres'].apply(lambda x: " ".join(genre.strip() for genre in x))
# df['Themes_str'] = df['Themes'].apply(lambda x: " ".join(theme.strip() for theme in x))
# df['Author_str'] = df['Author'].apply(lambda x: " ".join(author.strip() for author in x))

Normalize number attributes

In [5]:
df.keys()

Index(['Title', 'Score', 'Vote', 'Ranked', 'Popularity', 'Members', 'Favorite',
       'Types', 'Volumes', 'Chapters', 'Status', 'Published', 'Genres',
       'Themes', 'Demographic', 'Serialization', 'Author', 'Total Review',
       'Type Review', 'Realeased date', 'Completed date', 'Recommended',
       'Mixed Feelings', 'Not Recommended'],
      dtype='object')

Features extraction

In [6]:
def parse_query(query):
    filters = {}

    # Extract genres
    genres_match = re.search(r"genre[s]* ['\"](.*?)['\"]", query)
    if genres_match:
        filters['Genres'] = [g.strip() for g in genres_match.group(1).split(",")]

    # Extract themes
    themes_match = re.search(r"theme[s]* ['\"](.*?)['\"]", query)
    if themes_match:
        filters['Themes'] = [t.strip() for t in themes_match.group(1).split(",")]

    # Extract types
    types_match = re.search(r"type[s]* ['\"](.*?)['\"]", query)
    if types_match:
        filters['Types'] = [t.strip() for t in types_match.group(1).split(",")]

    # Extract author
    author_match = re.search(r"author[s]* ['\"](.*?)['\"]", query)
    if author_match:
        filters['Authors'] = [t.strip() for t in author_match.group(1).split(",")]

    return filters


In [7]:
def score_anime(anime, filters):
    score = 0
    max_score = 0  # Tracks the maximum possible score for the anime

    # Match genres
    if 'Genres' in filters:
        matched_genres = set(anime['Genres']).intersection(filters['Genres'])
        score += len(matched_genres) * 2  # Weight: 2 per matching genre
        max_score += len(filters['Genres']) * 2  # Max score for genres

    # Match authors
    if 'Authors' in filters:
        matched_themes = set(anime['Author']).intersection(filters['Authors'])
        score += len(matched_themes) * 1.75  # Weight: 1.75 per matching author
        max_score += len(filters['Authors']) * 1.75  # Max score for themes


    # Match themes
    if 'Themes' in filters:
        matched_themes = set(anime['Themes']).intersection(filters['Themes'])
        score += len(matched_themes) * 1.5  # Weight: 1.5 per matching theme
        max_score += len(filters['Themes']) * 1.5  # Max score for themes


    # Match type
    if 'Type' in filters:
        max_score += 3  # Add weight to the max score
        if anime['Type'] == filters['Type']:
            score += 3


    # Normalize the score to be between 0.0 and 1.0
    return score / max_score if max_score > 0 else 0

In [8]:
# Example
query = "I want to see an anime which has genre 'Adventure' and 'Action', and it has theme 'Adult Cast' and it has authors 'Asano' and type 'Novel'"
filters = parse_query(query)

df['Relevance_Score'] = df.apply(lambda x: score_anime(x, filters), axis=1)
df = df.sort_values(by='Relevance_Score', ascending=False)

# Display the results
print(df[['Title', 'Relevance_Score']].head())

                                               Title  Relevance_Score
18344  Asamiya-san no Imouto (Sister of Ms. Asamiya)         0.714286
345                                   Trigun Maximum         0.666667
201                                    Solo Leveling         0.666667
755                                      Log Horizon         0.666667
3309                                           Cobra         0.666667
