In [58]:
import pathlib
import os
import pandas as pd
import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score

ANIME_CLASSES_PATH = r"E:\ハウス\Assignment\Kmitl\Year_3\Semester_1\anime_recommendation_api\api\data-sets\Anime_classes.csv"
ANIME_NAMES_PATH = r"E:\ハウス\Assignment\Kmitl\Year_3\Semester_1\anime_recommendation_api\api\data-sets\Anime_names.csv"
ANIME_SCORE_PATH = r"E:\ハウス\Assignment\Kmitl\Year_3\Semester_1\anime_recommendation_api\api\data-sets\Anime_score.csv"

In [59]:
anime_classes = pd.read_csv(ANIME_CLASSES_PATH, index_col=0)
anime_names = pd.read_csv(ANIME_NAMES_PATH, index_col=0)
anime_score = pd.read_csv(ANIME_SCORE_PATH, index_col=0)
anime_classes

Unnamed: 0_level_0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6661,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6662,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6663,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
anime_names.iloc[:, 0]

id
2                                        Seto no Hanayome
3                                      Shugo Chara!! Doki
4                                           Princess Tutu
5                                     Bakuman. 3rd Season
6                                     Yume-iro Pâtissière
                              ...                        
6660                              Dokidoki Little Ooyasan
6661                          Wo Shi Jiang Xiaobai (2018)
6662                Genki Genki Non-tan: Obake Mura Meiro
6663                                  Mr. Men Little Miss
6664    Mushi Mushi Mura no Nakama-tachi: Minna Ii Tok...
Name: Inu x Boku SS, Length: 6663, dtype: object

In [60]:
classes = anime_classes.columns
classes

Index(['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama',
       'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror',
       'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music',
       'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai',
       'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen',
       'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power',
       'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Yuri'],
      dtype='object')

In [61]:
# model
def predict(predict_classes:np.ndarray, n:int=5, return_only_titles:bool=False):
    """ 
    predicts the anime name based on the classes using cosine similarity
    inputs predict_classes: vectorized classes contain value (0 or 1), length=len(anime_classes.columns)
            n: length of output to return. if n=-1, return all
    return: if return_only_titles=false return dictionary of anime names, id and similarity score, length=n sorted by similarity score(reverted order).
            else return list of anime names length=n sorted by similarity score(reverted order).
    """

    # apply Cosine Similarity.
    similarity = {
        "id" : anime_classes.index,
        "value" : np.dot(anime_classes.values, predict_classes) / np.linalg.norm(anime_classes.values) * np.linalg.norm(predict_classes)
    }

    #sort similarity with reverted order.
    similarity["value"] = similarity["value"] * anime_score["score"] / 10
    similarity["id"] = np.argsort(similarity["value"])[::-1] + 1
    similarity["value"] = np.sort(similarity["value"])[::-1]
    
    #select top n value.
    if n != -1:
        similarity["id"] =  similarity["id"][:n]
        similarity["value"] =  similarity["value"][:n]
    
    #get title of anime.
    title = anime_names["title"].loc[similarity["id"]].values

    if return_only_titles:
        return title
    
    else:
        return {
            "title" : title,
            "id" : similarity["id"],
            "similarity" : similarity["value"]
        }

In [62]:
def vectorize_classes(inp_classes:list):
    """ 
    vectorize the classes to be used in predict function
    if input not in classes, it will be ignored.
    inputs classes: list of string of anime classes. such as ["Action", "Adventure", "Comedy"]
    return: vectorized classes contain value (0 or 1), length=len(anime_classes.columns)
    """
    title = anime_classes.columns
    vectorize = np.zeros(len(classes),dtype=np.uint8)

    for inp in inp_classes:
        vectorize[title==inp.capitalize()] = 1
    return vectorize

In [63]:
vectorize_classes(["Action", "Adventure", "Comedy","hellovwasdf jasdiof"])

array([1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=uint8)

In [64]:
def tokenizer(input:str, sep:str=","):
    """
    return list of string from input string seperated by sep
    """
    list = input.split(sep)
    
    # #remove space in last and first.
    list = [ ele.strip() for ele in list]
        
    return list

In [65]:
tokenizer(input="Action,   Adventure ,         Comedy     ,@ellovwasdf jasdiof", sep=",")

['Action', 'Adventure', 'Comedy', '@ellovwasdf jasdiof']

In [66]:
predict(vectorize_classes(tokenizer(input="Action,   Adventure ,         Comedy     ,@ellovwasdf jasdiof", sep=",")))


{'title': array(['Fullmetal Alchemist: Brotherhood', 'Cowboy Bebop',
        'Tengen Toppa Gurren Lagann',
        'JoJo no Kimyou na Bouken: Diamond wa Kudakenai', 'One Piece'],
       dtype=object),
 'id': id
 6664    1262
 6663    4357
 6662    5235
 6661    1416
 6660      35
 Name: score, dtype: int64,
 'similarity': array([0.03119093, 0.02970726, 0.02947122, 0.0291003 , 0.02879682])}

In [94]:
anime_names.loc(1)

<pandas.core.indexing._LocIndexer at 0x26f78301c60>

In [90]:
print(anime_names.loc[anime_names[0] == 35])

KeyError: 0

In [68]:
test = "action"
test2 = test.split(', ')
print(test2)
tokenizer(input=test)

['action']


['action']

In [69]:
# genre_check = 
# predict(), return_only_titles=True)

# predict(input=vectorize_classes(), return_only_titles=True)
# type(tokenizer(input="Action, comedy", sep=","))
# type(vectorize_classes(tokenizer(input="Action, comedy", sep=",")))
predict(vectorize_classes(tokenizer(input="Action, comedy", sep=",")), return_only_titles=True)

array(['Fullmetal Alchemist: Brotherhood', 'Gintama°', 'Gintama&#039;',
       'Gintama&#039;: Enchousen',
       'Gintama Movie 2: Kanketsu-hen - Yorozuya yo Eien Nare'],
      dtype=object)

In [70]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [71]:
vectorize = CountVectorizer()


In [72]:
def get_anime_id_by_name(name:str):
    """
    get anime id by name
    """
    return anime_names.index[anime_names["title"]==name].values[0]



In [73]:
### Main Function ------------
def predict_anime_by_description(description:str, n:int=5, return_only_titles:bool=False):
    """
    predicts the anime name based on the description.
    inputs description: string of anime description.
            n: length of output to return. if n=-1, return all
    return: if return_only_titles=false return dictionary of anime names, id and similarity score, length=n sorted by similarity score(reverted order).
            else return list of anime names length=n sorted by similarity score(reverted order).
    """
    predict_output = predict(vectorize_classes(tokenizer(description)), n, return_only_titles)
    
    return predict_output


def predict_anime_by_name(name:str, n:int=5, return_only_titles:bool=False):
    """
    predicts the anime name based on the classes using cosine similarity
    inputs name: string of anime name.
            n: length of output to return. if n=-1, return all
    return: if return_only_titles=false return dictionary of anime names, id and similarity score, length=n sorted by similarity score(reverted order).
            else return list of anime names length=n sorted by similarity score(reverted order).
    """
    id = get_anime_id_by_name(name)
    predict_output = predict(anime_classes.loc[id], n+1, return_only_titles)
    
    # prevent return the same of input anime.
    if not return_only_titles:
        for key, pred in predict_output.items():
            predict_output[key] = pred[1:]
    else:
        predict_output = predict_output[1:]
        
    return  predict_output 



In [74]:
print(predict_anime_by_description("Action"))
print(predict_anime_by_name("Inu x Boku SS"))

{'title': array(['Fullmetal Alchemist: Brotherhood', 'Gintama°', 'Gintama&#039;',
       'Hunter x Hunter (2011)', 'Gintama&#039;: Enchousen'], dtype=object), 'id': id
6664    1262
6663     640
6662    4283
6661     224
6660    2774
Name: score, dtype: int64, 'similarity': array([0.0060027 , 0.0059378 , 0.00591185, 0.00591185, 0.00588589])}
{'title': array(['Kami nomi zo Shiru Sekai: Megami-hen',
       'Kami nomi zo Shiru Sekai II', 'InuYasha',
       'InuYasha: Tenka Hadou no Ken', 'Kami nomi zo Shiru Sekai'],
      dtype=object), 'id': id
6663    5525
6662    2668
6661      39
6660    1582
6659    4892
Name: score, dtype: int64, 'similarity': array([0.04267432, 0.04189559, 0.04101303, 0.0409092 , 0.0409092 ])}


In [75]:
test = anime_classes * anime_score / 10

In [76]:
test.head()

Unnamed: 0_level_0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [77]:
df = pd.DataFrame([1, 2, 3])
df2 = pd.DataFrame([3, 2, 1])


In [78]:
import requests
from bs4 import BeautifulSoup

In [79]:
word = "chainsaw man"
url = "https://www.google.com/search?q={0}&tbm=isch".format(word)
content = requests.get(url).content
soup = BeautifulSoup(content, 'lxml')
images = soup.findAll('img')

In [80]:
images[1]

<img alt="" class="yWs4tf" src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTglL7MuGw4tj9I8S8rHHuDVyQ0OvmvLWC8Y5gxnAjZNXpRntrO5iqIdbm2bw&amp;s"/>

In [81]:
for image in images:
    print(image.get('src'))

/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTglL7MuGw4tj9I8S8rHHuDVyQ0OvmvLWC8Y5gxnAjZNXpRntrO5iqIdbm2bw&s
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRSkt3xXrZJdisuUOv5KaMta-zZjtji8LdmF9jh6h0jhNJKQOWX15DzhOxmuw&s
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT5Byk3aeolN6i9lQo0gvLetK7iBUpbd8F_asNKYwHPlZekc_P3-8RlyLQ-VFk&s
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTxrXyzIiOrR8bXLpkCY7RP7htejNVgR0mWu9XHrIq-zXjOuA7AYnBHuZie6g&s
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR1cC2SPVIWwxB2qoS6nqq1jTZ6DR6eusidR5FxjWrdW9f0Pu_rISHVCVAVduI&s
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR6uAdBC6uYZst-OTA06g83oYezBE9RtJMDfDD6z4EAD2Cn5NbLc1ndlJxr8Q&s
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTcV5JF3b_7Xb3VCqDGNYW93nPDu53quddm_HTvZ8o8pb3Tl57bCfWYAtGRHyo&s
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTh542Wg4nBBk_RWE39lZMogpG_P_k96w8XCFV-CaVCX3vLEYC