In [1]:
import os
from bs4 import BeautifulSoup
import requests
import time
from datetime import datetime
import json
from zipfile import ZipFile
import pandas as pd
import shutil
import zipfile
import random
from kafka import KafkaProducer

In [2]:
BASE_PATH = "craw"
HTML_PATH = BASE_PATH + "/html"
USER_PATH = BASE_PATH + "/users"

## Producer

In [3]:
def serializer(message):
    return json.dumps(message).encode('utf-8')
# Thêm địa chỉ của tất cả các broker vào danh sách dưới đây
bootstrap_servers = ['localhost:9092', 'localhost:9093', 'localhost:9094']

# Key serializer
def key_serializer(key):
    return str(key).encode('utf-8')



# Kafka Producer with custom partitioner
producer = KafkaProducer(
    bootstrap_servers=bootstrap_servers,
    value_serializer=serializer,
    key_serializer=key_serializer,
)


# producer.send('user', value=dummy_message, key = random.choice(range(0,5)))


## Create anime.tsv

In [4]:
def extract_zip(input_zip):
    input_zip = ZipFile(input_zip)
    return {name: input_zip.read(name) for name in input_zip.namelist()}

KEYS = ['MAL_ID', 'Name', 'Score', 'Genders', 'English name', 'Japanese name', 'Type', 'Episodes',
        'Aired', 'Premiered', 'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
        'Ranked', 'Popularity', 'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
        'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6', 'Score-5', 'Score-4',
        'Score-3', 'Score-2', 'Score-1']

In [5]:
def get_name(info):
    try:
        return info.find("h1", {"class": "title-name h1_bold_none"}).text.strip()
    except: 
        return ""

def get_english_name(info):
    try:
        span = info.findAll("span", {"class": "dark_text"})
        return span.parent.text.strip()
    except:
        return ""

def get_table(a_soup):
    try:
        return a_soup.find("div", {"class": "po-r js-statistics-info di-ib"})
    except:
        return ""

def get_score(stats):
    try:
        score = stats.find("span", {"itemprop": "ratingValue"})
        if score is None:
            return "Unknown"
        return score.text.strip()
    except:
        return ""

def get_gender(sum_info):
    try:
        text = ", ".join(
            [x.text.strip() for x in sum_info.findAll("span", {"itemprop": "genre"})]
        )
        return text
    except:
        return ""

def get_description(sum_info):
    try:
        return sum_info.find("td", {"class": "borderClass", "width": "225"})
    except:
        return ""

def get_all_stats(soup):
    try:
        return soup.find("div", {"id": "horiznav_nav"}).parent.findAll(
            "div", {"class": "spaceit_pad"}
        )
    except:
        return ""

def get_info_anime(anime_id):
    data = extract_zip(f"craw/html/{anime_id}.zip")
    anime_info = data["stats.html"].decode()
    soup = BeautifulSoup(anime_info, "html.parser")

    stats = get_table(soup)
    description = get_description(soup)
    anime_info = {key: "Unknown" for key in KEYS}

    anime_info["MAL_ID"] = anime_id
    anime_info["Name"] = get_name(soup)
    anime_info["Score"] = get_score(stats)
    anime_info["Genders"] = get_gender(description)

    for d in description.findAll("span", {"class": "dark_text"}):
        information = [x.strip().replace(" ", " ") for x in d.parent.text.split(":")]
        category, value = information[0], ":".join(information[1:])
        value.replace("\t", "")

        if category in ["Broadcast", "Synonyms", "Genres", "Score", "Status"]:
            continue

        if category in ["Ranked"]:
            value = value.split("\n")[0]
        if category in ["Producers", "Licensors", "Studios"]:
            value = ", ".join([x.strip() for x in value.split(",")])
        if category in ["Ranked", "Popularity"]:
            value = value.replace("#", "")
        if category in ["Members", "Favorites"]:
            value = value.replace(",", "")
        if category in ["English", "Japanese"]:
            category += " name"

        anime_info[category] = value

    # Stats (Watching, Completed, On-Hold, Dropped, Plan to Watch)
    for d in get_all_stats(soup)[:5]:
        category, value = [x.strip().replace(" ", " ") for x in d.text.split(":")]
        value = value.replace(",", "")
        anime_info[category] = value

    # Stast votes per score
    for d in get_all_stats(soup)[6:]:
        score = d.parent.parent.find("td", {"class": "score-label"}).text.strip()
        value = [x.strip().replace(" ", " ") for x in d.text.split("%")][1].strip(
            "(votes)"
        )
        label = f"Score-{score}"
        anime_info[label] = value.strip()

    for key, value in anime_info.items():
        if str(value) in ["?", "None found, add some", "None", "N/A", "Not available"]:
            anime_info[key] = "Unknown"
    return anime_info

In [6]:
get_info_anime(16)

{'MAL_ID': 16,
 'Name': 'Hachimitsu to Clover',
 'Score': '',
 'Genders': 'Comedy, Drama, Romance, Adult Cast, Love Polygon, Visual Arts, Josei',
 'English name': 'Honey and Clover',
 'Japanese name': 'ハチミツとクローバー',
 'Type': 'TV',
 'Episodes': '24',
 'Aired': 'Apr 15, 2005 to Sep 27, 2005',
 'Premiered': 'Spring 2005',
 'Producers': 'Dentsu, Genco, Fuji TV, Asmik Ace, Shueisha',
 'Licensors': 'VIZ Media, Discotek Media',
 'Studios': 'J.C.Staff',
 'Source': 'Manga',
 'Duration': '23 min. per ep.',
 'Rating': 'PG-13 - Teens 13 or older',
 'Ranked': '616\r',
 'Popularity': '882',
 'Members': '265142',
 'Favorites': '4164',
 'Watching': '15150',
 'Completed': '93650',
 'On-Hold': '14028',
 'Dropped': '13551',
 'Plan to Watch': '128763',
 'Score-10': '12725',
 'Score-9': '17896',
 'Score-8': '23208',
 'Score-7': '15579',
 'Score-6': '6791',
 'Score-5': '3702',
 'Score-4': '1605',
 'Score-3': '582',
 'Score-2': '337',
 'Score-1': '413',
 'German': 'Honey and Clover',
 'Spanish': 'Honey and Cl

In [7]:
# Generate anime.tsvdf.to_csv(f"{BASE_PATH}/anime.tsv", index=False, sep="\t", encoding="UTF-8")

anime_revised = set()
exist_file = os.path.exists(f"{BASE_PATH}/anime.tsv")
actual_data = pd.DataFrame()
if exist_file:
    # If the file exist, include new data.
    actual_data = pd.read_csv(f"{BASE_PATH}/anime.tsv", sep="\t")
    anime_revised = list(actual_data.MAL_ID.unique())

actual_data.head()
total_data = []
zips = os.listdir(HTML_PATH)
for i, anime in enumerate(zips):
    if not ".zip" in anime:
        continue

    anime_id = int(anime.strip(".zip"))

    if int(anime_id) in anime_revised:
        continue

    print(f"\r{i+1}/{len(zips)} ({anime_id})", end="")

    anime_id = anime.strip(".zip")
    info = get_info_anime(anime_id)
    producer.send('anime_tsv', value=info, key = anime_id)
    total_data.append(info)

if len(total_data):
    df = pd.DataFrame.from_dict(total_data)
    df["MAL_ID"] = pd.to_numeric(df["MAL_ID"])
    df = df.sort_values(by="MAL_ID").reset_index(drop=True)

    if exist_file:
        df = (
            pd.concat([actual_data, df]).sort_values(by="MAL_ID").reset_index(drop=True)
        )

else:
    df = actual_data

pd.set_option("display.max_columns", None)
df.head()

292/292 (9673))

Unnamed: 0,MAL_ID,Name,Score,Genders,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,Theme,Demographic,Themes,Genre,Spanish,French,German,Demographics,Previewed
0,16,Hachimitsu to Clover,,"Comedy, Drama, Romance, Adult Cast, Love Polyg...",Honey and Clover,ハチミツとクローバー,TV,24,"Apr 15, 2005 to Sep 27, 2005",Spring 2005,"Dentsu, Genco, Fuji TV, Asmik Ace, Shueisha","VIZ Media, Discotek Media",J.C.Staff,Manga,23 min. per ep.,PG-13 - Teens 13 or older,616\r,882,265142,4164,15150,93650,14028,13551,128763,12725,17896,23208,15579,6791,3702,1605,582,337,413,,Josei\r\n \n\r\n Josei,Adult Cast\r\n \n\r\n A...,,Honey and Clover,Honey and Clover,Honey and Clover,,
1,25,Sunabouzu,,"Action, Adventure, Comedy, Sci-Fi, Ecchi, Seinen",Desert Punk,砂ぼうず,TV,24,"Oct 6, 2004 to Mar 30, 2005",Fall 2004,"GDH, Pony Canyon, CBC Television",Funimation,Gonzo,Manga,24 min. per ep.,R - 17+ (violence & profanity),2266\r,1588,137798,838,8200,62924,7223,9588,49863,4034,7269,14009,14913,7275,3658,1796,752,414,366,,Seinen\r\n \n\r\n Seinen,,,Desert Punk,Desert Punk,Desert Punk,,
2,47,Akira,,"Action, Adventure, Horror, Sci-Fi, Supernatura...",Akira,AKIRA（アキラ）,Movie,1,"Jul 16, 1988",Unknown,"Mainichi Broadcasting System, Kodansha, TOHO, ...","Funimation, Bandai Entertainment, Geneon Enter...",Tokyo Movie Shinsha,Manga,2 hr. 4 min.,R+ - Mild Nudity,408\r,209,832735,13493,14596,628518,5488,3349,180784,108823,119202,135075,80436,33122,13877,6434,2573,1453,1497,,Seinen\r\n \n\r\n Seinen,Gore\r\n \n\r\n Gore\r\n ...,,,,,,
3,75,Soukyuu no Fafner: Dead Aggressor,,"Action, Drama, Sci-Fi, Mecha, Military",Fafner,蒼穹のファフナーDead Aggressor,TV,25,"Jul 5, 2004 to Dec 27, 2004",Summer 2004,"TV Tokyo Music, King Records","Funimation, Geneon Entertainment USA",Xebec,Original,24 min. per ep.,PG-13 - Teens 13 or older,2861\r,3083,48028,267,2786,18716,2118,2301,22107,1038,1852,3674,4300,2087,1118,493,210,104,96,,,Mecha\r\n \n\r\n Mecha\...,,,,,,
4,143,Kannazuki no Miko,,"Drama, Girls Love, Supernatural, Mecha, Shounen",Destiny of the Shrine Maiden,神無月の巫女,TV,12,"Oct 2, 2004 to Dec 18, 2004",Fall 2004,"Studio Fantasia, Geneon Universal Entertainmen...","Sentai Filmworks, Geneon Entertainment USA",TNK,Manga,24 min. per ep.,PG-13 - Teens 13 or older,4966\r,2500,70528,717,2953,37980,2274,3444,23877,2690,2953,5140,7216,5228,3327,1819,875,508,385,Mecha\r\n \n\r\n Mecha,Shounen\r\n \n\r\n Shounen,,,,Destiny of Shrine Maiden,,,


In [8]:
df.to_csv(f"{BASE_PATH}/anime.tsv", index=False, sep="\t", encoding="UTF-8")

## Create rating_complete.csv
This file only contain animes with watching_status==2(complete) and have been rated (score!=0).

In [9]:
if not os.path.exists(f"{BASE_PATH}/rating_complete.csv"):
    with open(f"{BASE_PATH}/rating_complete.csv", "w", encoding="UTF-8") as file:
        file.write("user_id,anime_id,rating\n")

In [10]:
unique_anime = set()
all_users = sorted(os.listdir(USER_PATH), key=lambda x:int(x.split(".")[0]))

with open(f"{BASE_PATH}/rating_complete.csv", "a") as f1:

    for i, user_file in enumerate(all_users):
        if not user_file.endswith(".csv"):
            continue

        print(f"\r{i+1}/{len(all_users)}", end="")

        user_id = user_file.split(".")[0]
        with open(f"{USER_PATH}/{user_file}", "r") as file:
            file.readline()
            for line in file:
                anime_id, score, watching_status, _ = line.strip().split(",")
                if int(watching_status) == 2 and (score) != 0:
                    temp = f"{user_id},{anime_id},{score}\n"
                    producer.send('rating_complete', value=temp)
                    f1.write(temp)

1144/1144

## Create animelist.csv

In [11]:
if not os.path.exists(f"{BASE_PATH}/animelist.csv"):
    with open(f"{BASE_PATH}/animelist.csv", "w", encoding="UTF-8") as file:
        file.write("user_id,anime_id,rating,watching_status,watched_episodes\n")

In [12]:
unique_anime = set()
all_users = sorted(os.listdir(USER_PATH), key=lambda x:int(x.split(".")[0]))

with open(f"{BASE_PATH}/animelist.csv", "a") as f1:

    for i, user_file in enumerate(all_users):
        if not user_file.endswith(".csv"):
            continue

        print(f"\r{i+1}/{len(all_users)}", end="")

        user_id = user_file.split(".")[0]
        with open(f"{USER_PATH}/{user_file}", "r") as file:
            file.readline()
            for line in file:
                anime_id, score, watching_status, watched_episodes = line.strip().split(",")
                temp = f"{user_id},{anime_id},{score},{watching_status},{watched_episodes}\n"
                producer.send('animelist_csv', value=temp, key = anime_id)
                f1.write(temp)

1144/1144