In [None]:
from bs4 import BeautifulSoup
import requests
import re

from multiprocessing import Pool

import numpy as np
import pandas as pd

In [None]:
# HTTP headers sent with every request in this notebook: a desktop-browser
# User-Agent string.
header = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
}

# Collect all tags

In [None]:
# Walk pages 1..8 of the anime tag index and gather every link found inside
# the element with id "tagList".
url = "https://bangumi.tv/anime/tag?page="

tag_links = []
for page_no in range(1, 9):
  page_res = requests.get(f"{url}{page_no}", headers=header)
  page_res.encoding = 'utf-8'
  page_soup = BeautifulSoup(page_res.text, "html.parser")
  tag_list = page_soup.find('div', {'id': 'tagList'})
  tag_links.extend(tag_list.findChildren('a'))

# Keep the text of the first 500 links only.
tags = [link.text for link in tag_links[:500]]

In [None]:
# Keep only the tags that do not contain four consecutive digits
# (presumably this removes year-style tags such as air years).
four_digits = re.compile(r'\d{4}')
TAGS = [tag for tag in tags if four_digits.search(tag) is None]

In [None]:
# Report how many tags remain in TAGS after filtering.
print(len(TAGS))

500


# Collect all subjects

In [None]:
def get_per_item(item_id, year):
  """Scrape one subject page and return its tag counts plus rating metadata.

  Args:
    item_id: Subject id as a string; it is appended to the subject URL.
    year: Air year of the listing this item came from; stored unchanged.

  Returns:
    A dict mapping every tag of the page that is also in TAGS to its count
    text, plus the keys 'score', 'id', 'year' and 'votes'. Returns None when
    the page has no tag section, none of its tags are in TAGS, the score or
    votes element is missing, or the score text is '0.0'.
  """
  url = "https://bangumi.tv/subject/" + item_id
  res = requests.get(url, headers=header)
  res.encoding = 'utf-8'
  soup = BeautifulSoup(res.text, "html.parser")

  tag_section = soup.find('div', class_='subject_tag_section')
  if tag_section is None:
    return None

  item_tag_dic = {}
  # find_all is the current name of the legacy findChildren alias.
  for tag in tag_section.find_all('a'):
    href = tag.attrs.get('href')
    # Anchors without a real target (e.g. script-only controls) are not tags.
    if href is None or href == 'javascript:void(0)':
      continue
    # The name is read from child 0 and the count from child 2; skip anchors
    # that do not have that shape instead of raising IndexError.
    if len(tag.contents) < 3:
      continue
    tag_name = tag.contents[0].text
    if tag_name not in TAGS:
      continue
    item_tag_dic[tag_name] = tag.contents[2].text

  if not item_tag_dic:
    return None

  # A page without a score or votes element must not crash the worker (and
  # with it the whole pool run); treat it like an unrated item.
  score_box = soup.find('div', class_='global_score')
  score_node = None
  if score_box is not None:
    score_node = score_box.find('span', {'class': 'number'})
  votes_node = soup.find('span', {'property': 'v:votes'})
  if score_node is None or votes_node is None:
    return None

  score = score_node.text
  if score == '0.0':
    return None

  item_tag_dic['score'] = score
  item_tag_dic['id'] = item_id
  item_tag_dic['year'] = year
  item_tag_dic['votes'] = votes_node.text
  return item_tag_dic

In [None]:
def get_per_page(page_num, year):
  """Scrape one listing page of a year and return the records of its items.

  Fetches page `page_num` of the air-time browser for `year`, extracts the
  subject id of every list entry and calls get_per_item on each one.

  Returns:
    A list with every non-None result of get_per_item, in page order.
  """
  base_url = "https://bangumi.tv/anime/browser/airtime/"
  response = requests.get(f"{base_url}{year}?page={page_num}", headers=header)
  response.encoding = 'utf-8'
  page = BeautifulSoup(response.text, "html.parser")
  entries = page.find('ul', {'id': 'browserItemList'}).findChildren('li')

  # The id is the cover link's href with its first 9 characters removed
  # (presumably the "/subject/" prefix). The generator is consumed in order,
  # so items are fetched one after another exactly as they appear.
  records = (
    get_per_item(
      entry.find("a", {"class": "subjectCover cover ll"})["href"][9:],
      year,
    )
    for entry in entries
  )
  return [record for record in records if record is not None]

In [None]:
def get_max_page(year):
  """Return (year, page_count) for the air-time listing of one year.

  The page count is parsed from the text of the 'p_edge' pager element on
  the first listing page.

  Args:
    year: Air year; converted with str() and appended to the listing URL.

  Returns:
    A tuple (year, max_page) where max_page is an int.

  Raises:
    ValueError: if the pager element is missing or its text does not have
      the expected "( current / total )" form.
  """
  url = "https://bangumi.tv/anime/browser/airtime/"

  res = requests.get(url+str(year), headers=header)
  res.encoding = 'utf-8'
  soup = BeautifulSoup(res.text, "html.parser")

  pager = soup.find('span', {'class': 'p_edge'})
  if pager is None:
    raise ValueError(f"no 'p_edge' pager element found for year {year}")

  # Raw string so that \( \d \) are regex escapes rather than invalid Python
  # string escapes; \xa0 (non-breaking space) is resolved by the re module.
  found = re.match(r"\(\xa0\d+\xa0/\xa0(\d+)\xa0\)", pager.text)
  if found is None:
    raise ValueError(f"unexpected pager text {pager.text!r} for year {year}")
  return year, int(found.group(1))

In [None]:
# Look up the listing page count of every year from 2000 through 2022 with
# 10 worker processes; map() returns the results in year order.
years = range(2000, 2023)
with Pool(10) as pool:
  max_page_by_year = pool.map(get_max_page, years)
print(max_page_by_year)

[(2000, 11), (2001, 16), (2002, 14), (2003, 16), (2004, 16), (2005, 17), (2006, 20), (2007, 20), (2008, 19), (2009, 22), (2010, 24), (2011, 28), (2012, 30), (2013, 32), (2014, 35), (2015, 35), (2016, 42), (2017, 44), (2018, 41), (2019, 42), (2020, 41), (2021, 44), (2022, 47)]


In [None]:
# One (page, year) work item per listing page of every year; starmap unpacks
# each tuple into get_per_page(page, year).
args = [
  (page, year)
  for (year, max_page) in max_page_by_year
  for page in range(1, max_page+1)
]

with Pool(30) as pool:
  per_page_results = pool.starmap(get_per_page, args)

# Flatten the per-page lists into one list of item records.
all_res = []
for page_items in per_page_results:
  all_res.extend(page_items)
print(f"Retrieved {len(all_res)} items")

Retrieved 11187 items


In [None]:
# Build one row per item: metadata columns first, then one column per tag.
# Tags an item does not have are filled with 0. The table is saved to data.csv.
column_order = ['id', 'score', 'votes', 'year'] + TAGS
df = pd.DataFrame(all_res, columns=column_order)
df = df.fillna(0)
df.to_csv("data.csv")

In [None]:
from numpy import genfromtxt

# data.csv starts with a header row of column names. Skip it so that it is
# not parsed as a data row (it would become a row of NaN and shift every
# record by one relative to the DataFrame).
data = genfromtxt('data.csv', delimiter=',', skip_header=1)

In [None]:
# data.csv was written with the DataFrame index as its first column; read it
# back as the index so it does not reappear as an "Unnamed: 0" data column.
df = pd.read_csv('data.csv', index_col=0)

In [None]:
# Inspect the row labelled 10991, showing only the fields with a value
# greater than zero.
entry = df.loc[10991]
entry[entry.gt(0)]

Unnamed: 0      10991.0
id             331480.0
score               7.6
votes            4383.0
year             2022.0
TV                679.0
漫画改               888.0
百合                920.0
漫改                401.0
校园                860.0
治愈                603.0
日常               1012.0
青春                305.0
CloverWorks      1415.0
日本                 11.0
日本动画                9.0
动画                  6.0
轻百合                53.0
TVA                 9.0
少女                 31.0
Name: 10991, dtype: float64

In [None]:
# filter(lambda i: i.find('img')["src"] != "/img/no_icon_subject.png", items)

22

In [None]:
# Scratch estimate scaled from the total number of listing pages.
# NOTE(review): the meaning of the constants 6, 146 and 1024 is not documented
# here — confirm what this figure represents before relying on it.
total_pages = sum(pages for (_, pages) in max_page_by_year)
total_pages / 6 * 146 / 1024

15.564778645833334