In [7]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs

In [8]:
def scrape_zongheng(url, timeout=10):
    """Scrape basic metadata from a Zongheng book detail page.

    Args:
        url: Address of the book detail page, e.g.
            "https://www.zongheng.com/detail/325639".
        timeout: Seconds to wait for the server; forwarded to
            ``requests.get`` so a stalled connection cannot hang the kernel.

    Returns:
        dict with the keys ``title``, ``author``, ``tags`` (list of str),
        ``serial_status``, ``category`` and ``word_count``.
        ``serial_status`` / ``category`` remain ``None`` when the tag box has
        no span whose first class is ``serialStatus`` / ``cateFineId``.
        ``word_count`` is the first whitespace-delimited token after "字数："
        in the latest-chapter link's ``title`` attribute and is returned as a
        string (NOTE(review): ``scrape_jjwxc`` returns an int for the same
        key - consider normalising at the call site).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection problems, including timeouts.
        AttributeError, TypeError: An expected element is missing from the page.
        IndexError: The chapter link's title does not contain "字数：".
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them.
    response.raise_for_status()
    soup = bs(response.content, "html.parser")

    title = soup.find("div", attrs={"class": "book-info--title"}).get_text(strip=True)

    tags = []
    serial_status = None
    category = None
    tag_box = soup.find("div", attrs={"class": "book-info--tags"})
    for span in tag_box.find_all("span"):
        text = span.get_text(strip=True)
        # ``.get`` avoids KeyError for class-less spans and IndexError for an
        # empty class list; both cases are treated as free-form tags.
        classes = span.get("class") or []
        if not classes:
            tags.append(text)
        elif classes[0] == "serialStatus":
            # The site's own label is kept verbatim rather than being mapped
            # to an English "ongoing"/"completed" value.
            serial_status = text
            tags.append(text)
        elif classes[0] == "cateFineId":
            category = text
            tags.append(text)
        # Spans whose first class is anything else are intentionally skipped,
        # exactly as before.

    recent_chapter_link = soup.find(
        "div", attrs={"class": "book-info--chapter-name"}
    ).find(
        "a", attrs={"class": "global-hover"}
    )
    # Take the token that directly follows the "字数：" label in the title text.
    word_count = recent_chapter_link["title"].split("字数：")[1].split()[0]

    author = soup.find("a", attrs={"class": "author-info--name"}).get_text(strip=True)

    return {
        "title": title,
        "author": author,
        "tags": tags,
        "serial_status": serial_status,
        "category": category,
        "word_count": word_count,
    }

# Example run against a live Zongheng book page (performs a real HTTP request).
scrape_zongheng("https://www.zongheng.com/detail/325639")

{'title': '星辰之主',
 'author': '减肥专家',
 'tags': ['连载中', '科幻', '星空', '未来高武', '伪科幻'],
 'serial_status': '连载中',
 'category': '科幻',
 'word_count': '7515798'}

In [9]:
def scrape_jjwxc(url, timeout=10):
    """Scrape basic metadata from a JJWXC novel page.

    Args:
        url: Address of the novel page, e.g.
            "https://www.jjwxc.net/onebook.php?novelid=6079968".
        timeout: Seconds to wait for the server; forwarded to
            ``requests.get`` so a stalled connection cannot hang the kernel.

    Returns:
        dict with the keys ``title``, ``author``, ``tags`` (list of str),
        ``serial_status``, ``category`` (always ``None`` for this site) and
        ``word_count`` (int).  ``tags`` is the "-"-separated genre text,
        followed by the serial status, followed by the text of every
        red-styled link found in ``smallreadbody`` divs that contain a span.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection problems, including timeouts.
        AttributeError: An expected ``itemprop`` span is missing from the page.
        ValueError: The word-count text is not a number followed by one
            trailing character.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them.
    response.raise_for_status()
    soup = bs(response.content, "html.parser")

    def itemprop_text(name):
        # Text of the first <span itemprop="name"> on the page.
        return soup.find("span", attrs={"itemprop": name}).get_text(strip=True)

    title = itemprop_text("articleSection")
    tags = itemprop_text("genre").split("-")
    # "updataStatus" (sic) is the attribute value the page itself uses.
    serial_status = itemprop_text("updataStatus")
    # The displayed count carries one trailing non-digit character; drop it.
    word_count = int(itemprop_text("wordCount")[:-1])
    tags.append(serial_status)

    for div in soup.find_all("div", attrs={"class": "smallreadbody"}):
        if div.find("span") is None:
            continue
        # Extra tags are the links rendered with this exact inline style.
        for link in div.find_all("a", attrs={"style": "text-decoration:none;color: red;"}):
            tags.append(link.get_text(strip=True))

    author = itemprop_text("author")

    return {
        "title": title,
        "author": author,
        "tags": tags,
        "serial_status": serial_status,
        "category": None,
        "word_count": word_count,
    }

# Example run against a live JJWXC novel page (performs a real HTTP request).
scrape_jjwxc("https://www.jjwxc.net/onebook.php?novelid=6079968")

{'title': '见春天',
 'author': '纵虎嗅花',
 'tags': ['原创', '言情', '近代现代', '爱情', '完结', '花季雨季', '情有独钟', '阴差阳错', '校园', '正剧'],
 'serial_status': '完结',
 'category': None,
 'word_count': 191449}

In [10]:
def scrape_17k(url, timeout=10):
    """Fetch and parse a 17k page. Unfinished stub: nothing is extracted yet.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server; forwarded to
            ``requests.get`` so a stalled connection cannot hang the kernel.

    Returns:
        None. TODO: extract the same fields the other ``scrape_*`` helpers
        return (title, author, tags, serial_status, category, word_count).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection problems, including timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = bs(response.content, "html.parser")  # parsed but not used yet

In [11]:
def scrape_qidian(url, timeout=10):
    """Fetch and parse a Qidian page. Unfinished stub: nothing is extracted yet.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server; forwarded to
            ``requests.get`` so a stalled connection cannot hang the kernel.

    Returns:
        None. TODO: extract the same fields the other ``scrape_*`` helpers
        return (title, author, tags, serial_status, category, word_count).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection problems, including timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = bs(response.content, "html.parser")  # parsed but not used yet

In [26]:
# Lookup table for the integer `genre` field of a Syosetu novel API record
# (used by `syosetsu`). Each label has the form "sub-genre〔major genre〕".
# A code missing from this table makes the lookup raise KeyError.
syosetu_genre_map = {
    0: "未選択〔未選択〕",
    # 1xx: 恋愛
    101: "異世界〔恋愛〕",
    102: "現実世界〔恋愛〕",
    # 2xx: ファンタジー
    201: "ハイファンタジー〔ファンタジー〕",
    202: "ローファンタジー〔ファンタジー〕",
    # 3xx: 文芸
    301: "純文学〔文芸〕",
    302: "ヒューマンドラマ〔文芸〕",
    303: "歴史〔文芸〕",
    304: "推理〔文芸〕",
    305: "ホラー〔文芸〕",
    306: "アクション〔文芸〕",
    307: "コメディー〔文芸〕",
    # 4xx: SF
    401: "VRゲーム〔SF〕",
    402: "宇宙〔SF〕",
    403: "空想科学〔SF〕",
    404: "パニック〔SF〕",
    # 99xx: その他
    9901: "童話〔その他〕",
    9902: "詩〔その他〕",
    9903: "エッセイ〔その他〕",
    9904: "リプレイ〔その他〕",
    9999: "その他〔その他〕",
    # 9801: ノンジャンル
    9801: "ノンジャンル〔ノンジャンル〕",
}

# Lookup table for the integer `nocgenre` field of an R18 API record
# (used by `syosetsu_r18`); the values are used as the novel's category label.
# NOTE(review): this name is spelled "syosetsu" while the genre table above is
# "syosetu" - kept as is because `syosetsu_r18` refers to it by this name.
syosetsu_nocgenre_map = {
    1: "ノクターンノベルズ(男性向け)",
    2: "ムーンライトノベルズ(女性向け)",
    3: "ムーンライトノベルズ(BL)",
    4: "ミッドナイトノベルズ(大人向け)",
}

def _fetch_syosetu_record(api_url, ncode, timeout):
    """Return the novel record for ``ncode`` from a Syosetu JSON API endpoint."""
    response = requests.get(
        api_url,
        # Let requests build and escape the query string
        # (yields ``?out=json&ncode=...``, same as the hand-built URL).
        params={"out": "json", "ncode": ncode},
        timeout=timeout,
    )
    response.raise_for_status()
    payload = response.json()
    # The novel record is read from index 1 of the returned JSON array
    # (index 0 is presumably the API's result-count header - see the API docs).
    # With no match there is no index 1, so raise the same exception type as
    # the bare ``[1]`` lookup would, but with a useful message.
    if len(payload) < 2:
        raise IndexError(f"Syosetu API returned no novel for ncode {ncode!r}")
    return payload[1]


def _summarize_syosetu_record(record, category):
    """Build the summary dict shared by ``syosetsu`` and ``syosetsu_r18``."""
    tags = record["keyword"].split()
    # The category label doubles as the final tag.
    tags.append(category)
    # Only records with novel_type == 1 and end == 1 count as ongoing;
    # everything else (including single-chapter short stories) is reported
    # as completed.
    is_ongoing = record["novel_type"] == 1 and record["end"] == 1
    return {
        "title": record["title"],
        "tags": tags,
        "author": record["writer"],
        "chapters": record["general_all_no"],
        "word_count": record["length"],
        "published_at": record["general_firstup"],
        "serial_status": "連載中" if is_ongoing else "完結済",
        "category": category,
    }


def syosetsu(ncode, timeout=10):
    """Look up a novel through the general Syosetu novel API.

    Args:
        ncode: Novel code to query, e.g. "n7466jx".
        timeout: Seconds to wait for the API; forwarded to ``requests.get``.

    Returns:
        dict with the keys ``title``, ``tags``, ``author``, ``chapters``,
        ``word_count``, ``published_at``, ``serial_status`` and ``category``.
        ``category`` is the ``syosetu_genre_map`` label for the record's
        ``genre`` code and is also appended to ``tags``.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.RequestException: Connection problems, including timeouts.
        IndexError: The API returned no record for ``ncode``.
        KeyError: The record lacks an expected field, or its genre code is
            not in ``syosetu_genre_map``.
    """
    record = _fetch_syosetu_record(
        "https://api.syosetu.com/novelapi/api/", ncode, timeout
    )
    return _summarize_syosetu_record(record, syosetu_genre_map[record["genre"]])


def syosetsu_r18(ncode, timeout=10):
    """Look up a novel through the Syosetu R18 novel API.

    Args:
        ncode: Novel code to query, e.g. "n6752ch".
        timeout: Seconds to wait for the API; forwarded to ``requests.get``.

    Returns:
        dict with the same keys as ``syosetsu``.  ``category`` is the
        ``syosetsu_nocgenre_map`` label for the record's ``nocgenre`` code
        and is also appended to ``tags``.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.RequestException: Connection problems, including timeouts.
        IndexError: The API returned no record for ``ncode``.
        KeyError: The record lacks an expected field, or its nocgenre code
            is not in ``syosetsu_nocgenre_map``.
    """
    record = _fetch_syosetu_record(
        "https://api.syosetu.com/novel18api/api/", ncode, timeout
    )
    return _summarize_syosetu_record(record, syosetsu_nocgenre_map[record["nocgenre"]])

In [18]:
# Example: a multi-chapter novel that the API reports as still ongoing (live API call).
syosetsu("n7466jx")

{'title': '異界冒険譚',
 'tags': ['異世界転移',
  'ギャグ',
  'シリアス',
  'ほのぼの',
  '男主人公',
  '冒険',
  '日常',
  'ハッピーエンド',
  'ラブコメ',
  'バトル',
  '聖女',
  'ハイファンタジー〔ファンタジー〕'],
 'author': 'とーふ',
 'chapters': 272,
 'word_count': 846366,
 'published_at': '2025-01-01 18:10:00',
 'serial_status': '連載中',
 'category': 'ハイファンタジー〔ファンタジー〕'}

In [17]:
# Short stories have only one chapter, and their serial status is always
# reported as completed ("完結済").
syosetsu("n1145lc")

{'title': 'とんずらした兄のことは許さないけれど、兄の元婚約者と幸せな家庭を築いた僕の話。',
 'tags': ['アイリスIF7大賞',
  '貴族',
  '鉄道',
  '恋愛',
  '結婚',
  '婚約破棄',
  '駆け落ち',
  'プロポーズ',
  '円満',
  '異世界〔恋愛〕'],
 'author': '葵ふたば',
 'chapters': 1,
 'word_count': 18632,
 'published_at': '2025-09-19 09:53:42',
 'serial_status': '完結済',
 'category': '異世界〔恋愛〕'}

In [27]:
# Example lookup through the R18 endpoint; the category comes from `nocgenre` (live API call).
syosetsu_r18("n6752ch")

{'title': '王太子妃になんてなりたくない！！',
 'tags': ['残酷な描写あり',
  '異世界転生',
  '異世界転移',
  'ハッピーエンド',
  'R15',
  'ファンタジー',
  '恋愛',
  '美形',
  '女性視点',
  '処女',
  '王子',
  '執着',
  'らぶえっち',
  '溺愛',
  '男性視点',
  '腹黒',
  'ときどきコメディ',
  '絶倫',
  'ムーンライトノベルズ(女性向け)'],
 'author': '月神サキ',
 'chapters': 814,
 'word_count': 2883021,
 'published_at': '2014-09-24 16:58:30',
 'serial_status': '連載中',
 'category': 'ムーンライトノベルズ(女性向け)'}