In [1]:
!pip install requests-html

Collecting requests-html
  Downloading requests_html-0.10.0-py3-none-any.whl.metadata (15 kB)
Collecting pyquery (from requests-html)
  Downloading pyquery-2.0.1-py3-none-any.whl.metadata (9.0 kB)
Collecting fake-useragent (from requests-html)
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Collecting parse (from requests-html)
  Downloading parse-1.20.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting bs4 (from requests-html)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting w3lib (from requests-html)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting pyppeteer>=0.0.14 (from requests-html)
  Downloading pyppeteer-2.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyee<12.0.0,>=11.0.0 (from pyppeteer>=0.0.14->requests-html)
  Downloading pyee-11.1.1-py3-none-any.whl.metadata (2.8 kB)
Collecting websockets<11.0,>=10.0 (from pyppeteer>=0.0.14->requests-html)
  Downloading websockets-10.4-cp310-cp310-manylinux_2_5_x86_6

In [2]:
!pip install lxml_html_clean

Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.1-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.1


In [3]:
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import re
import nest_asyncio
import pandas as pd 
import datetime
import time

In [4]:
# Patch asyncio so event loops can be nested; presumably required because the
# notebook kernel already runs a loop that requests_html would otherwise clash
# with — TODO confirm.
nest_asyncio.apply()
# One shared HTTP session, reused for every manga page request further down.
session = HTMLSession()

### 👉 Crawl urls

In [5]:
# Offsets passed to the ranking page's `limit` query parameter:
# 5000, 5050, ..., 9950.
RANK_START, RANK_STOP, RANK_STEP = 5000, 10000, 50
# Upper bound (seconds) on each request so a stalled connection cannot hang the cell.
LIST_REQUEST_TIMEOUT = 30

listUrl2 = []

for i in range(RANK_START, RANK_STOP, RANK_STEP):
    # URL of the ranking page to scrape
    url = f'https://myanimelist.net/topmanga.php?limit={i}'

    # Get the html content
    html = requests.get(url, timeout=LIST_REQUEST_TIMEOUT).text

    # Parse the html content
    soup = BeautifulSoup(html, "html.parser")

    # Get the list of manga (one title cell per ranked entry)
    listItem = soup.find_all("td", {"class": "title al va-t clearfix word-break"})

    # Get the url of each manga; cells without a usable link are skipped
    # instead of raising AttributeError or storing None in the list
    for item in listItem:
        link = item.find('a')
        href = link.get('href') if link is not None else None
        if href:
            listUrl2.append(href)

    # Print the number of manga urls collected
    print(f'{len(listUrl2)} urls collected', end='\r', flush=True)

listUrl2

5000 urls collected

['https://myanimelist.net/manga/47537/Stay_Gold',
 'https://myanimelist.net/manga/187/Kindan_no_Koi_wo_Shiyou',
 'https://myanimelist.net/manga/334/Gravitation__The_Novel',
 'https://myanimelist.net/manga/530/Snow_Drop',
 'https://myanimelist.net/manga/543/Akuma_no_Ororon',
 'https://myanimelist.net/manga/686/Atashi_wa_Bambi',
 'https://myanimelist.net/manga/846/Eagle',
 'https://myanimelist.net/manga/1081/Slayers__Knight_of_the_Aqualord',
 'https://myanimelist.net/manga/1658/Dororo',
 'https://myanimelist.net/manga/3028/MANA',
 'https://myanimelist.net/manga/3266/Koi_ni_Ochita_Oujisama',
 'https://myanimelist.net/manga/4344/Aflame_Inferno',
 'https://myanimelist.net/manga/5242/Peter_Panda',
 'https://myanimelist.net/manga/5258/Seishun_Shiteru_Kai',
 'https://myanimelist.net/manga/7402/Dou_Danjo',
 'https://myanimelist.net/manga/8585/Hanako_to_Guuwa_no_Teller',
 'https://myanimelist.net/manga/8698/Do-Re-Mi-Fa-Sol-La-Si-Do',
 'https://myanimelist.net/manga/9144/Fire_Emblem__Hasha_no_Tsu

### 👉 Concatenate 4 list urls

In [6]:
# Only one URL list is built in this notebook, so the combined list is simply
# another name for listUrl2 (same list object, no copy is made).
listUrl = listUrl2
print(f'Total: {len(listUrl)} urls collected')

Total: 5000 urls collected


In [7]:
# Persist the collected URLs to disk, one URL per line.
with open("/kaggle/working/link_collecting_2.txt", "w") as url_file:
    url_file.write("".join(f"{link}\n" for link in listUrl2))

<a class="anchor" id="collect_data"></a>

## <span style='color:#2B9C15 '> 📕 Collect data of each manga  </span>
1. From each url collected above, send a GET request to get the HTML content of the page.
2. If the length of the HTML content is smaller than 4000 characters, sleep for 200 seconds and send the GET request again, because a response that short means the website has blocked the connection and we need to wait for a while before retrying.
3. Save the HTML content in a list for parsing later.

This process still splits into 2 parts, each part collects 5000 HTML contents to avoid the connection being interrupted by the website due to too many requests.

### 👉 Crawl HTML content from the 20000 manga/light novel/... URLs

In [8]:
# A response shorter than this many characters is treated as a "blocked" page.
MIN_HTML_LENGTH = 4000
# Pause between retries of a blocked page, in seconds.
RETRY_DELAY_SECONDS = 200
# Give up on a URL after this many retries so one bad page cannot stall the crawl forever.
MAX_RETRIES = 10
# Upper bound (seconds) on each request so a stalled connection cannot hang the cell.
PAGE_REQUEST_TIMEOUT = 30

listHtml1 = []

for url in listUrl[0:5000]:
    html_text = ''
    for attempt in range(MAX_RETRIES + 1):
        try:
            html_text = session.get(url, timeout=PAGE_REQUEST_TIMEOUT).text
        except requests.RequestException:
            # Network failure: handle it like a blocked response and retry
            html_text = ''

        if len(html_text) >= MIN_HTML_LENGTH:
            break

        if attempt < MAX_RETRIES:
            # Sleep for 200 seconds before asking again
            time.sleep(RETRY_DELAY_SECONDS)

    # Always append exactly one entry per URL so listHtml1 stays aligned with
    # listUrl, even when the last attempt still returned a short page.
    listHtml1.append(html_text)

    # Print the number of manga html collected
    print(f'{len(listHtml1)}/{len(listUrl)} manga html collected', end='\r', flush=True)

5000/5000 manga html collected

In [9]:
# Extract time of data collection to report for the project
now = datetime.datetime.now()
now = now.strftime("%Y-%m-%d")
print("Time of data collection: ", now)

Time of data collection:  2024-11-17


### 👉 Concatenate 2 list htmls

In [10]:
# Only one HTML list is built in this notebook, so the combined list is simply
# another name for listHtml1 (same list object, no copy is made).
listHtml = listHtml1
print(f'Total: {len(listHtml)} manga html collected')

Total: 5000 manga html collected


### 👉 Extracting the detailed values of each comic website page

1. Parsing HTML Content: The function starts by using BeautifulSoup to parse the HTML content of a comic page

2. Extracting Title:
    - The title of the comic is extracted using `soup.find('span', {'itemprop': 'name'})`
    - If the title is not found (i.e., None), the function returns None to indicate that the information couldn't be extracted

3. Handling English Title (title-english):
    - If an English title is present (indicated by the presence of a title-english span), it is extracted and removed from the main title. The resulting title is a combination of the original title and the English title enclosed in parentheses

4. Extracting Rating Information:
    - The rating information is extracted using `soup.find('span', {'itemprop': 'ratingValue'}).text` and `soup.find('span', {'itemprop': 'ratingCount'}).text`
    - These represent the rating value and the count of ratings, respectively

5. Extracting Rank and Popularity:
    - The rank and popularity are extracted using regular expressions (`re.findall`)
    - The regular expression `r'\d+'` is used to find all sequences of digits in the text, and [0] is used to select the first match

6. Looping Through Information Sections:
    - The function iterates through the information sections of the manga page, represented by `div` elements with the `class 'spaceit_pad'`

7. Extracting Manga Details:
    - For each section, it checks the content and extracts relevant details such as `volumes`, `chapters`, `status`, `published date`, `genres`, `themes`, `authors`, `favorites`, and `members`
    - The extracted information is stored in the respective variables

8. Extracting Review Information:
    - The function then moves to the `'manga-info-review__header'` section to extract information related to reviews
    - It retrieves the `total number` of reviews and the number of reviews for each type (`recommended`, `mixed-feelings`, `not-recommended`)

9. Returning a Dictionary:  
    - The function compiles all the extracted information into a dictionary; the dictionaries are then converted to a pandas DataFrame



In [11]:
# Sanity check: how many pages were collected, plus a short preview of the last one.
# The last page is addressed with index -1 so the cell does not depend on there
# being exactly 5000 pages, and the preview is truncated because a whole page
# is far too long to be useful in the cell output.
print(len(listHtml))
if listHtml:
    print(listHtml[-1][:1000])

5000

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html lang="en" xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml" class="appearance-none">
<head>
    
<link rel="preconnect" href="//www.googletagmanager.com/" crossorigin="anonymous"/>
<link rel="preconnect" href="https://cdn.myanimelist.net" crossorigin="anonymous"/>
<title>
Ovally Trap | One-shot - MyAnimeList.net
</title>
<meta name="description" content="Looking for information on the manga Ovally Trap? Find out more with MyAnimeList, the world&#039;s most active online anime and manga community and database. Tokunaga-kun, Chiharu Taniyama&#039;s classmate and recent neighbor, is always laughing at her, and it annoys her to no end. How can she be cute? Then, she finds a magazine with tips on being popular. Well she get her revenge on Tokunaga?" />

  
<meta name="keywords" content="anime, myanimelist, anime news, manga" />
  

In [15]:
def _tag_text(tag):
    """Return ``tag.text`` unchanged, or '' when the tag was not found."""
    return tag.text if tag is not None else ''


def _first_number(tag):
    """Return the first run of digits in the tag's text, or '' if there is none."""
    digits = re.findall(r'\d+', _tag_text(tag))
    return digits[0] if digits else ''


def extract_info(htmlComic):
    """Parse one manga page and return its details as a dictionary.

    Parameters
    ----------
    htmlComic : str
        HTML source of a single manga page.

    Returns
    -------
    dict or None
        ``None`` when the page has no ``<span itemprop="name">`` title.
        Otherwise a dictionary with the keys "Title", "Score", "Vote",
        "Ranked", "Popularity", "Members", "Favorite", "Types", "Volumes",
        "Chapters", "Status", "Published", "Genres", "Themes", "Demographic",
        "Serialization", "Author", "Total Review" and "Type Review".
        "Genres" and "Themes" are lists of strings. "Type Review" is a list of
        three ints (recommended, mixed feelings, not recommended), or an empty
        list when the review header is absent. Every other value is a string,
        and is '' when the matching element is missing from the page.
    """
    soup = BeautifulSoup(htmlComic, "html.parser")

    title = soup.find('span', {'itemprop': 'name'})
    if title is None:
        return None

    title_text = title.text.strip()
    title_english_span = title.find('span', {'class': 'title-english'})
    if title_english_span is not None:
        title_english_text = title_english_span.text.strip()
        if title_english_text and title_text.endswith(title_english_text):
            # Cut only the trailing English title. str.replace would remove
            # every occurrence, which wipes the main title when both are equal.
            title_text = title_text[:-len(title_english_text)].rstrip()
        else:
            title_text = title_text.replace(title_english_text, '')
        title = f'{title_text} ({title_english_text})'
    else:
        title = title_text

    # Missing nodes yield '' instead of raising AttributeError / IndexError.
    ratingValue = _tag_text(soup.find('span', {'itemprop': 'ratingValue'}))
    ratingCount = _tag_text(soup.find('span', {'itemprop': 'ratingCount'}))
    ranked = _first_number(soup.find('span', {'class': 'numbers ranked'}))
    popularity = _first_number(soup.find('span', {'class': 'numbers popularity'}))

    volumes, chapters, status, published = '', '', '', ''
    genres, themes, authors, favorites, members = [], [], '', '', ''
    type_, demographic, serialization = '', '', ''

    for space in soup.find_all("div", {'class': 'spaceit_pad'}):
        text = space.text.strip()

        if 'Type:' in text:
            type_ = text.split(':', 1)[1].strip()
        elif 'Volumes:' in text:
            volumes = text.split(':', 1)[1].strip()
        elif 'Chapters:' in text:
            chapters = text.split(':', 1)[1].strip()
        elif 'Status:' in text:
            # Take the text node that directly follows <span class="dark_text">;
            # fall back to the text after the label when that node is absent.
            label = space.find('span', {'class': 'dark_text'})
            sibling = label.next_sibling if label is not None else None
            if isinstance(sibling, str):
                status = sibling.strip()
            else:
                status = text.split(':', 1)[1].strip()
        elif 'Published:' in text:
            published = text.split(':', 1)[1].strip()
        elif 'Genres:' in text or 'Genre:' in text:
            genres = [gen.text.strip() for gen in space.find_all('a')]
        elif 'Themes:' in text or 'Theme:' in text:
            # Theme names are read from the <a> links of the row
            themes = [theme.text.strip() for theme in space.find_all('a')]
        elif 'Demographic:' in text or 'Demographics:' in text:
            # Guarded like the serialization row: '' when no <a> is present
            demographic_tag = space.find('a')
            demographic = demographic_tag.text.strip() if demographic_tag else ''
        elif 'Serialization:' in text or 'Serializations:' in text:
            serialization_tag = space.find('a')
            serialization = serialization_tag.text.strip() if serialization_tag else ''
        elif 'Authors:' in text or 'Author:' in text:
            # Split on the first ':' only, like every other field, so a ':'
            # inside the author list does not truncate the value.
            authors = text.split(':', 1)[1].strip()
        elif 'Favorites:' in text:
            favorites = text.split(':', 1)[1].strip()
        elif 'Members:' in text:
            members = text.split(':', 1)[1].strip()

    totalReviews, typeReview = '', []
    infoReviews = soup.find('div', {'class': 'manga-info-review__header mal-navbar'})
    if infoReviews is not None:
        totalReviews = _first_number(infoReviews.find('div', {'class': 'right'}))
        # Order: recommended, mixed feelings, not recommended.
        # A counter that cannot be read is reported as 0.
        typeReview = [
            int(_first_number(infoReviews.find('div', {'class': review_class})) or 0)
            for review_class in ('recommended', 'mixed-feelings', 'not-recommended')
        ]

    return {
        "Title": title, "Score": ratingValue, "Vote": ratingCount,
        "Ranked": ranked, "Popularity": popularity, "Members": members,
        "Favorite": favorites, "Types": type_, "Volumes": volumes,
        "Chapters": chapters, "Status": status, "Published": published,
        "Genres": genres, "Themes": themes, "Demographic": demographic, "Serialization": serialization,
        "Author": authors, "Total Review": totalReviews, "Type Review": typeReview
    }

# Parse every downloaded page and keep only those for which extract_info
# produced a record (it returns None when a page has no title).
data_list = []
total_pages = len(listHtml)
for position, page_html in enumerate(listHtml, start=1):
    record = extract_info(page_html)
    if record is not None:
        data_list.append(record)
    # Report progress after each page; the carriage return keeps it on one line
    print(f"Đã xử lý {position}/{total_pages} phần tử.", end='\r', flush=True)

df = pd.DataFrame(data_list)

Đã xử lý 5000/5000 phần tử.

In [16]:
# Preview the first rows of the parsed table.
df.head()

Unnamed: 0,Title,Score,Vote,Ranked,Popularity,Members,Favorite,Types,Volumes,Chapters,Status,Published,Genres,Themes,Demographic,Serialization,Author,Total Review,Type Review
0,Stay Gold,7.28,504,5001,9276,2045,11,Manga,6,43,Finished,"Nov 28, 2012 to Dec 24, 2020",[Boys Love],[],,onBLUE,Hideyoshico (Story & Art),1,"[0, 1, 0]"
1,Kindan no Koi wo Shiyou,7.28,2540,5002,4468,4826,35,Manga,1,5,Finished,2000,"[Drama, Fantasy, Romance]",[],Josei,Petit Comic,"Ohmi, Tomu (Story & Art)",2,"[1, 0, 1]"
2,Gravitation: The Novel,7.28,639,5003,11735,1499,22,Light Novel,1,8,Finished,2000 to ?,"[Boys Love, Comedy]",[],,,"Murakami, Maki (Art), Jun, Renon (Story)",0,"[0, 0, 0]"
3,Snow Drop,7.28,979,5004,7807,2539,42,Manhwa,12,39,Finished,Sep 1998 to 2003,"[Drama, Romance, Slice of Life]",[],Shoujo,Issue,"Choi, Kyung-ah (Story & Art)",2,"[2, 0, 0]"
4,Akuma no Ororon (The Demon Ororon),7.28,1504,5005,6354,3254,143,Manga,4,21,Finished,Apr 1998 to Jan 2001,"[Fantasy, Romance, Supernatural]",[],Shoujo,Wings,"Hakase, Mizuki (Story & Art)",5,"[3, 1, 1]"


In [18]:
# Save the raw parsed table as CSV without the row index; the 'utf-8-sig'
# codec writes a UTF-8 byte-order mark at the start of the file.
df.to_csv('/kaggle/working/raw_manga_2.csv', encoding='utf-8-sig', index=False)