# 5chから母集団となるスレのURLを収集

In [None]:
# Progress file that records the list URLs collected so far
lists_file = "Progresses/Population/lists.txt"

# File in which the collected thread links are saved
links_file = "Links/population.txt"

# Index page of the archive (kakolog) servers
kakolog_link = "https://medaka.5ch.net/kakolog_servers.html"

# Number of parallel workers
WORKERS_N = 30

# Number of lists we want to fetch
wanted_lists = 10000

## 過去ログ一覧からターゲットのリンクをランダム選出

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_binary
import concurrent.futures
from threading import Lock
from time import sleep
import datetime
import re

### ブラウザを用意

In [None]:
class Driver:
    """Bundle one Chrome WebDriver with a flag marking whether it is in use."""

    def __init__(self, option):
        # A freshly created browser starts out unclaimed.
        self.used = False
        self.driver = webdriver.Chrome(options=option)


# Every browser in the pool shares the same headless configuration.
chrome_options = webdriver.ChromeOptions()
for flag in ("--no-sandbox", "--headless", "--disable-gpu"):
    chrome_options.add_argument(flag)

# Pool holding one browser per worker.
drivers = []
drivers.extend(Driver(chrome_options) for _ in range(WORKERS_N))

In [None]:
# For now, use just the first browser of the pool
driver = drivers[0].driver

In [None]:
# Resume support: if list URLs were already saved to ``lists_file``, load them
# so that the list-URL collection below can be skipped.
try:
    with open(lists_file, "r", encoding="utf-8") as f:
        # The file holds one URL per line; split() also drops blank lines.
        list_urls = f.read().split()
except (OSError, UnicodeError):
    # Missing or unreadable progress file: nothing collected yet.
    # (Narrower than a bare ``except`` so Ctrl-C and real bugs still surface.)
    list_urls = []

# Show a small preview when something was loaded.
if list_urls:
    print(list_urls[:5])

### 過去ログからサーバーリンク一覧を取得

In [None]:
# Links to the individual archive servers, read from the index page.
server_links = []

driver.get(kakolog_link)

html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

# Each table cell is expected to contain one anchor pointing at a server page.
server_links.extend(cell.find("a").get("href") for cell in soup.find_all("td"))

# Notebook preview: first few links and the total count.
server_links[:5], len(server_links)

### サーバーページから板を取得

In [None]:
# Absolute URLs of every board found on the server pages.
board_links = []

# Crawl the server pages only when no list URLs were loaded beforehand.
if not list_urls:
    for server_link in server_links:
        driver.get(server_link)

        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        # Even rows first, then odd rows.
        rows = soup.find_all("p", class_="board_even")
        rows += soup.find_all("p", class_="board_odd")

        # First three "/"-separated pieces of the server URL (scheme and host).
        origin = "/".join(server_link.split("/")[:3])

        for row in rows:
            board_links.append(origin + row.find("a").get("href"))

# Notebook preview: first few links and the total count.
board_links[:5], len(board_links)


### 板ページからリストURLを取得

In [None]:
import random

# Seed the module-level RNG with a fixed value before the later
# random.shuffle / random.sample calls.
random.seed(334)

In [None]:
# Serializes the find-and-claim step so that two worker threads can never be
# handed the same browser.
_driver_lock = Lock()


def GetUnusedDriver():
    """Return an idle browser wrapper from ``drivers`` and mark it as in use.

    The search and the ``used = True`` update happen under a lock; without
    it, two threads could both see the same entry as free before either one
    flagged it. Callers release the browser by setting ``used`` back to
    ``False`` when they are done.

    Returns:
        The first entry of the global ``drivers`` list whose ``used`` flag is
        false, or ``None`` (after printing "ALL USED") when every entry is
        already claimed.
    """
    with _driver_lock:
        for candidate in drivers:
            if not candidate.used:
                candidate.used = True
                return candidate

    print("ALL USED")
    return None


In [None]:
# Guards appends to the progress file across worker threads.
write_lock = Lock()


def scrape(board_link):
    """Collect the list-page URLs reachable from one board page.

    The board URL itself is always the first entry of the result. Further
    entries come from the ``p.menu_link`` items inside the page's
    ``div.menu``: each anchor whose href starts with "." is treated as
    another list page and resolved against ``board_link``; scanning stops at
    the first anchor whose href does not start with ".".

    On success the URLs are appended to ``lists_file``, one per line. On any
    error the board URL and the error message are printed, nothing is
    written, and whatever was gathered so far is still returned.

    Args:
        board_link: URL of the board page to open.

    Returns:
        List of URL strings, starting with ``board_link``.
    """
    driver = GetUnusedDriver()
    driver.used = True

    # The board page itself counts as a list.
    list_urls = [board_link]

    try:
        driver.driver.get(board_link)
        soup = BeautifulSoup(driver.driver.page_source, "html.parser")

        # Other lists are linked from the menu on the left.
        menu_div = soup.find("div", class_="menu")

        for p in menu_div.find_all("p", class_="menu_link"):
            a = p.find("a")
            if a is None:
                # Entries without an anchor are skipped (per the original
                # note, this is the "this server" label).
                continue

            # A missing or empty href is treated like a non-list link rather
            # than raising and discarding the whole board.
            href = a.get("href") or ""
            if not href.startswith("."):
                break

            # Drop the leading "./" and resolve against the board URL.
            list_urls.append(board_link + href[2:])

        # Record progress.
        with write_lock:
            with open(lists_file, "a", encoding="utf-8") as f:
                f.writelines(list_url + "\n" for list_url in list_urls)

    except Exception as e:
        print(board_link)
        print(str(e))

    finally:
        # Always hand the browser back, whatever happened above.
        driver.used = False

    return list_urls

In [None]:
# Visit the boards in random order.
random.shuffle(board_links)

# Scrape the boards only when no list URLs were loaded beforehand.
if not list_urls:
    with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS_N) as executor:
        pending = [executor.submit(scrape, board_link) for board_link in board_links]

        # Gather results as each job finishes.
        for finished in concurrent.futures.as_completed(pending):
            list_urls += finished.result()

# Notebook preview: first few URLs and the total count.
list_urls[:9], len(list_urls)

### 各リストからスレURLを取得

In [None]:
import random

# Shuffle the collected list URLs in place
random.shuffle(list_urls)

In [None]:
# Guards appends to the links file across worker threads.
write_lock = Lock()


def scrape(list_url):
    """Collect the thread URLs shown on one list page.

    Every ``p.main_odd`` row is read first, then every ``p.main_even`` row.
    The href of each row's anchor is appended to "https://" plus the host
    part of ``list_url``. Rows without an anchor or href are skipped.

    On success the URLs are appended to ``links_file``, one per line. On any
    error the list URL and the error message are printed, nothing is written,
    and whatever was gathered so far is still returned.

    Args:
        list_url: URL of the list page to open.

    Returns:
        List of thread URL strings (possibly empty).
    """
    driver = GetUnusedDriver()
    driver.used = True

    output = []

    try:
        driver.driver.get(list_url)
        soup = BeautifulSoup(driver.driver.page_source, "html.parser")

        # Third "/"-separated piece of the list URL, i.e. the host.
        base = "https://" + list_url.split("/")[2]

        # Same handling for both row classes; odd rows first, as before.
        for row_class in ("main_odd", "main_even"):
            for p in soup.find_all("p", class_=row_class):
                a = p.find("a")
                href = a.get("href") if a is not None else None
                if not href:
                    # A row without a link must not abort the whole page.
                    continue
                output.append(base + href)

        with write_lock:
            with open(links_file, "a", encoding="utf-8") as f:
                f.writelines(thread_link + "\n" for thread_link in output)

    except Exception as e:
        print(list_url)
        print(str(e))

    finally:
        # Always hand the browser back, whatever happened above.
        driver.used = False

    return output

In [None]:
# Randomly pick the lists to scrape. The sample size is capped at the number
# of available lists because random.sample raises ValueError when asked for
# more items than the population holds.
target_lists = random.sample(list_urls, min(wanted_lists, len(list_urls)))

thread_links = []

with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS_N) as executor:
    # Scrape only the sampled lists (previously every list URL was submitted
    # and the sample was ignored).
    futures = [executor.submit(scrape, url) for url in target_lists]

    # Gather results as each job finishes.
    for future in concurrent.futures.as_completed(fs=futures):
        thread_links.extend(future.result())

# Notebook preview: first few links and the total count.
thread_links[:5], len(thread_links)