# 5chから母集団となるスレをランダム抽出する

In [1]:
# Input file that holds the saved thread links
links_file = "Links/%s.txt" % "population"

# Output file that receives the scraped text
texts_file = "Only_Texts/%s.txt" % "population"

# Number of threads to sample
wanted = 10000

# Number of parallel workers (one browser each)
WORKERS_N = 30

## リンク集を取得

In [2]:
import random

# Fix the RNG seed so that, for the same links file, the sampling and
# shuffling done later in this notebook are reproducible.
random.seed(334)

In [3]:
# Read the whole links file and split it on whitespace: one link per token.
with open(links_file, mode="r", encoding="utf-8") as f:
    link_text = f.read()
thread_links = link_text.split()

### リンクをランダム抽出

In [4]:
# random.sample raises ValueError when asked for more items than the
# population holds, so fall back to sampling every available link.
if wanted > len(thread_links):
    print("Only %d links available; sampling all of them instead of %d"
          % (len(thread_links), wanted))
thread_links = random.sample(thread_links, min(wanted, len(thread_links)))

# Reorder the sample. Kept even though the sample is already in random
# order, because dropping it would change the random state seen by later
# cells and therefore the final processing order.
random.shuffle(thread_links)

## スレをスクレイピング

In [5]:
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_binary
import concurrent.futures
from threading import Lock
from time import sleep
import datetime
import re

In [6]:
# Pool of Driver wrappers (browser + busy flag) shared by the worker threads.
drivers = []

class Driver:
    """Pair one selenium Chrome browser with a flag telling whether it is busy."""

    def __init__(self, option):
        """Start a Chrome browser configured with ``option`` (a ChromeOptions)."""
        # A freshly created browser is not claimed by any worker yet.
        self.used = False
        self.driver = webdriver.Chrome(options=option)

# Every browser in the pool shares the same headless Chrome configuration.
chrome_options = webdriver.ChromeOptions()
for flag in ("--no-sandbox", "--headless", "--disable-gpu"):
    chrome_options.add_argument(flag)

# Start one browser per worker.
drivers.extend(Driver(chrome_options) for _ in range(WORKERS_N))

In [7]:
def get_from_current(soup):
    """Return the message text of every post on a current-format thread page.

    ``soup`` is the parsed page. Posts are the ``div.post`` elements inside
    ``div.thread``; each post's text is taken from its ``div.message``.
    """
    thread_root = soup.find("div", class_="thread")
    post_nodes = thread_root.find_all("div", class_="post")

    # Post body text, in page order.
    return [node.find("div", class_="message").text for node in post_nodes]

def get_from_past(soup):
    """Return the text of every post on a past-log-format thread page.

    ``soup`` is the parsed page; each ``<dd>`` element is treated as one post.
    The first ``<div>`` nested in the first post is removed from the tree
    before the text is read, so its content does not end up in that post.

    Returns an empty list when the page has no ``<dd>`` elements. A first
    post without a nested ``<div>`` is accepted as-is instead of raising.
    """
    dd = soup.find_all("dd")
    if not dd:
        return []

    # NOTE(review): presumably an extra block the site injects into the
    # first post - confirm against a real past-log page.
    leading_div = dd[0].find("div")
    if leading_div is not None:
        leading_div.decompose()

    # Post text, in page order.
    return [post.text for post in dd]

In [8]:
# Lock that keeps worker threads from writing to the output file at the same time
write_lock = Lock()

def process_content(content):
    """Escape tabs and newlines so one post fits on a single output line.

    Tabs become the literal text ``<\\t>`` and newlines become ``<\\br>``.
    """
    without_tabs = content.replace("\t", "<\\t>")
    return without_tabs.replace("\n", "<\\br>")

#Write one post to the thread text file
def write_response(f, content):
    """Write ``content`` to the open file ``f`` as one escaped line."""
    escaped = process_content(content)
    f.write("%s\n" % escaped)

#Return a browser that is not currently in use
def GetUnusedDriver():
    """Return the first entry of ``drivers`` whose ``used`` flag is false.

    Prints "ALL USED" and returns None when every browser is busy. The
    returned entry is NOT marked as used; the caller has to do that.
    """
    for driver in drivers:
        if not driver.used:
            return driver

    print("ALL USED")
    return None

# Serializes the "find a free browser and mark it busy" step so that two
# worker threads can never claim the same browser.
_driver_claim_lock = Lock()


def scrape(link):
    """Fetch one thread page and append its posts to ``texts_file``.

    Claims a free browser from the pool, loads ``link`` (reloading for as
    long as the page title contains "error"), extracts the posts with the
    parser matching the page format and appends them to the output file.

    Any exception raised while fetching, parsing or writing is caught; the
    link and the error message are printed and the link is skipped.
    Returns None in every case. If no browser is free the link is dropped.
    """
    # Claim a browser. Lookup and flag update happen under one lock;
    # otherwise two threads could both see the same browser as free.
    with _driver_claim_lock:
        driver = GetUnusedDriver()
        if driver is None:
            # No browser available
            return
        driver.used = True

    try:
        # Load the page.
        # NOTE: retries indefinitely while the title contains "error".
        while True:
            driver.driver.get(link)

            sleep(0.5)

            html = driver.driver.page_source
            soup = BeautifulSoup(html, "html.parser")

            # Guard against the "too many users" (server busy) error page
            title = soup.find("title").text

            if "error" in title:
                # Wait a moment, then try again
                sleep(1)
            else:
                # Page loaded
                break

        # Tell the current format from the past-log format: the page is
        # treated as past-log when its second <meta> tag is "og:title".
        current = soup.find_all("meta")[1].get("property") != "og:title"

        # Extract the posts
        if current:
            contents = get_from_current(soup)
        else:
            contents = get_from_past(soup)

        # Keep other workers from writing at the same time
        with write_lock:
            with open(texts_file, "a", encoding="utf-8") as f:
                for content in contents:
                    write_response(f, content)

    except Exception as e:
        print(link)
        print(str(e))

    finally:
        # Always hand the browser back, whatever happened above.
        driver.used = False

In [9]:
# Shuffle the processing order once more before submitting the work.
random.shuffle(thread_links)

with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS_N) as executor:
    futures = [executor.submit(scrape, link) for link in thread_links]

    # Block until every task has finished. as_completed() only builds a lazy
    # iterator and waits for nothing unless it is iterated, so wait() is
    # used to make the wait explicit.
    concurrent.futures.wait(futures)