In [None]:
import asyncio
import logging
from contextlib import closing
import aiohttp 
import requests, os
from urllib.parse import urlparse

async def download(url, folder_to, session, semaphore, chunk_size=1<<15):
    """Download one URL into ``folder_to`` unless the file is already there.

    The target file name is the last path component of ``url``. The body is
    streamed in ``chunk_size`` byte pieces into a temporary ``<name>.part``
    file that is renamed to the final name only after the transfer completed,
    so an interrupted download is never mistaken for a finished one.

    Args:
        url: Address of the file to fetch.
        folder_to: Destination directory; created if missing. A trailing
            path separator is accepted but not required.
        session: HTTP client whose ``get(url)`` returns an async context
            manager yielding a response with ``raise_for_status()`` and
            ``content.read(n)`` (e.g. ``aiohttp.ClientSession``).
        semaphore: ``asyncio.Semaphore`` bounding concurrent downloads.
        chunk_size: Maximum number of bytes read per iteration.

    Returns:
        1 if the file was downloaded, 0 if it already existed,
        -1 if the download failed (nothing is left on disk in that case).
    """
    async with semaphore: # limit number of concurrent downloads
        os.makedirs(folder_to, exist_ok=True)
        basename = os.path.basename(urlparse(url).path)
        if not basename:
            # URL path ends with '/', so there is no file name to save under.
            print(url+' - не скачался')
            logging.error('no file name in url %s', url)
            return -1
        filename = os.path.join(folder_to, basename)
        if os.path.isfile(filename):
            return 0

        logging.info('downloading %s', filename)
        partial = filename + '.part'
        try:
            async with session.get(url) as response_f:
                # Without this an HTTP error page would be saved as the file.
                response_f.raise_for_status()
                with open(partial, 'wb') as file:
                    while True: # save file
                        chunk = await response_f.content.read(chunk_size)
                        if not chunk:
                            break
                        file.write(chunk)
            os.replace(partial, filename)
        except Exception as exc:
            # Deliberately best-effort: one bad URL must not abort the batch
            # that asyncio.gather() is running for the other files.
            print(url+' - не скачался')
            logging.error('download failed %s: %r', url, exc)
            try:
                os.remove(partial)
            except OSError:
                pass # nothing was written, or it is already gone
            return -1
        logging.info('done %s', filename)
        return 1
            
            
async def parallel_download_to_folder (file_urls, folder_to, streams):
    """Fetch all ``file_urls`` into ``folder_to`` with bounded concurrency.

    A single ``aiohttp.ClientSession`` is shared by every transfer and an
    ``asyncio.Semaphore(streams)`` is handed to each ``download`` call so
    that at most ``streams`` of them run at the same time.

    Returns:
        The list produced by ``asyncio.gather``: one ``download`` result per
        URL, in the same order as ``file_urls``.
    """
    limiter = asyncio.Semaphore(streams)
    async with aiohttp.ClientSession() as http:
        pending = [download(link, folder_to, http, limiter) for link in file_urls]
        results = await asyncio.gather(*pending)
    return results
    

In [None]:
from bs4 import BeautifulSoup
import os, json, time
from datetime import datetime

async def get_all_files_from_4chan_catalog(folder_to_base, board = 'b', streams = 4, logging_level = logging.INFO):
    """Download the attached files of every thread listed in a board catalog.

    The catalog page embeds its data as ``var catalog = {...}`` inside a
    ``<script>`` tag; the keys of its ``"threads"`` object are the thread
    ids. Each thread page is fetched, every ``<a class="fileThumb">`` link
    is collected, and the files are saved under
    ``<folder_to_base>/<board>/<thread id>/``. One summary line is printed
    per thread.

    Args:
        folder_to_base: Root directory for downloads (trailing separator
            optional).
        board: Board short name used in the URLs.
        streams: Maximum number of simultaneous downloads per thread.
        logging_level: Level passed to ``logging.basicConfig``.

    Raises:
        RuntimeError: if the catalog page contains no ``var catalog`` script.
        requests.RequestException: if the catalog page itself cannot be fetched.
    """
    logging.basicConfig(level=logging_level, format='%(asctime)s %(message)s')
    site = 'http://boards.4channel.org/'
    catalog_url = site+board+'/catalog'
    request_timeout = 30  # seconds; requests waits indefinitely without one

    # NOTE(review): requests is synchronous, so page fetches block the event
    # loop while they run; only the file downloads are concurrent.
    cat_page = requests.get(catalog_url, timeout=request_timeout)
    # Name the parser explicitly so results do not depend on what is installed.
    cat = BeautifulSoup(cat_page.text, 'html.parser')
    script = cat.find("script", text=lambda text: text and "var catalog" in text)
    if script is None:
        raise RuntimeError('catalog data not found at '+catalog_url)

    # raw_decode parses exactly one JSON value and ignores the JavaScript that
    # follows it, so no assumption about the trailing code is needed.
    after_marker = script.text.split('var catalog =', 1)[1].lstrip()
    data, _ = json.JSONDecoder().raw_decode(after_marker)

    for post in data["threads"]:
        start_time = time.monotonic()
        thread_url = site+board+'/thread/'+str(post)
        try:
            page = requests.get(thread_url, timeout=request_timeout)
        except requests.RequestException as exc:
            # One unreachable thread should not stop the remaining ones.
            print("Post "+str(post)+' - request failed: '+str(exc))
            continue
        thread = BeautifulSoup(page.text, 'html.parser')
        # thread.title is a Tag; compare its text, not the Tag object itself.
        title_text = thread.title.get_text() if thread.title else ''
        if page.status_code == 404 or title_text == '4chan - 404 Not Found':
            print("Post "+str(post)+' - 404')
            continue

        # hrefs are prefixed with a scheme as in the original code
        # (presumably protocol-relative, '//host/path' - confirm).
        file_urls = [
            'http:'+link.get("href")
            for link in thread.find_all("a", attrs={"class": "fileThumb"})
            if link.get("href")
        ]

        if not file_urls:
            print("Post "+str(post)+' - No files (already 404ed)')

        # The trailing '' keeps the path ending in a separator.
        folder_to = os.path.join(folder_to_base, board, str(post), '')
        response = await parallel_download_to_folder (file_urls, folder_to, streams)
        elapsed_time = time.monotonic() - start_time
        print('{} Post {}: {} files was here, {} added. Total: {} files. Elapsed {} s.'.format(
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            post,
            response.count(0),
            response.count(1),
            len(response),
            round(elapsed_time, 2),
        ))
    print("That's all Folks!")

In [None]:
await get_all_files_from_4chan_catalog('D:/pd/4chan_files/','b',4,logging.ERROR)