In [1]:
import logging
import os
import shutil
from tqdm.notebook import tqdm

from collector.url_collector import Collector
from config import DOWNLOAD_CONFIG
from crawler.ranking import RankingMeta
from downloader.downloader import Downloader

In [2]:
# Make sure the download directory exists. exist_ok=True keeps this cell
# re-runnable and avoids the race of a separate exists() check.
os.makedirs(DOWNLOAD_CONFIG["STORE_PATH"], exist_ok=True)

# FileHandler opens the log file right away and fails when its parent
# directory is missing, so create it first (skipped for a bare file name).
log_dir = os.path.dirname(DOWNLOAD_CONFIG["LOG_PATH"])
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

# Records at INFO and above go to both the log file and the notebook output.
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                    level=logging.INFO,
                    handlers=[
                        logging.FileHandler(DOWNLOAD_CONFIG["LOG_PATH"]),
                        logging.StreamHandler()
                    ])

# ranking: gather the artwork ids to fetch
rm = RankingMeta()
artwork_ids = rm.collect_meta()
logging.info(f"artwork ids: {artwork_ids}")

# collecting: turn the artwork ids into image URLs
collector = Collector()
collector.add(artwork_ids)
urls = collector.collect()
# One URL per line keeps the log entry readable; urls must therefore hold strings.
url_str = "\n".join(urls)
logging.info(f"img urls: {url_str}")

# download: fetch every collected URL; download() returns the total size,
# which the log message below reports in MB.
downloader = Downloader()
downloader.add(urls)
totalsize = downloader.download()
logging.info(f"total size of {totalsize} MB downloaded")

2022-08-19 23:04:49,537:INFO:GET: https://www.pixiv.net/ranking.php?p=1&mode=daily&date=20220818&format=json&content=illust
2022-08-19 23:04:50,304:INFO:GET: https://www.pixiv.net/ranking.php?p=2&mode=daily&date=20220818&format=json&content=illust
2022-08-19 23:04:50,306:INFO:COMPLETED DATE: 20220818, total illust id collected: 100
2022-08-19 23:04:51,309:INFO:artwork ids: [100540926, 100540913, 100555333, 100564879, 100564869, 100559860, 100551907, 100541741, 100571086, 100575554, 100558780, 100540957, 100564881, 100544329, 100565098, 100541070, 100582212, 100564854, 100546452, 100579129, 100540932, 100564889, 100581180, 100540976, 100556465, 100541064, 100575798, 100541517, 100566374, 100564958, 100541182, 100541029, 100561470, 100552921, 100564888, 100541108, 100568901, 100560600, 100563740, 100568649, 100547484, 100553639, 100552175, 100564929, 100561795, 100540977, 100542345, 100541030, 100574433, 100575552, 100569231, 100541940, 100549754, 100541492, 100584395, 100540959, 1005713

downloading: 100%|████████████████████████████████| 208/208 [07:18<00:00,  2.11s/it]
2022-08-19 23:12:42,442:INFO:total size of 483.86496925354004 MB downloaded


In [3]:
from datasets import load_dataset
from PIL import Image
from IPython import display
from transformers import ConvNextFeatureExtractor, AutoModelForImageClassification
from torchvision.transforms import (
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    ToTensor,
)
import torch 
from transformers import AutoFeatureExtractor, AutoModelForImageClassification
from transformers import pipeline


In [4]:
# Directory holding the saved classifier and its preprocessing config.
path = "./model"

# Both loaders are passed local_files_only=True and read from the same
# directory; the two loads are independent of each other.
model = AutoModelForImageClassification.from_pretrained(
    path,
    local_files_only=True,
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
    path,
    local_files_only=True,
)

# Bundle preprocessing and model into one callable for later cells.
pipe = pipeline(
    task="image-classification",
    model=model,
    feature_extractor=feature_extractor,
)

In [17]:
# File extensions (lower-case, without the dot) that are treated as images.
image_suffix = ['png', 'jpg', 'jpeg', 'bmp']

# Sub-folder of the store path that receives images labelled "true".
true_dest = os.path.join(DOWNLOAD_CONFIG["STORE_PATH"], 'true')
# exist_ok=True makes the cell safe to re-run; unlike os.mkdir, makedirs also
# creates the store path itself when it is missing.
os.makedirs(true_dest, exist_ok=True)
    
def pipe_label(pipe_obj):
    """Return the label of the highest-scoring prediction.

    Args:
        pipe_obj: iterable of mappings, each with a ``'score'`` and a
            ``'label'`` entry.

    Returns:
        The ``'label'`` of the entry with the greatest ``'score'``. When
        scores tie, the greater label wins because entries are compared as
        ``(score, label)`` tuples.

    Raises:
        IndexError: if ``pipe_obj`` is empty.
    """
    ranked = sorted(
        ((entry['score'], entry['label']) for entry in pipe_obj),
        reverse=True,
    )
    _, best_label = ranked[0]
    return best_label

# Classify every image file in the store path and move the ones labelled
# "true" into true_dest.
for img in tqdm(sorted(os.listdir(DOWNLOAD_CONFIG["STORE_PATH"]))):
    img_path = os.path.join(DOWNLOAD_CONFIG["STORE_PATH"], img)
    # splitext yields "" for a name without an extension, so a file literally
    # called "png" is no longer mistaken for an image.
    extension = os.path.splitext(img)[1][1:].lower()
    if extension not in image_suffix or not os.path.isfile(img_path):
        continue
    try:
        # Release the file handle once the pixels are decoded; convert()
        # returns an independent in-memory image.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
    except OSError as exc:
        # A truncated or corrupt download should not abort the whole batch.
        logging.warning("skipping unreadable image %s: %s", img_path, exc)
        continue
    if pipe_label(pipe(image)) == "true":
        # One move instead of copy + delete.
        shutil.move(img_path, os.path.join(true_dest, img))

  0%|          | 0/210 [00:00<?, ?it/s]

[(0.0014312764396890998, 'true'), (0.9985686540603638, 'false')]
[(0.0005228175432421267, 'false'), (0.999477207660675, 'true')]
[(0.2835526764392853, 'false'), (0.7164472937583923, 'true')]
[(0.1864221692085266, 'true'), (0.8135778307914734, 'false')]
[(0.005059160757809877, 'true'), (0.9949408769607544, 'false')]
[(0.005986302625387907, 'true'), (0.9940137267112732, 'false')]
[(6.245411350391805e-05, 'true'), (0.9999375343322754, 'false')]
[(0.0009727710275910795, 'true'), (0.9990272521972656, 'false')]
[(0.21607562899589539, 'false'), (0.783924400806427, 'true')]
[(0.15523748099803925, 'true'), (0.8447625041007996, 'false')]
[(0.014031998813152313, 'false'), (0.9859679937362671, 'true')]
[(0.0021539879962801933, 'true'), (0.9978460073471069, 'false')]
[(7.702937728026882e-05, 'true'), (0.9999229907989502, 'false')]
[(0.00212307320907712, 'true'), (0.9978768825531006, 'false')]
[(0.10658828914165497, 'true'), (0.8934116959571838, 'false')]
[(0.05461689829826355, 'true'), (0.945383071

[(3.294023917987943e-05, 'true'), (0.999967098236084, 'false')]
[(8.558727131458e-06, 'true'), (0.9999914169311523, 'false')]
[(1.7822178051574156e-05, 'true'), (0.9999821186065674, 'false')]
[(0.2355484515428543, 'true'), (0.7644515633583069, 'false')]
[(0.45186856389045715, 'true'), (0.5481313467025757, 'false')]
[(4.593940957420273e-06, 'true'), (0.9999953508377075, 'false')]
[(7.675454980926588e-06, 'true'), (0.9999923706054688, 'false')]
[(0.04849633574485779, 'false'), (0.9515036940574646, 'true')]
[(9.546447472530417e-06, 'true'), (0.9999904632568359, 'false')]
[(0.4416375160217285, 'false'), (0.5583624243736267, 'true')]
[(0.1376999020576477, 'true'), (0.8623000979423523, 'false')]
[(0.0008588842465542257, 'true'), (0.9991410970687866, 'false')]
[(0.38111013174057007, 'true'), (0.6188898086547852, 'false')]
[(3.5993220990349073e-06, 'true'), (0.9999964237213135, 'false')]
[(0.000772531668189913, 'false'), (0.9992275238037109, 'true')]
[(8.397689089179039e-05, 'true'), (0.999915