In [1]:
import pandas as pd
import requests
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import requests
from io import BytesIO
import cv2
from collections import Counter
from tqdm import tqdm_notebook

In [2]:
safebouru_path = "../safebooru/all_data.csv"
out_path = "data/safebouru-faces/" 

In [3]:
anime_face_cascade_path = "../lbpcascade_animeface/lbpcascade_animeface.xml"

In [4]:
data = pd.read_csv(safebouru_path)

In [5]:
forbidden_tag = ["drawing",
           "sketch",
           "monochrome"] 

In [6]:
def im_gen(error_counter, start = 0) :
    current_index = start-1
    while current_index < len(data) :
        current_index += 1
        forbiden_tag_seen = False
        for tag in forbidden_tag :
            if tag in data.iloc[current_index,8] :
                forbiden_tag_seen = True
                error_counter["forbidden_tag"] += 1
        if not forbiden_tag_seen :
            url = data.loc[current_index,"sample_url"]
            response = requests.get("http://" + url[2:])
            if response.status_code == 404 :
                error_counter["404"] += 1
                continue
            im = Image.open(BytesIO(response.content))
            if im.mode != "RGB" :
                error_counter[im.mode] += 1
                continue
            yield im, current_index, data.iloc[current_index,8]

In [32]:
def face_gen(error_counter, start = 0) :
    gen = im_gen(error_counter, start)
    face_cascade = cv2.CascadeClassifier(anime_face_cascade_path)
    x_target = 36.3877551
    w_target = 104.44897959
    y_target = 72.02040816
    h_target = 104.44897959
    w_final = 198
    h_final = 218
    for im,idx,tags in gen :
        try :
            im = np.array(im)
            positions = face_cascade.detectMultiScale(im)
            for i,pos in enumerate(positions) :
                fx,fy,fw,fh = pos
                h,w,_ = im.shape
                x_start = fx-fw/w_target*x_target
                x_end = x_start + fw/w_target*w_final
                y_start = fy-fw/w_target*y_target
                y_end = y_start + fw/w_target*h_final
                x_start,x_end,y_start,y_end = [int(v) for v in (x_start,x_end,y_start,y_end)]
                if y_start >= 0 and x_start >= 0 and y_end < h and x_end < w :
                    res = im[y_start:y_end,x_start:x_end]
                    detected2 = face_cascade.detectMultiScale(res)
                    if len(detected2) == 1 :
                        if x_end - x_start > w_final :
                            yield res, idx, tags
                        else :
                            error_counter["too_small"] += 1
                    else :
                        error_counter["found_%d_faces" % len(detected2)] += 1
                else :
                    error_counter["rect_not_in_image"] += 1
        except Exception :
            continue

In [40]:
def extract_faces(start = 0, number=1000, start_index=0) :
    meta_data = []
    c = Counter()
    gen = face_gen(c, start_index)
    current_idx = start
    try :
        for i in tqdm_notebook(range(number)) :
            im, idx, tags = next(gen)
            im = Image.fromarray(im)
            im = im.resize((198,218))
            im.save(out_path + "{:09d}".format(current_idx) + ".jpg")
            current_idx+=1
            meta_data.append((current_idx,idx,tags))
    finally :
        print("last read:", current_idx)
        df = pd.DataFrame(meta_data, columns=["number","idx_in_safebouru", "tags"])
        df = df.set_index("number")
        df.to_csv("{:09d}.csv".format(start))
    return c

In [None]:
c = extract_faces(start = 21001, start_index = 77840, number=50000)

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

  " Skipping tag %s" % (size, len(data), tag)


In [24]:
df = pd.read_csv("000000001.csv")

In [38]:
df3 = pd.read_csv("000011001.csv")

In [39]:
df3

Unnamed: 0,number,idx_in_safebouru,tags
0,11002,43287,battle_angel_alita gally gunnm tagme
1,11003,43292,fujino_shizuru kuga_natsuki mai-hime tagme
2,11004,43310,makai_tenshi_jibril tagme
3,11005,43310,makai_tenshi_jibril tagme
4,11006,43321,.hack// kunisaki_rena kunisaki_shugo tagme
...,...,...,...
9995,20997,77817,bad_id bag brown_eyes brown_hair durarara!! ho...
9996,20998,77820,animal_ears beatrice blonde_hair blue_eyes blu...
9997,20999,77837,bob_cut bow cigarette durarara!! glasses heiwa...
9998,21000,77839,durarara!! glasses heiwajima_shizuo jacket jew...


In [28]:
df2

Unnamed: 0,number,idx_in_safebouru,tags
0,0,4394,blush flat_chest food genderbender kiss koizum...
1,1,4396,alternate_hair_color cape closed_eyes dress et...
2,2,4402,2girls aqua_eyes blush bow brown_eyes brown_ha...
3,3,4405,blush breasts cheerleader elbow_gloves flat_ch...
4,4,4405,blush breasts cheerleader elbow_gloves flat_ch...
...,...,...,...
9995,9995,43274,angel bench chomo club_maniax feathers maid wings
9996,9996,43276,club_maniax elf kazutaka_takimiya tagme tzelem...
9997,9997,43278,club_maniax roof staff studio_sdt yuuki_tatsuya
9998,9998,43279,chibi kamichu music tagme


In [31]:
Exception

Exception