In [1]:
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Count how many files each identity directory of the dataset contains and
# collect the result as a two-column frame (identity, count).
identities = os.listdir('data/casia_webface/casia-webface')
identities_counts = pd.DataFrame(
    [
        (identity, len(os.listdir(f'data/casia_webface/casia-webface/{identity}')))
        for identity in tqdm(identities)
    ],
    columns=['identity', 'count'],
)

100%|██████████| 10572/10572 [00:00<00:00, 50257.43it/s]


In [3]:
# Identities ranked from the most photos to the fewest.
identities_counts.sort_values('count', ascending=False)

Unnamed: 0,identity,count
9278,000819,802
3291,003410,744
4930,000138,710
9801,001321,651
4712,003384,648
...,...,...
1133,000759,5
7654,009804,5
8565,004383,4
2776,004328,4


In [4]:
# Keep only identities that have between 80 and 100 photos (both bounds
# inclusive), ordered by ascending photo count.
good_identities = (
    identities_counts
    .loc[lambda frame: frame['count'].between(80, 100)]
    .sort_values(by='count')
)
good_identities

Unnamed: 0,identity,count
9843,004152,80
4920,010350,80
9967,008469,80
9954,009087,80
3172,005833,80
...,...,...
2607,000581,100
8391,009771,100
5764,005993,100
5258,006094,100


In [5]:
# Disabled one-off step: copy the photo directory of every selected identity
# into a separate folder. Kept commented out so it does not run on re-execution.
# NOTE(review): if re-enabled, prefer os.makedirs / shutil.copytree over
# building shell command strings with os.system.
# path_to_save = 'data/good_photos_from_casia_webface'

# os.system(f'mkdir {path_to_save}')

# for identity in good_identities['identity']:
#     os.system(f'cp -r data/casia_webface/casia-webface/{identity} {path_to_save}/{identity}')

In [6]:
from facenet_pytorch import MTCNN, InceptionResnetV1
import torch
import cv2

In [7]:
import facenet_pytorch

In [8]:
# Face detector configured to return a single face per image (the largest one).
mtcnn = MTCNN(
    keep_all=False,
    select_largest=True,
)

In [9]:
# Embedding network with the 'vggface2' pretrained weights, switched to eval mode.
resnet = InceptionResnetV1(pretrained='vggface2')
resnet = resnet.eval()

In [10]:
# Build one row per photo of every selected identity:
#   [identity, file, <embedding values...>, <256 key bits...>]
# Photos that cannot be read, contain no detectable face, or make the models
# raise are skipped and recorded in `skipped` instead of being silently dropped.
data = []
skipped = []  # (identity, file, reason) for every photo that produced no row
for identity in tqdm(good_identities['identity']):
    directory = os.listdir(f'data/casia_webface/casia-webface/{identity}')

    # One random 256-bit key per identity, shared by all of its photos.
    # NOTE(review): the RNG is not seeded, so the keys differ between runs.
    key = np.random.randint(0, 2, (256,)).astype(np.float16)

    for file in directory:
        face_image = cv2.imread(f'data/casia_webface/casia-webface/{identity}/{file}')
        if face_image is None:
            # cv2.imread returns None (it does not raise) for files it cannot decode.
            skipped.append((identity, file, 'unreadable image'))
            continue
        face_image = face_image[:, :, [2, 1, 0]]  # BGR -> RGB channel order

        try:
            # Inference only: no autograd graph is needed for the embeddings.
            with torch.no_grad():
                face_image_mtcnn = mtcnn(face_image)
                if face_image_mtcnn is None:
                    # The detector found no face in this photo.
                    skipped.append((identity, file, 'no face detected'))
                    continue
                face_embedding = resnet(face_image_mtcnn.unsqueeze(0))[0].tolist()
        except Exception as exc:
            # Best effort: one bad photo must not abort the whole run, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            skipped.append((identity, file, repr(exc)))
            continue

        data.append([identity, file] + face_embedding + list(key))

print(f'embedded {len(data)} photos, skipped {len(skipped)}')

  0%|          | 0/395 [00:00<?, ?it/s]

In [10]:
# Count, over the first 10 selected identities, how many photos the MTCNN
# detector fails on (unreadable file, no face returned, or an exception).
errors = 0
for identity in tqdm(good_identities['identity'][:10]):
    directory = os.listdir(f'data/casia_webface/casia-webface/{identity}')

    for file in directory:
        face_image = cv2.imread(f'data/casia_webface/casia-webface/{identity}/{file}')
        if face_image is None:
            # cv2.imread returns None for files it cannot decode; count it as
            # a failure instead of crashing on the channel indexing below.
            errors += 1
            continue
        face_image = face_image[:, :, [2, 1, 0]]  # BGR -> RGB channel order

        try:
            # A photo without a detected face yields None, so .unsqueeze raises
            # and the photo is counted as a failure.
            face_image_mtcnn = mtcnn(face_image).unsqueeze(0)
        except Exception:
            # Exception (not a bare except) so the loop can still be interrupted.
            errors += 1

100%|██████████| 10/10 [00:14<00:00,  1.41s/it]


In [11]:
# Haar cascade frontal-face detector loaded from the cascade files bundled with OpenCV.
cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(cascade_path)

In [16]:
# Count, over the first 10 selected identities, how many photos the Haar
# cascade detector fails on (unreadable file, no face found, or an exception).
errors = 0
for identity in tqdm(good_identities['identity'][:10]):
    directory = os.listdir(f'data/casia_webface/casia-webface/{identity}')

    for file in directory:
        face_image = cv2.imread(f'data/casia_webface/casia-webface/{identity}/{file}')
        if face_image is None:
            # cv2.imread returns None for files it cannot decode.
            errors += 1
            continue
        face_image = face_image[:, :, [2, 1, 0]]  # BGR -> RGB channel order

        try:
            gray = cv2.cvtColor(face_image, cv2.COLOR_RGB2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
        except Exception:
            errors += 1
            continue

        # detectMultiScale reports "no face" as an empty result rather than an
        # exception, so a missed detection has to be counted explicitly.
        if len(faces) == 0:
            errors += 1

100%|██████████| 10/10 [00:03<00:00,  2.62it/s]


In [10]:
# Drop the references to the detector and the embedding network.
# NOTE(review): the 'pure_photos' cell further down still calls mtcnn and
# resnet, so on a top-to-bottom run they must be re-created before it.
del mtcnn
del resnet

In [12]:
# Turn the collected rows into a frame and label the columns:
# two id columns, then the embedding columns, then the key columns.
embedding_columns = [f'emb_{i}' for i in range(512)]
key_columns = [f'key_{i}' for i in range(256)]
df = pd.DataFrame(data)
df.columns = ['name', 'file', *embedding_columns, *key_columns]
df

Unnamed: 0,name,file,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,key_246,key_247,key_248,key_249,key_250,key_251,key_252,key_253,key_254,key_255
0,004152,00241344.jpg,0.036283,0.042078,0.025036,0.041913,-0.032156,0.012091,-0.068348,0.031396,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
1,004152,00241365.jpg,0.077891,-0.000068,0.046426,0.005960,-0.025793,0.007154,-0.083407,0.012531,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
2,004152,00241343.jpg,0.060470,0.048600,0.014274,0.004930,0.018026,-0.003183,-0.108397,0.041730,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
3,004152,00241385.jpg,-0.074492,-0.009755,0.007401,0.064403,0.118198,-0.024301,-0.043870,-0.002401,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
4,004152,00241402.jpg,0.064944,0.027524,0.026248,0.007190,0.030767,-0.023658,-0.072055,0.037235,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35236,008087,00410202.jpg,-0.061444,0.016501,-0.011848,-0.036570,0.053534,0.041560,-0.017496,-0.007807,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
35237,008087,00410266.jpg,-0.007946,0.015041,0.018337,-0.031757,0.080362,0.130959,0.003852,-0.060979,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
35238,008087,00410247.jpg,-0.035268,0.021119,0.022884,-0.039560,0.066428,0.097488,-0.017552,-0.025954,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
35239,008087,00410245.jpg,-0.035900,0.024961,0.025969,-0.059144,0.045995,0.141877,0.005680,-0.040441,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0


In [13]:
# Persist the embeddings/keys table to disk.
parquet_path = 'data/casia_webface/data.parquet'
df.to_parquet(parquet_path)

In [17]:
# Drop the references to the frame and the raw rows.
# NOTE(review): a later cell rebuilds the frame with pd.DataFrame(data), which
# needs `data` to be defined again before it runs.
del df
del data

In [10]:
# Disabled alternative: dump the raw rows in `data` to a pickle file.
# import pickle

# with open('data/casia_webface/data.pickle', 'wb') as f:
#     pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
# Disabled alternative: restore the raw rows into `data` from the pickle file.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on files produced by this notebook.
# import pickle

# with open('data/casia_webface/data.pickle', 'rb') as f:
#     data = pickle.load(f)

In [None]:
# Rebuild the frame from the raw rows held in `data`.
df = pd.DataFrame(data=data)

In [None]:
# Column labels: two id columns, 512 embedding columns, 256 key columns.
columns = ['name', 'file']
columns += [f'emb_{i}' for i in range(512)]
columns += [f'key_{i}' for i in range(256)]
df.columns = columns

In [None]:
# Write the labelled frame to the dataset's parquet file.
parquet_path = 'data/casia_webface/data.parquet'
df.to_parquet(parquet_path)

### pure_photos

In [5]:
# Build one row per photo of the three hand-picked identity folders:
#   [identity, file, <embedding values...>, <256 key bits...>]
# Photos that cannot be read, contain no detectable face, or make the models
# raise are skipped and recorded in `skipped` instead of being silently dropped.

# Fail fast with a NameError if the models are not defined (an earlier cell
# deletes them); otherwise every photo would be skipped and `data` left empty.
mtcnn, resnet

data = []
skipped = []  # (identity, file, reason) for every photo that produced no row
for identity in tqdm(['000010_done', '000016_done', '000029_done']):
    directory = os.listdir(f'data/good_photos_from_casia_webface/{identity}')

    # One random 256-bit key per identity, shared by all of its photos.
    # NOTE(review): the RNG is not seeded, so the keys differ between runs.
    key = np.random.randint(0, 2, (256,)).astype(np.float16)

    for file in directory:
        face_image = cv2.imread(f'data/good_photos_from_casia_webface/{identity}/{file}')
        if face_image is None:
            # cv2.imread returns None (it does not raise) for files it cannot decode.
            skipped.append((identity, file, 'unreadable image'))
            continue
        face_image = face_image[:, :, [2, 1, 0]]  # BGR -> RGB channel order

        try:
            # Inference only: no autograd graph is needed for the embeddings.
            with torch.no_grad():
                face_image_mtcnn = mtcnn(face_image)
                if face_image_mtcnn is None:
                    # The detector found no face in this photo.
                    skipped.append((identity, file, 'no face detected'))
                    continue
                face_embedding = resnet(face_image_mtcnn.unsqueeze(0))[0].tolist()
        except Exception as exc:
            # Best effort: one bad photo must not abort the whole run, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            skipped.append((identity, file, repr(exc)))
            continue

        data.append([identity, file] + face_embedding + list(key))

print(f'embedded {len(data)} photos, skipped {len(skipped)}')

100%|██████████| 3/3 [00:15<00:00,  5.08s/it]


In [6]:
# Frame of the rows collected for the hand-picked identities.
df = pd.DataFrame(data=data)

In [7]:
# Column labels: two id columns, 512 embedding columns, 256 key columns.
columns = ['name', 'file']
columns.extend(f'emb_{i}' for i in range(512))
columns.extend(f'key_{i}' for i in range(256))
df.columns = columns

In [8]:
# Persist the hand-picked subset to its own parquet file.
pure_parquet_path = 'data/casia_webface/data_pure.parquet'
df.to_parquet(pure_parquet_path)

## кластеризация

In [None]:
# Load the embeddings/keys table for clustering. pd.read_parquet requires a
# path argument; calling it with no arguments raises TypeError.
# NOTE(review): assumes the full table written earlier in this notebook is the
# intended input; switch to 'data/casia_webface/data_pure.parquet' for the
# hand-picked subset.
df = pd.read_parquet('data/casia_webface/data.parquet')