In [2]:
import pandas as pd
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm
import glob
import seaborn as sns

In [3]:
# Folder that holds the dataset images, taken from the `file_path` environment
# variable. os.environ[...] raises KeyError when the variable is not set.
dataset_folder_name = os.environ['file_path']

In [4]:
# Integer code -> readable label tables. Each label's position in its list is
# the integer code it is stored under.
dataset_dict = {
    'race_id': dict(enumerate(['white', 'black', 'asian', 'indian', 'others'])),
    'gender_id': dict(enumerate(['male', 'female'])),
}

In [5]:
# Reverse lookups (label -> integer code) derived from the forward tables.
dataset_dict['gender_alias'] = {
    label: code for code, label in dataset_dict['gender_id'].items()
}
dataset_dict['race_alias'] = {
    label: code for code, label in dataset_dict['race_id'].items()
}

In [6]:
def parse_info_from_file(path):
    """Extract ``(age, gender, race)`` labels from an image file name.

    The file name, with its last extension removed, must consist of exactly
    four underscore-separated fields. The first three are read as the integer
    age, gender code and race code; the fourth is ignored. The two codes are
    translated through ``dataset_dict['gender_id']`` and
    ``dataset_dict['race_id']``.

    Args:
        path: Path of the image file; only its final component is inspected.

    Returns:
        ``(age, gender_label, race_label)``, or ``(None, None, None)`` when
        the name does not have four fields, a field is not an integer, or a
        code is absent from the lookup tables.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        age, gender, race, _ = stem.split('_')
        return (
            int(age),
            dataset_dict['gender_id'][int(gender)],
            dataset_dict['race_id'][int(race)],
        )
    except (ValueError, KeyError):
        # ValueError: wrong number of fields or a non-integer field.
        # KeyError: a gender/race code that has no label.
        # Anything else (e.g. a non-path argument) is a programming error and
        # is deliberately allowed to propagate instead of being hidden.
        return None, None, None

In [7]:
def parse_dataset(dataset_path, ext = 'jpg'):
    """Build a table of labels for every ``*.<ext>`` file in a folder.

    Each matching file is passed to ``parse_info_from_file``; rows whose
    labels could not be parsed (any missing value) are dropped.

    Args:
        dataset_path: Folder to search (not recursive).
        ext: File extension to match, without the leading dot.

    Returns:
        A DataFrame with the columns ``age``, ``gender``, ``race`` and
        ``file``. When no file matches, the frame is empty but still has
        these four columns.
    """
    columns = ['age', 'gender', 'race', 'file']
    files = glob.glob(os.path.join(dataset_path, f"*.{ext}"))
    records = [(*parse_info_from_file(file), file) for file in files]
    # Naming the columns in the constructor (rather than assigning
    # `df.columns` afterwards) keeps this valid when `records` is empty.
    df = pd.DataFrame(records, columns=columns)
    return df.dropna()

In [8]:
# Parse the labels of every image in the dataset folder into one table.
info_df = parse_dataset(dataset_folder_name)
# Bare last expression: lets the notebook render the frame as the cell output.
info_df

Unnamed: 0,age,gender,race,file
0,100.0,male,white,D:\age_data\archive\UTKFace\100_0_0_2017011221...
1,100.0,male,white,D:\age_data\archive\UTKFace\100_0_0_2017011221...
2,100.0,female,white,D:\age_data\archive\UTKFace\100_1_0_2017011018...
3,100.0,female,white,D:\age_data\archive\UTKFace\100_1_0_2017011221...
4,100.0,female,white,D:\age_data\archive\UTKFace\100_1_0_2017011221...
...,...,...,...,...
23702,9.0,female,indian,D:\age_data\archive\UTKFace\9_1_3_201612202228...
23703,9.0,female,indian,D:\age_data\archive\UTKFace\9_1_3_201701042229...
23704,9.0,female,others,D:\age_data\archive\UTKFace\9_1_4_201701032006...
23705,9.0,female,others,D:\age_data\archive\UTKFace\9_1_4_201701032008...


In [9]:
# Display the rows from highest to lowest age; the sorted frame is only shown,
# not assigned, so info_df itself keeps its original order.
info_df.sort_values('age', ascending = False)

Unnamed: 0,age,gender,race,file
187,116.0,female,indian,D:\age_data\archive\UTKFace\116_1_3_2017012013...
186,116.0,female,asian,D:\age_data\archive\UTKFace\116_1_2_2017011222...
185,116.0,female,white,D:\age_data\archive\UTKFace\116_1_0_2017012013...
184,116.0,female,white,D:\age_data\archive\UTKFace\116_1_0_2017011221...
183,115.0,female,black,D:\age_data\archive\UTKFace\115_1_1_2017011221...
...,...,...,...,...
2154,1.0,male,others,D:\age_data\archive\UTKFace\1_0_4_201612211949...
2155,1.0,male,others,D:\age_data\archive\UTKFace\1_0_4_201612211950...
2156,1.0,male,others,D:\age_data\archive\UTKFace\1_0_4_201612211950...
2157,1.0,male,others,D:\age_data\archive\UTKFace\1_0_4_201612211950...


In [26]:
# Sweep the Haar cascade's `minNeighbors` setting and count, for each value,
# how many dataset images produce no detection at all.
cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
cascadef = cv2.CascadeClassifier(cascade_path)
file_names = os.listdir(dataset_folder_name)
i_list = np.arange(1, 11)
# One failure counter per candidate value, so the result has the same length
# as i_list and can be plotted against it.
delete_counts = [0] * len(i_list)
correct_count = 0
for file_name in tqdm(file_names):
    file_path = os.path.join(dataset_folder_name, file_name)
    img = cv2.imread(file_path)
    if img is None:
        # cv2.imread returns None (it does not raise) for files it cannot
        # decode; skip them instead of crashing in cvtColor.
        continue
    # Detect on a grayscale copy: swapping BGR -> RGB does not help the
    # detector and makes this cell disagree with the deletion cell, which
    # passes the image exactly as cv2.imread loaded it.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    for idx, i in enumerate(i_list):
        face = cascadef.detectMultiScale(gray, scaleFactor = 1.1, minNeighbors = int(i))
        if len(face) == 0:
            delete_counts[idx] += 1
        else:
            correct_count += 1

fig, ax = plt.subplots()
ax.plot(i_list, delete_counts)
ax.set_xlim(0, 11)
if min(delete_counts) < max(delete_counts):
    # Identical limits would make matplotlib warn and pick its own range.
    ax.set_ylim(min(delete_counts), max(delete_counts))
ax.set_xlabel('minNeighbors')
ax.set_ylabel('images with no detected face')
plt.show()


In [None]:
# Sanity-check the detector on one image and show the detections in a window.
# The path is built from dataset_folder_name so the cell does not depend on a
# machine-specific absolute location.
sample_path = os.path.join(dataset_folder_name, '28_1_2_20170116164716297.jpg.chip.jpg')
img = cv2.imread(sample_path)
if img is None:
    # cv2.imread signals failure by returning None; there is nothing to
    # detect or display, so stop here instead of passing None to OpenCV.
    print("fail")
else:
    face = cascadef.detectMultiScale(img, scaleFactor = 1.1, minNeighbors= 1)
    print(face)
    for (x, y, w, h) in face:
        cv2.rectangle(img, (x, y, w, h), (0, 0, 255), 2)
    cv2.imshow('face', img)
    # Block until a key is pressed, then close the preview window.
    cv2.waitKey()
    cv2.destroyAllWindows()

In [36]:
# Delete every dataset file in which the Haar cascade finds no face, as well
# as files OpenCV cannot decode.
# WARNING: this cell permanently removes files from dataset_folder_name.
cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
cascadef = cv2.CascadeClassifier(cascade_path)
deleted_count = 0

file_names = os.listdir(dataset_folder_name)

images = []

for file_name in file_names:
    file_path = os.path.join(dataset_folder_name, file_name)

    if not os.path.isfile(file_path):
        # os.listdir also returns sub-directories; os.remove cannot delete
        # those, so leave them alone.
        continue

    image = cv2.imread(file_path)

    # cv2.imread reports failure by returning None. (`cv2.error` is an
    # exception class and is never None, so it cannot be used as the check.)
    if image is not None:
        images.append(image)
        face = cascadef.detectMultiScale(image = image, scaleFactor=1.11, minNeighbors=2)
        if len(face) == 0:
            os.remove(file_path)
            deleted_count += 1
            print(f"얼굴 인식 실패 -> 삭제 : {file_name}")

        else:
            print(f"얼굴 인식 완료 : {file_name}")

    else:
        print(f"이미지 파일을 불러오지 못했습니다: {file_name}")
        # Remove via the full path; the bare file name would be resolved
        # against the current working directory instead of the dataset folder.
        os.remove(file_path)

print(f"{len(images)}개의 이미지를 불러왔습니다.")


23707개의 이미지를 불러왔습니다.
