In [2]:
import pandas as pd
from pathlib import Path


In [2]:
# All input and output files live in one dataset directory, addressed relative
# to the notebook's working directory.
dataset_dir = Path('../../../Dataset')

# Inputs: image -> identity mapping, per-image attributes, per-attribute weights.
celeba_identity_path = dataset_dir / 'identity_CelebA.txt'
celeba_attributes_path = dataset_dir / 'list_attr_celeba.txt'
attr_weights_path = dataset_dir / 'attr_weights.txt'
# Output: generated (anchor, positive, negative) triplets.
triplets_path = dataset_dir / 'triplets.txt'


In [3]:
# Space-separated file without a header row: one "<image> <identity>" pair per
# line. Everything is read as strings so identities compare as text later on.
df_identity = pd.read_csv(
    celeba_identity_path,
    sep=" ",
    names=["image", "identity"],
    dtype=str,
)
df_identity.head()


Unnamed: 0,image,identity
0,000001.jpg,2880
1,000002.jpg,2937
2,000003.jpg,8692
3,000004.jpg,5805
4,000005.jpg,9295


In [4]:
# Whitespace-delimited attribute table; every value is kept as a string.
# The separator is a regular expression, so it is written as a raw string:
# "\s" inside a normal string literal is an invalid escape sequence that newer
# Python versions warn about.
df_attributes = pd.read_csv(celeba_attributes_path, sep=r"\s+", dtype=str)
df_attributes.head()

Unnamed: 0,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,Blond_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,-1,...,-1,1,1,-1,1,-1,1,-1,-1,1
000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
000003.jpg,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,1
000004.jpg,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,1,-1,1,-1,1,1,-1,1
000005.jpg,-1,1,1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1


In [5]:
# Whitespace-delimited table of per-attribute weights, read as strings
# (consumers convert with int()). The regex separator is a raw string because
# "\s" in a normal string literal is an invalid escape sequence.
attr_weights = pd.read_csv(attr_weights_path, sep=r"\s+", dtype=str)
attr_weights.head()

Unnamed: 0,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,Blond_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,2,1,1,1,5,2,1,1,5,5,...,2,2,2,2,2,3,2,2,2,3


In [6]:
# Copy the row labels of the attribute table into a regular 'image' column so
# they can be used as the join key. Note that this adds the column to
# df_attributes itself, not only to the merged result.
df_attributes['image'] = df_attributes.index

# Attach the identity of every image, then index the merged table by image
# name again while keeping 'image' available as a column.
df_merged = (
    pd.merge(df_attributes, df_identity, on='image')
    .set_index('image', drop=False)
)

In [7]:
def petr_similarity(first_image, second_image, coef=1):
    """Return the weighted attribute agreement between two images.

    Reads the module-level tables ``df_attributes`` (one row per image) and
    ``attr_weights`` (first row holds one weight per attribute column).

    For every attribute named in ``attr_weights`` the two images' values are
    added: ``abs(a + b) / 2`` is 1 when both values are equal and 0 when they
    are opposite (assuming the +1/-1 encoding of the attribute table). The
    score is ``sum((weight * agreement) ** coef) / sum(weight ** coef)``.

    Attributes are matched to their weights by column name, so extra columns
    in ``df_attributes`` (such as an added ``image`` column) or a different
    column order cannot silently pair a value with the wrong weight.

    Parameters
    ----------
    first_image, second_image
        Row labels of the two images in ``df_attributes``.
    coef
        Exponent applied to every weighted term; converted with ``float``.

    Returns
    -------
    float
        Achieved score divided by the maximum achievable score.

    Raises
    ------
    KeyError
        If an image label or a weighted attribute is missing from
        ``df_attributes``.
    """
    exponent = float(coef)
    attr_names = attr_weights.columns
    weights = attr_weights.iloc[0]
    # Select exactly the weighted attributes, by name, for both images.
    first_attrs = df_attributes.loc[first_image, attr_names]
    second_attrs = df_attributes.loc[second_image, attr_names]

    max_possible_result = 0
    result = 0
    for name in attr_names:
        # Values are stored as strings, hence the int() conversions.
        weight = int(weights[name])
        agreement = abs(int(first_attrs[name]) + int(second_attrs[name])) / 2
        max_possible_result += weight ** exponent
        result += (weight * agreement) ** exponent
    return result / max_possible_result

In [8]:
def create_n_triplets(df_identity, n, similarity_threshold=0.9, attempt_limit=10):
    """Build up to ``n`` (anchor, positive, negative) image-path triplets.

    For every triplet:

    1. An identity is drawn from ``df_identity['identity']`` (one draw per
       row, so identities with more rows are drawn more often).
    2. Two different images of that identity are drawn from the module-level
       ``df_merged`` table as anchor and positive.
    3. The negative is an image of another identity that has the same value
       as the anchor for a fixed list of attributes. Up to ``attempt_limit``
       candidates are drawn at random; the search stops early as soon as
       ``petr_similarity(anchor, candidate)`` exceeds
       ``similarity_threshold``, otherwise the most similar candidate seen is
       used and a "limit reached" line is printed.

    Relies on the module-level ``df_merged`` table and ``petr_similarity``.

    Parameters
    ----------
    df_identity
        Table with an ``identity`` column to draw anchor identities from.
    n
        Number of triplets to attempt.
    similarity_threshold
        Similarity above which a negative candidate is accepted immediately.
    attempt_limit
        Maximum number of negative candidates drawn per triplet (at least one
        candidate is always drawn).

    Returns
    -------
    pandas.DataFrame
        Columns ``anchor``, ``positive``, ``negative`` holding image paths.
        It can contain fewer than ``n`` rows: an attempt that cannot be
        completed is reported with an ``error: ...`` line and skipped.
    """
    image_dir = '../../../Dataset/img_celeba_cropped/'
    # Attributes on which the negative must agree with the anchor.
    match_attrs = (
        'Attractive', 'Bald', 'Black_Hair', 'Blond_Hair', 'Brown_Hair',
        'Heavy_Makeup', 'Mustache', 'No_Beard', 'Receding_Hairline',
        'Wearing_Hat', 'Young',
    )

    triplets = []
    for _ in range(n):
        try:
            # str() keeps the comparison below textual even if the caller's
            # identity column is not stored as strings.
            anchor_id = str(df_identity['identity'].sample(1).iloc[0])

            same_identity = df_merged[df_merged['identity'] == anchor_id]
            if len(same_identity) < 2:
                # Retrying cannot help here: there is no second image to pick.
                print('error: identity ' + anchor_id + ' has fewer than two images')
                continue
            # Sampling two rows without replacement guarantees anchor and
            # positive are different images.
            pair = same_identity.sample(2)
            anchor, positive = pair.iloc[0], pair.iloc[1]

            # The candidate pool depends only on the anchor, so it is built
            # once instead of once per attempt.
            candidate_mask = df_merged['identity'] != anchor_id
            for attr in match_attrs:
                candidate_mask &= df_merged[attr] == anchor[attr]
            candidates = df_merged[candidate_mask]
            if candidates.empty:
                print('error: no negative candidate for ' + anchor['image'])
                continue

            best_negative = None
            best_similarity = 0
            for _attempt in range(max(attempt_limit, 1)):
                negative = candidates.sample(1).iloc[0]
                similarity = petr_similarity(anchor['image'], negative['image'])

                # Remember the most similar candidate seen so far.
                if best_negative is None or similarity >= best_similarity:
                    best_negative = negative
                    best_similarity = similarity

                # Similar enough: stop searching.
                if similarity > similarity_threshold:
                    break
            else:
                # All attempts used up; fall back to the best candidate.
                print('limit reached, similarity: ' + str(best_similarity))

            triplets.append([
                image_dir + anchor['image'],
                image_dir + positive['image'],
                image_dir + best_negative['image'],
            ])
            print(len(triplets))
        except Exception as err:
            # Best effort: report the cause and move on to the next triplet.
            # KeyboardInterrupt/SystemExit are not caught, so a long run can
            # still be interrupted.
            print('error: ' + repr(err))
    return pd.DataFrame(triplets, columns=['anchor', 'positive', 'negative'])

In [None]:
# Generate the triplets in batches of 100 and append every batch to the
# triplets file right away, so finished batches are already on disk while the
# run continues. Rows are appended without a header line.
count = 500
for i in range(count):
    print()
    print(f'Batch {i}/{count}')
    triplets_new = create_n_triplets(df_identity, 100)
    triplets_new.to_csv(
        triplets_path,
        mode='a',
        header=False,
        index=False,
    )


Batch 0/500
1
2
3
limit reached, similarity: 0.8627450980392157
4
limit reached, similarity: 0.8725490196078431
5
limit reached, similarity: 0.8725490196078431
6
7
8
9
10
11
12
13
14
15
16
17
18
19
error
20
21
22
23
24
25
26
27
28
error
29
30
31
32
33
34


In [1]:
# The generation loop appends its batches with header=False, so the file has
# no header row. Name the columns explicitly; otherwise the first triplet is
# consumed as the header and the 'anchor'/'positive'/'negative' lookups fail.
# NOTE(review): if the file was created elsewhere with a header line, drop
# `names` again.
triplets = pd.read_csv(triplets_path, names=['anchor', 'positive', 'negative'])
triplets

NameError: name 'pd' is not defined

In [None]:
# Upper bound on the number of triplets that are turned into pairs; fewer are
# used when the triplet table is shorter.
max_triplets = 20000

# Preview: the two labelled pairs derived from the first triplet
# (1 = same identity, 0 = different identity).
first_triplet = triplets.iloc[0]
positive_pair = [first_triplet['anchor'], first_triplet['positive'], 1]
negative_pair = [first_triplet['anchor'], first_triplet['negative'], 0]
print(positive_pair)

# Every triplet yields two consecutive rows: (anchor, positive, 1) followed by
# (anchor, negative, 0). The rows are collected in a list and the frame is
# built once, which avoids growing a DataFrame row by row.
pair_rows = []
for triplet in triplets.head(max_triplets).itertuples(index=False):
    pair_rows.append([triplet.anchor, triplet.positive, 1])
    pair_rows.append([triplet.anchor, triplet.negative, 0])
pairs = pd.DataFrame(pair_rows, columns=['anchor', 'second', 'same'])
pairs
    

In [None]:
# Write the pair table next to the other dataset files, without the index column.
pairs_path = Path('../../../Dataset/pairs.txt')
pairs.to_csv(pairs_path, index=False)