In [1]:
import pandas as pd
import numpy as np
import os
import time
import random
from IPython.display import clear_output

In [18]:
root = '/home/MIBS/facenet'
src_csv = os.path.join(root, 'embedding', 'embedding_db.csv')
target_folder = os.path.join(root, 'embedding', 'similarity')
print('File exists: {}'.format(os.path.exists(src_csv)))

File exists: True


## Calculate similarity

In [3]:
df = pd.read_csv(src_csv, index_col=0)

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
1014231988_231812.jpg,0.979932,1.165446,-0.473118,-1.260066,1.281455,1.235292,0.416252,-1.730684,0.225623,-0.415296,...,-0.662208,1.302117,0.132062,-0.226168,4.22975,0.078171,0.353426,0.178355,-0.249966,1.245588
1016001172_437323.jpg,0.317535,0.756705,0.448584,-0.463596,-0.664316,0.000986,0.072637,-1.550876,-0.694913,1.189381,...,-0.465162,1.695829,0.569423,-0.617782,0.611059,-1.409929,1.304691,2.18219,1.435756,0.112308
1233691482_173195.jpg,0.53499,1.190256,-0.132376,0.132695,-2.168387,-0.814362,1.084031,-2.642098,-1.689783,1.796565,...,0.001601,1.429232,1.29808,-0.644005,1.261172,-1.201658,-0.550647,2.059888,0.134634,-1.056122
80242521_228815.jpg,-0.805889,-0.008396,-1.574383,0.220078,1.359017,0.283905,0.375282,1.105475,0.246123,0.601768,...,-1.495565,-0.446212,1.305392,2.068125,-0.1776,-1.655504,0.347487,-0.944348,-1.652171,0.839394
1014182770_305619.jpg,-0.735095,0.664499,-2.234327,-0.682601,1.611458,-1.040077,0.273068,1.044876,-0.319821,1.355419,...,-0.570562,2.156091,0.493787,1.148258,1.11108,-0.781215,-1.784159,-1.043208,-0.063037,0.789947


In [5]:
df.sort_index(inplace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
1000046113_190136.jpg,1.392769,0.879834,-0.763906,-0.508705,-1.033225,0.290156,1.886441,-1.048308,-1.392442,1.479466,...,0.597451,0.190999,1.907559,0.922785,1.444845,-1.740782,-1.088284,1.870509,0.991341,-1.333938
1000046113_190137.jpg,1.826192,0.653667,-0.478559,-0.217734,-0.800203,0.86273,1.8722,-0.809115,-1.0834,1.541001,...,1.122302,0.055444,1.35187,0.574312,1.364401,-1.884376,-1.386176,1.778188,1.425368,-0.762503
1000046113_190138.jpg,1.76841,0.490102,-0.342407,-0.312462,-0.460354,0.890619,1.795373,-0.549559,-1.011468,1.288292,...,0.852353,-0.104129,1.345418,0.458302,1.500188,-1.843293,-1.590533,1.587266,1.413511,-0.662049
1000046113_190139.jpg,1.914628,0.405558,-0.441098,-0.523989,-0.455395,0.995524,1.436189,-0.323269,-0.878922,1.308451,...,0.844567,0.081813,1.408591,0.603813,1.322429,-1.89542,-1.571432,1.474548,1.458241,-0.84215
1000046113_190140.jpg,1.864757,0.310825,-0.658235,-0.542811,-0.540598,0.635829,1.287658,-0.42232,-1.408446,1.763298,...,0.601927,0.40554,1.381792,0.847335,1.20318,-2.053343,-1.515368,1.766902,1.447661,-0.713273


In [6]:
users = {}
for user_img in list(df.index):
    user = user_img.split('.')[0].split('_')[0]
    users.setdefault(user, []).append(user_img)

In [7]:
len(users.keys())

892

In [8]:
selected_users = []
random_sample = 10
for idx, user_id in enumerate(users.keys()):
    if len(users[user_id]) >= random_sample:
        selected_users.extend(random.sample(users[user_id], random_sample))

In [9]:
len(selected_users)

8860

In [10]:
for selected in selected_users:
    vector = np.array(df.loc[selected])
    print(selected, vector.shape)
    break

1000046113_190517.jpg (128,)


## Calculate similarity between samples

In [11]:
def findCosineSimilarity(source_representation, test_representation):
  a = np.matmul(np.transpose(source_representation), test_representation)
  b = np.sum(np.multiply(source_representation, source_representation))
  c = np.sum(np.multiply(test_representation, test_representation))
  return 1 - (a / (np.sqrt(b) * np.sqrt(c)))

In [None]:
similarity_dict = {}

for idx, selected_1 in enumerate(selected_users):
    print('Processing {}/{}: {}'.format(idx + 1, len(selected_users), selected_1))
    vector_1 = np.array(df.loc[selected_1])
    user_1 = selected_1.split('.')[0].split('_')[0]
    sub_dict = {}
    start_ = time.time()
    for selected_2 in selected_users:
        vector_2 = np.array(df.loc[selected_2])
        user_2 = selected_2.split('.')[0].split('_')[0]
        score = round(findCosineSimilarity(vector_1, vector_2), 6)
        sub_dict[selected_2] = score
    end_ = time.time()
    similarity_dict[selected_1] = sub_dict
    print('Required time {}s'.format(end_ - start_))
    if (idx + 1) % 100 == 0:
        clear_output(wait=True)

Processing 101/8860: 1000722014_448165.jpg
Required time 1.822822093963623s
Processing 102/8860: 1000722014_448457.jpg
Required time 1.9205822944641113s
Processing 103/8860: 1000722014_448280.jpg
Required time 1.965406894683838s
Processing 104/8860: 1000722014_448551.jpg
Required time 2.5086240768432617s
Processing 105/8860: 1000722014_448295.jpg
Required time 2.7963688373565674s
Processing 106/8860: 1000722014_448206.jpg
Required time 2.7865283489227295s
Processing 107/8860: 1000722014_448194.jpg
Required time 2.880100727081299s
Processing 108/8860: 1000722014_448537.jpg
Required time 2.5202362537384033s
Processing 109/8860: 1000722014_448287.jpg
Required time 1.8013970851898193s
Processing 110/8860: 1000722014_448303.jpg
Required time 1.8369369506835938s
Processing 111/8860: 1000797205_253953.jpg
Required time 1.993222951889038s
Processing 112/8860: 1000797205_253991.jpg
Required time 2.148491382598877s
Processing 113/8860: 1000797205_254253.jpg
Required time 2.8331854343414307s
Proc

In [None]:
df_similarity = pd.DataFrame.from_dict(similarity_dict)
df_similarity.head()

In [None]:
df_similarity

In [None]:
df_similarity.to_csv(os.path.join(target_folder, 'similarity_sample.csv'))