In [None]:
import os
import numpy as np

from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR, UTIL


In [None]:
# Load the joined face metadata table for the "aihub_family" dataset.
# NOTE(review): presumably a DataFrame indexed by uuid (later cells call
# face.loc[...] with the split's uuids) — confirm.
face = join_face_df(DTFR, "aihub_family")  # 16.1s

In [None]:
# Choose the split to build pairs for and fetch its uuid list.
TASK_CATEGORY = "train"
valid_uuids = read_split(TASK_CATEGORY)

# Keep only rows of that split. The first reset_index() turns the current
# index into a column, the second adds a positional "index" column (0..n-1),
# and "uuid" is then restored as the index.
x_valid = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)
x_valid.keys()

In [None]:
# Sample group
# x_valid.groupby('target').index.apply(list).to_frame().head()
# x_valid.groupby(['family_id', 'personal_id', 'category']).index.apply(list).to_frame().head(10)


In [None]:
# Generate the "Age" pairs file: for every fold, NUM_PAIRS matched pairs
# (two images of one target) followed by NUM_PAIRS mismatched pairs
# (one image from each of two different targets).
np.random.seed(22)
NUM_FOLDS = 10
NUM_PAIRS = 300
CATEGORY = "Age"
target_pair = f"pairs/{TASK_CATEGORY}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)

# Rows of the requested category, collapsed to one row per target whose
# "index" cell is the list of positional row numbers for that target.
is_family = x_valid.category == CATEGORY
family_valid = x_valid[is_family]
idx_family_valid = family_valid.groupby("target")["index"].apply(list).to_frame()

# A matched pair needs two distinct images, so targets with a single image
# cannot be used (np.random.choice(size=2, replace=False) would raise).
# When every target has >= 2 images this is the same frame as before.
matched_pool = idx_family_valid[idx_family_valid["index"].map(len) >= 2]

with open(target_pair, "w") as f:
    # Header: number of folds and number of pairs per fold.
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for n in range(NUM_FOLDS):
        # matched: NUM_PAIRS targets per fold, two distinct images each
        matched_sample = matched_pool.sample(
            n=NUM_PAIRS, replace=False, random_state=n
        ).sort_values("target")
        for key, value in matched_sample.iterrows():
            idxs = value.loc["index"]
            selected = np.random.choice(idxs, size=2, replace=False)
            f.write(
                f"{key:8s}\t{valid_uuids[selected[0]]}\t{valid_uuids[selected[1]]}\n"
            )

        # mismatched: two different targets per pair, one image from each
        for i in range(NUM_PAIRS):
            mismatched_sample = idx_family_valid.sample(
                n=2, replace=False, random_state=n * 1000 + i
            ).sort_values("target")
            (target_a, idxs_a), (target_b, idxs_b) = [
                (key, value.loc["index"])
                for key, value in mismatched_sample.iterrows()
            ]
            # Same two draws, in the same order, as before; the positions are
            # kept as integers instead of round-tripping through a str array.
            idx_a = np.random.choice(idxs_a, replace=False)
            idx_b = np.random.choice(idxs_b, replace=False)
            uuid_a, uuid_b = valid_uuids[int(idx_a)], valid_uuids[int(idx_b)]
            f.write(f"{target_a:<8}\t{uuid_a}\t{target_b:<8}\t{uuid_b}\n")

# Get New Pairs

In [None]:
import pandas as pd
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR
import itertools

In [None]:
# Load the joined face metadata table for the "aihub_family" dataset
# (same call as in the first section of the notebook).
face = join_face_df(DTFR, "aihub_family")

In [None]:
# Work on the "valid" split from here on.
TASK_CATEGORY = "valid"
valid_uuids = read_split(TASK_CATEGORY)

# Subset the face table to the split and add a positional "index" column
# (second reset_index) while keeping "uuid" as the index.
x_valid = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)

In [None]:
# One row per (family_id, age_group, gender, personal_id); the "index" cell is
# the list of positional row numbers belonging to that person.
df = x_valid.groupby(["family_id", "age_group", "gender", "personal_id"]).agg(list)[
    ["index"]
]

# Collect cross-person image pairs inside every (family_id, age_group, gender) group.
index_pairs = []
for _, group in df.groupby(["family_id", "age_group", "gender"]):
    index_list = group["index"].tolist()

    # At least two people are required to form a pair.
    if len(index_list) < 2:
        continue

    group_key = group.index[0][:3]
    index_pairs.extend(
        [*group_key, pair]
        for person_a, person_b in itertools.combinations(index_list, 2)
        for pair in itertools.product(person_a, person_b)
    )

# Show the resulting pairs as a table (cell output).
pd.DataFrame(index_pairs, columns=["family_id", "age_group", "gender", "pairs"])

## CASE1: 가족 관계에 있는 얼굴쌍 (face pairs within the same family)

In [None]:
import os
import numpy as np
import pandas as pd
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR
import itertools

In [None]:
# Load the joined face metadata table for the "aihub_family" dataset.
face = join_face_df(DTFR, "aihub_family")

In [None]:
# Seed NumPy's global RNG and fix the two numbers written to the header line
# of every pairs file below.
np.random.seed(22)
NUM_FOLDS = 10
NUM_PAIRS = 300

In [None]:
# Output location for the CASE1 pairs file, built on the "test" split.
CATEGORY = "CASE1"
TASK_CATEGORY = "test"
target_pair = f"pairs/{TASK_CATEGORY}/pairs_{CATEGORY}.txt"
# Create the output directory if it does not exist yet, then echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# Rows of the selected split, with a positional "index" column added by the
# second reset_index() and "uuid" kept as the index.
valid_uuids = read_split(TASK_CATEGORY)
x_valid = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)

In [None]:
# One row per (family_id, personal_id); the "index" cell holds the list of
# positional row numbers (x_valid's "index" column) for that person.
df = x_valid.groupby(["family_id", "personal_id"]).agg(list)[["index"]]

In [None]:
# Total number of pairs needed per kind: the next cell cuts the selection into
# chunks of NUM_PAIRS and writes one chunk per fold.
n_selected = NUM_FOLDS * NUM_PAIRS

# Matched candidates: every unordered pair of images of the same person
# (one row of df), tagged with the family id.
total_matched_list = [
    [index[0], candidate]
    for index, value in df.itertuples()
    for candidate in itertools.combinations(value, 2)
]
total_matched_pairs = pd.DataFrame(total_matched_list, columns=["family_id", "pairs"])
selected_matched_pairs = total_matched_pairs.sample(
    n=n_selected, replace=False, random_state=22
)

# Mismatched candidates: image pairs of two different people of one family.
total_mismatched_list = []
for _, group in df.groupby(["family_id"]):
    index_list = group["index"].tolist()

    # Only families with at least two people can yield such a pair.
    if len(index_list) >= 2:
        for candidate in itertools.combinations(index_list, 2):
            for pair in itertools.product(*candidate):
                total_mismatched_list.append([*group.index[0][:1], pair])

# Collect the candidates in a DataFrame and draw the selection without replacement.
total_mismatched_pairs = pd.DataFrame(
    total_mismatched_list, columns=["family_id", "pairs"]
)
selected_mismatched_pairs = total_mismatched_pairs.sample(
    n=n_selected, replace=False, random_state=22
)

In [None]:
# Cut both selections into consecutive chunks of NUM_PAIRS rows (one per fold).
dfs_matched = [
    selected_matched_pairs.iloc[start : start + NUM_PAIRS]
    for start in range(0, len(selected_matched_pairs), NUM_PAIRS)
]
dfs_mismatched = [
    selected_mismatched_pairs.iloc[start : start + NUM_PAIRS]
    for start in range(0, len(selected_mismatched_pairs), NUM_PAIRS)
]

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for df_matched, df_mismatched in zip(dfs_matched, dfs_mismatched):
        # Matched block of the fold: "target<TAB>uuid1<TAB>uuid2".
        for idx1, idx2 in df_matched["pairs"]:
            first, second = x_valid.iloc[idx1], x_valid.iloc[idx2]
            target = first.target
            assert target == second.target
            name1 = first.name
            name2 = second.name
            f.write(f"{target:8s}\t{name1}\t{name2}\n")

        # Mismatched block: two different targets of the same family.
        for idx1, idx2 in df_mismatched["pairs"]:
            first, second = x_valid.iloc[idx1], x_valid.iloc[idx2]
            target1 = first.target
            target2 = second.target
            assert first.family_id == second.family_id
            assert target1 != target2
            name1 = first.name
            name2 = second.name
            f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

## CASE1C

In [None]:
from itertools import combinations
import numpy as np

In [None]:
# Output location for the CASE1C pairs file. DIR_CATEGORY names the output
# sub-directory; TASK_CATEGORY is set alongside it but not used in this cell.
CATEGORY = "CASE1C"
DIR_CATEGORY = "test"
TASK_CATEGORY = "test"

target_pair = f"pairs/{DIR_CATEGORY}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
np.random.seed(22)


def get_target_and_name(family_id, df=x_valid):
    """Pick one random row of ``df`` belonging to ``family_id``.

    Returns the row's ``target`` value and its index label.
    """
    row = df[df.family_id == family_id].sample()
    target, name = row.target.item(), row.iloc[0].name
    return target, name


# Draw 6000 family-id pairs (with replacement) from all unordered pairs of
# distinct family ids.
unique_family_id = x_valid.family_id.unique()
unique_family_id_total_pairs = list(combinations(unique_family_id, 2))
index_list = np.random.choice(len(unique_family_id_total_pairs), 6000, replace=True)
selected_family_pairs = np.array(unique_family_id_total_pairs)[index_list]

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    # One line per family pair: a random image from each of the two families.
    for fid1, fid2 in selected_family_pairs:
        target1, name1 = get_target_and_name(fid1)
        target2, name2 = get_target_and_name(fid2)
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

`pairs/test/pairs_CASE1C.txt`
```markdown
10 300
F0836-M 	7649eabb-da97-4bd6-9e66-adeee88c69cc	F0900-D 	ed80801a-5c34-4a6a-bb8f-20deb0ca9ca4
F0891-S 	749c628a-cb31-4179-9b26-19ccfa5f6018	F0895-D 	e3cfbe54-71d4-461c-861e-bb058ed594c9
F0804-D 	9d5d61c5-f8d9-485c-a912-f156e27ebc3f	F0867-S3	f94cd228-762b-4e0c-a3ce-a50eff6cb049
F0873-M 	6dc0dfb0-52c6-40ee-9f44-fb7a546560fc	F0889-D 	326b1e97-d796-4af9-982d-7ec73bff9c10
```

## CASE2: same person, same age group

In [None]:
import os
import itertools
import pandas as pd
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR

In [None]:
# Load the joined face metadata table for the "aihub_family" dataset.
face = join_face_df(DTFR, "aihub_family")


In [None]:
# Header values and output location for the CASE2 pairs file. The file goes
# to the "temp" directory while the data comes from the "test" split.
NUM_FOLDS = 10
NUM_PAIRS = 300
CATEGORY = "CASE2"
DIR_CATEGORY = "temp"
TASK_CATEGORY = "test"

target_pair = f"pairs/{DIR_CATEGORY}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# uuids of the split named by TASK_CATEGORY.
valid_uuids = read_split(TASK_CATEGORY)

In [None]:
# Split rows with a positional "index" column; "uuid" stays the index.
x_valid = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)
# One row per (target, age_group) holding the list of positional row numbers.
df = x_valid.groupby(["target", "age_group"]).agg(list)[["index"]]

In [None]:
# Candidate pairs: every unordered pair of images that share both target and
# age group (one row of df). The original note "52310" is presumably the
# number of candidates seen in an earlier run — TODO confirm.
total_same_age_list = [
    [*index, candidate]
    for index, value in df.itertuples()
    for candidate in itertools.combinations(value, 2)
]

# df is keyed by (target, age_group), so the first column is the target,
# not a family id; label it accordingly.
total_same_age_pairs = pd.DataFrame(
    total_same_age_list, columns=["target", "age_group", "pairs"]
)
selected_same_age_pairs = total_same_age_pairs.sample(
    n=6000, replace=False, random_state=22
)

In [None]:
# Write the selected pairs: same target and same age group, three
# tab-separated columns per line.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for idx1, idx2 in selected_same_age_pairs["pairs"]:
        first, second = x_valid.iloc[idx1], x_valid.iloc[idx2]
        target = first.target
        age_group = first.age_group
        assert age_group == second.age_group
        name1 = first.name
        name2 = second.name
        f.write(f"{target:8s}\t{name1}\t{name2}\n")


## CASE2C: same person, different age groups

In [None]:
# Header values and output location for the CASE2C pairs file (written to
# the "temp" directory, data from the "test" split).
NUM_FOLDS = 10
NUM_PAIRS = 300
CATEGORY = "CASE2C"
DIR_CATEGORY = "temp"
TASK_CATEGORY = "test"

target_pair = f"pairs/{DIR_CATEGORY}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# Candidate pairs: for every target, combine images taken from two different
# (target, age_group) rows of df.
total_mismatched_list = []

for _, group in df.groupby(["target"]):
    index_list = group["index"].tolist()
    # A target needs at least two age-group rows to yield a pair.
    if len(index_list) < 2:
        continue
    key_prefix = group.index[0][:1]
    for rows_a, rows_b in itertools.combinations(index_list, 2):
        total_mismatched_list.extend(
            [*key_prefix, pair] for pair in itertools.product(rows_a, rows_b)
        )

In [None]:
# Collect the candidate pairs in a DataFrame and draw 6000 of them without
# replacement, using a fixed random_state.
total_mismatched_pairs = pd.DataFrame(
    total_mismatched_list, columns=["target", "pairs"]
)
selected_mismatched_pairs = total_mismatched_pairs.sample(
    n=6000, replace=False, random_state=22
)

In [None]:
# Write the selected pairs: same target, different age groups.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for idx1, idx2 in selected_mismatched_pairs["pairs"]:
        first, second = x_valid.iloc[idx1], x_valid.iloc[idx2]
        target1 = first.target
        target2 = second.target
        assert first.age_group != second.age_group
        assert target1 == target2
        name1 = first.name
        name2 = second.name
        f.write(f"{target1:8s}\t{name1}\t{name2}\n")

## Basic feature

In [None]:
import os
import random
from tqdm import tqdm
from itertools import combinations
import pandas as pd
import numpy as np
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR

In [None]:
# Values written to the header line of every pairs file in this section.
NUM_FOLDS = 10
NUM_PAIRS = 300

### Gender

In [None]:
# Test-split rows with a positional "index" column; "uuid" stays the index.
face = join_face_df(DTFR, "aihub_family")
valid_uuids = read_split("test")
x_test = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)
# One row per gender holding the list of positional row numbers.
df = x_test.groupby(["gender"]).agg(list)[["index"]]

#### BASIC-G

In [None]:
# Output location for the BASIC-G pairs file.
CATEGORY = "BASIC-G"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# BASIC-G: 6000 pairs of two different targets that share a gender.
np.random.seed(22)
# random.sample below draws from the stdlib RNG, which np.random.seed does
# not touch; seed it as well so the file is reproducible.
random.seed(22)
unique_group = ["Male", "Female"]
index_list = np.random.choice(len(unique_group), 6000, replace=True)
selected_group = np.array(unique_group)[index_list]

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for group in tqdm(selected_group):
        pool = df.loc[group]["index"]  # positional row numbers of this gender
        # Redraw until the two images belong to different targets.
        while True:
            pid1, pid2 = random.sample(pool, 2)
            row1, row2 = x_test.iloc[pid1], x_test.iloc[pid2]
            target1, name1 = row1.target, row1.name
            target2, name2 = row2.target, row2.name
            if target1 != target2:
                break
        group1, group2 = row1.gender, row2.gender
        assert target1 != target2
        assert group1 == group2
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")


#### BASIC-GC

In [None]:
# Output location for the BASIC-GC pairs file.
CATEGORY = "BASIC-GC"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# BASIC-GC: 6000 pairs of different targets with different genders.
np.random.seed(22)
unique_group_total_pairs = list(combinations(["Male", "Female"], 2))
index_list = np.random.choice(len(unique_group_total_pairs), 6000, replace=True)
selected_group_pairs = np.array(unique_group_total_pairs)[index_list]


def get_target_and_name(group, df=x_test):
    """Pick one random row of ``df`` whose gender equals ``group``.

    Returns the row's ``target`` value and its index label.
    """
    row = df[df.gender == group].sample()
    target, name = row.target.item(), row.iloc[0].name
    return target, name


def get_pairs(group):
    """Sample one image for each of the two genders in ``group``."""
    group1, group2 = group
    target1, name1 = get_target_and_name(group1)
    target2, name2 = get_target_and_name(group2)
    return target1, name1, target2, name2


with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for group in tqdm(selected_group_pairs):
        group1, group2 = group
        # Redraw until the two images belong to different targets.
        while True:
            target1, name1, target2, name2 = get_pairs(group)
            if target1 != target2:
                break
        assert target1 != target2
        assert group1 != group2

        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

### Age

In [None]:
# Test-split rows with a positional "index" column; "uuid" stays the index.
face = join_face_df(DTFR, "aihub_family")
valid_uuids = read_split("test")
x_test = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)
# One row per age group holding the list of positional row numbers; the
# "above" group is excluded.
df = (
    x_test.groupby(["age_group"])
    .agg(list)[["index"]]
    .loc[lambda grouped: grouped.index != "above"]
)

#### BASIC-A

In [None]:
# Output location for the BASIC-A pairs file.
CATEGORY = "BASIC-A"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# BASIC-A: 6000 pairs of two different targets that share an age group.
np.random.seed(22)
# random.sample below draws from the stdlib RNG, which np.random.seed does
# not touch; seed it as well so the file is reproducible.
random.seed(22)
unique_age_group = list("abcdefgh")
index_list = np.random.choice(len(unique_age_group), 6000, replace=True)
selected_age_group = np.array(unique_age_group)[index_list]

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for age_group in tqdm(selected_age_group):
        pool = df.loc[age_group]["index"]  # positional row numbers of this age group
        # Redraw until the two images belong to different targets.
        while True:
            pid1, pid2 = random.sample(pool, 2)
            row1, row2 = x_test.iloc[pid1], x_test.iloc[pid2]
            target1, name1 = row1.target, row1.name
            target2, name2 = row2.target, row2.name
            if target1 != target2:
                break
        age_group1, age_group2 = row1.age_group, row2.age_group
        assert target1 != target2
        assert age_group1 == age_group2
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

#### BASIC-AC

In [None]:
# Output location for the BASIC-AC pairs file.
CATEGORY = "BASIC-AC"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# BASIC-AC: 6000 pairs of different targets from two different age groups.
np.random.seed(22)
unique_age_group = [_ for _ in "abcdefgh"]
unique_age_group_total_pairs = list(combinations(unique_age_group, 2))
index_list = np.random.choice(len(unique_age_group_total_pairs), 6000, replace=True)
selected_age_group_pairs = np.array(unique_age_group_total_pairs)[index_list]


def get_target_and_name(age_group, df=x_test):
    """Pick one random row of ``df`` whose age group equals ``age_group``.

    Returns the row's ``target`` value and its index label.
    """
    row = df[df.age_group == age_group].sample()
    target, name = row.target.item(), row.iloc[0].name
    return target, name


def get_pairs(age_group):
    """Sample one image for each of the two age groups in ``age_group``."""
    age_group1, age_group2 = age_group
    target1, name1 = get_target_and_name(age_group1)
    target2, name2 = get_target_and_name(age_group2)
    return target1, name1, target2, name2


with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for age_groups in tqdm(selected_age_group_pairs):
        age_group1, age_group2 = age_groups
        # Redraw until the two images belong to different targets.
        while True:
            target1, name1, target2, name2 = get_pairs(age_groups)
            if target1 != target2:
                break
        assert target1 != target2
        assert age_group1 != age_group2

        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

### Family

In [None]:
# Test-split rows with a positional "index" column; "uuid" stays the index.
face = join_face_df(DTFR, "aihub_family")
valid_uuids = read_split("test")
x_test = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)
# One row per family holding the list of positional row numbers.
df = x_test.groupby(["family_id"]).agg(list)[["index"]]

#### BASIC-F

In [None]:
# Output location for the BASIC-F pairs file.
CATEGORY = "BASIC-F"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# Draw 6000 family ids (with replacement) from the families present in x_test.
np.random.seed(22)
unique_group = x_test.family_id.unique()
index_list = np.random.choice(len(unique_group), 6000, replace=True)
selected_group = np.array(unique_group)[index_list]

In [None]:
# BASIC-F: pairs of two different targets from the same family.
# random.sample below draws from the stdlib RNG, which np.random.seed does
# not touch; seed it so the file is reproducible.
random.seed(22)

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for group in tqdm(selected_group):
        pool = df.loc[group]["index"]  # positional row numbers of this family
        # Redraw until the two images belong to different targets.
        # NOTE: random.sample raises ValueError if the family has fewer than
        # two images, and this loop never ends if all of its images share a
        # single target.
        while True:
            pid1, pid2 = random.sample(pool, 2)
            row1, row2 = x_test.iloc[pid1], x_test.iloc[pid2]
            target1, name1 = row1.target, row1.name
            target2, name2 = row2.target, row2.name
            if target1 != target2:
                break
        group1, group2 = row1.family_id, row2.family_id
        assert target1 != target2
        assert group1 == group2
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")


#### BASIC-FC

In [None]:
# Output location for the BASIC-FC pairs file.
CATEGORY = "BASIC-FC"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# BASIC-FC: 6000 pairs of different targets from two different families.
# `unique_group` is the family-id array built in the BASIC-F cells.
np.random.seed(22)
unique_group_total_pairs = list(combinations(unique_group, 2))
index_list = np.random.choice(len(unique_group_total_pairs), 6000, replace=True)
selected_group_pairs = np.array(unique_group_total_pairs)[index_list]


def get_target_and_name(group, df=x_test):
    """Pick one random row of ``df`` whose family id equals ``group``.

    Returns the row's ``target`` value and its index label.
    """
    row = df[df.family_id == group].sample()
    target, name = row.target.item(), row.iloc[0].name
    return target, name


def get_pairs(group):
    """Sample one image for each of the two families in ``group``."""
    group1, group2 = group
    target1, name1 = get_target_and_name(group1)
    target2, name2 = get_target_and_name(group2)
    return target1, name1, target2, name2


with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for group in tqdm(selected_group_pairs):
        group1, group2 = group
        # Redraw until the two images belong to different targets.
        while True:
            target1, name1, target2, name2 = get_pairs(group)
            if target1 != target2:
                break
        assert target1 != target2
        assert group1 != group2

        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

## Under Family

In [None]:
import os
import random
import itertools
from tqdm import tqdm
from itertools import combinations
import pandas as pd
import numpy as np
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR

In [None]:
# Values written to the header line of every pairs file in this section.
NUM_FOLDS = 10
NUM_PAIRS = 300

In [None]:
# Test-split rows without the "above" age group. The positional "index"
# column is added after filtering, so it numbers the remaining rows 0..n-1.
face = join_face_df(DTFR, "aihub_family")
valid_uuids = read_split("test")
x_test = (
    face.loc[valid_uuids]
    .loc[lambda rows: rows.age_group != "above"]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)

### Age

In [None]:
# One row per (age_group, family_id, personal_id); the "index" cell holds the
# list of positional row numbers in x_test for that combination.
df = x_test.groupby(["age_group", "family_id", "personal_id"]).agg(list)[['index']]

In [None]:
# Candidate pairs: images of two different people who share age group and family.
candidates = []
for index, group in df.groupby(["age_group", "family_id"]):
    # Fewer than two people in the group: nothing to pair.
    if len(group) < 2:
        continue
    personal_ids = group.loc[index].index
    for pid1, pid2 in itertools.combinations(personal_ids, 2):
        index_list1 = group.loc[(*index, pid1)].item()
        index_list2 = group.loc[(*index, pid2)].item()
        # Every image of the first person against every image of the second.
        candidates.extend(itertools.product(index_list1, index_list2))

In [None]:
# random.sample uses the stdlib RNG, which has not been seeded anywhere
# before this point; seed it so the 6000 selected pairs are reproducible.
random.seed(22)
selected_pairs = random.sample(candidates, 6000)

#### FAMILY-A

In [None]:
# Output location for the FAMILY-A pairs file.
CATEGORY = "FAMILY-A"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# FAMILY-A: different targets sharing both age group and family.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for pid1, pid2 in tqdm(selected_pairs):
        row1 = x_test.iloc[pid1]
        row2 = x_test.iloc[pid2]
        target1, name1 = row1.target, row1.name
        target2, name2 = row2.target, row2.name
        # Sanity checks on the invariants the candidate construction promises.
        assert target1 != target2
        assert row1.age_group == row2.age_group
        assert row1.family_id == row2.family_id
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

#### FAMILY-CA

In [None]:
# Output location for the FAMILY-CA pairs file.
CATEGORY = "FAMILY-CA"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# FAMILY-CA: same age group, different families.
# One row per (age_group, family_id) holding the list of positional row numbers.
df = x_test.groupby(["age_group", "family_id"]).agg(list)[["index"]]

np.random.seed(22)
random.seed(22)

# Candidate key pairs: two different (age_group, family_id) rows of one age group.
candidates = []
for _, group in df.groupby("age_group"):
    candidates.extend(itertools.combinations(list(group.index), 2))
selected_candidates = random.sample(candidates, 6000)

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for dfidx1, dfidx2 in tqdm(selected_candidates):
        # One random image per key; random.sample(..., 1) returns a
        # one-element list, so iloc yields a one-row DataFrame.
        idx1 = random.sample(df.loc[dfidx1].item(), 1)
        idx2 = random.sample(df.loc[dfidx2].item(), 1)
        row1, row2 = x_test.iloc[idx1], x_test.iloc[idx2]
        target1, name1 = row1.target.item(), row1.index.item()
        target2, name2 = row2.target.item(), row2.index.item()
        assert target1 != target2
        assert row1.age_group.item() == row2.age_group.item()
        assert row1.family_id.item() != row2.family_id.item()
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

### Gender

In [None]:
# One row per (gender, family_id, personal_id); the "index" cell holds the
# list of positional row numbers in x_test for that combination.
df = x_test.groupby(["gender", "family_id", "personal_id"]).agg(list)[['index']]
df.head()

In [None]:
# Candidate pairs: images of two different people who share gender and family.
candidates = []
for index, group in df.groupby(["gender", "family_id"]):
    # Fewer than two people in the group: nothing to pair.
    if len(group) < 2:
        continue
    personal_ids = group.loc[index].index
    for pid1, pid2 in itertools.combinations(personal_ids, 2):
        index_list1 = group.loc[(*index, pid1)].item()
        index_list2 = group.loc[(*index, pid2)].item()
        # Every image of the first person against every image of the second.
        candidates.extend(itertools.product(index_list1, index_list2))

In [None]:
# Draw 6000 candidate pairs without replacement using the stdlib RNG.
# NOTE(review): no seed is set in this cell, so the draw depends on the RNG
# state left behind by the cells executed before it.
selected_pairs = random.sample(candidates, 6000)

#### FAMILY-G

In [None]:
# Output location for the FAMILY-G pairs file.
CATEGORY = "FAMILY-G"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# FAMILY-G: different targets sharing both gender and family.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for pid1, pid2 in tqdm(selected_pairs):
        row1 = x_test.iloc[pid1]
        row2 = x_test.iloc[pid2]
        target1, name1 = row1.target, row1.name
        target2, name2 = row2.target, row2.name
        # Sanity checks on the invariants the candidate construction promises.
        assert target1 != target2
        assert row1.gender == row2.gender
        assert row1.family_id == row2.family_id
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

#### FAMILY-CG

In [None]:
# Output location for the FAMILY-CG pairs file.
CATEGORY = "FAMILY-CG"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# FAMILY-CG: same gender, different families.
# One row per (gender, family_id) holding the list of positional row numbers.
df = x_test.groupby(["gender", "family_id"]).agg(list)[["index"]]

np.random.seed(22)
random.seed(22)

# Candidate key pairs: two different (gender, family_id) rows of one gender.
candidates = []
for _, group in df.groupby("gender"):
    candidates.extend(itertools.combinations(list(group.index), 2))
selected_candidates = random.sample(candidates, 6000)

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for dfidx1, dfidx2 in tqdm(selected_candidates):
        # One random image per key; random.sample(..., 1) returns a
        # one-element list, so iloc yields a one-row DataFrame.
        idx1 = random.sample(df.loc[dfidx1].item(), 1)
        idx2 = random.sample(df.loc[dfidx2].item(), 1)
        row1, row2 = x_test.iloc[idx1], x_test.iloc[idx2]
        target1, name1 = row1.target.item(), row1.index.item()
        target2, name2 = row2.target.item(), row2.index.item()
        assert target1 != target2
        assert row1.gender.item() == row2.gender.item()
        assert row1.family_id.item() != row2.family_id.item()
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

### Age/Gender

In [None]:
# One row per (gender, age_group, family_id, personal_id); the "index" cell
# holds the list of positional row numbers in x_test for that combination.
df = x_test.groupby(["gender", "age_group", "family_id", "personal_id"]).agg(list)[['index']]
df.head()

In [None]:
# Candidate pairs: images of two different people who share gender, age group
# and family.
candidates = []
for index, group in df.groupby(["gender", "age_group", "family_id"]):
    # Fewer than two people in the group: nothing to pair.
    if len(group) < 2:
        continue
    personal_ids = group.loc[index].index
    for pid1, pid2 in itertools.combinations(personal_ids, 2):
        index_list1 = group.loc[(*index, pid1)].item()
        index_list2 = group.loc[(*index, pid2)].item()
        # Every image of the first person against every image of the second.
        candidates.extend(itertools.product(index_list1, index_list2))

In [None]:
# Draw 6000 candidate pairs without replacement using the stdlib RNG.
# NOTE(review): no seed is set in this cell, so the draw depends on the RNG
# state left behind by the cells executed before it.
selected_pairs = random.sample(candidates, 6000)

#### FAMILY-AG

In [None]:
# Output location for the FAMILY-AG pairs file.
CATEGORY = "FAMILY-AG"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# FAMILY-AG: different targets sharing gender, age group and family.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for pid1, pid2 in tqdm(selected_pairs):
        row1 = x_test.iloc[pid1]
        row2 = x_test.iloc[pid2]
        target1, name1 = row1.target, row1.name
        target2, name2 = row2.target, row2.name
        # Sanity checks on the invariants the candidate construction promises.
        assert target1 != target2
        assert row1.gender == row2.gender
        assert row1.age_group == row2.age_group
        assert row1.family_id == row2.family_id
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

#### FAMILY-CAG

In [None]:
# Output location for the FAMILY-CAG pairs file.
CATEGORY = "FAMILY-CAG"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
# Create the output directory if needed and echo the path.
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# FAMILY-CAG: same gender and age group, different families.
# One row per (gender, age_group, family_id) holding the list of positional row numbers.
df = x_test.groupby(["gender", "age_group", "family_id"]).agg(list)[["index"]]

np.random.seed(22)
random.seed(22)

# Candidate key pairs: two different rows of one (gender, age_group) group.
candidates = []
for _, group in df.groupby(["gender", "age_group"]):
    candidates.extend(itertools.combinations(list(group.index), 2))
selected_candidates = random.sample(candidates, 6000)

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for dfidx1, dfidx2 in tqdm(selected_candidates):
        # One random image per key; random.sample(..., 1) returns a
        # one-element list, so iloc yields a one-row DataFrame.
        idx1 = random.sample(df.loc[dfidx1].item(), 1)
        idx2 = random.sample(df.loc[dfidx2].item(), 1)
        row1, row2 = x_test.iloc[idx1], x_test.iloc[idx2]
        target1, name1 = row1.target.item(), row1.index.item()
        target2, name2 = row2.target.item(), row2.index.item()
        assert target1 != target2
        assert row1.gender.item() == row2.gender.item()
        assert row1.age_group.item() == row2.age_group.item()
        assert row1.family_id.item() != row2.family_id.item()
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

# Validation

In [None]:
%cd "/home/jupyter/family-photo-tree"
%pwd

from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from torch.nn.modules.distance import PairwiseDistance
from fpt.path import DATA
from fpt.model import Model
from fpt.config import cfg
from fpt.dataset import AIHubDataset
from fpt.logger import initialize_wandb
from fpt.utils import log_verification_output
from fpt.transform import aihub_valid_transforms
from facenet.validate_aihub import validate_aihub


In [None]:
# logger: build the wandb logger from the project config object `cfg`.
wandb_logger = initialize_wandb(cfg)

In [None]:
# Evaluation settings: which pairs file to score, which checkpoint to load,
# and the distance cutoff used for the accuracy computed below.
validate_task = "CASE1C"
checkpoint = "230529_0140"
# NOTE(review): presumably a threshold tuned in an earlier validation run — confirm.
best_distance = 28.465
cfg.project_name = "log_test_validation"

# dataloader: image pairs listed in pairs/test/pairs_<TASK>.txt under DATA.
# NOTE(review): the variable name mentions case1c although the file is chosen
# by `validate_task`.
aihub_pairs_case1c_dataset = AIHubDataset(
    dir=DATA / "face-image/test_aihub_family",
    pairs_path=DATA / f"pairs/test/pairs_{validate_task.upper()}.txt",
    transform=aihub_valid_transforms,
)
test_loader = DataLoader(aihub_pairs_case1c_dataset, batch_size=32)

# model: build from cfg and load the embedding weights of the chosen checkpoint.
# NOTE(review): absolute, user-specific path; it will not resolve on other machines.
model = Model(cfg)
model_path = f"/home/jongphago/family-photo-tree/work_dirs/aihub_r50_onegpu/{checkpoint}_ArcFace/model.pt"
model.load_embedding(path=model_path)

# distance_metric: pairwise L2 (p=2) distance between embedding batches.
l2_distance = PairwiseDistance(p=2)

In [None]:
# Accuracy at a fixed threshold: a pair counts as correct when
# "distance < best_distance" agrees with its label.
out = 0
# Inference only: disable autograd so no graph is built for the embeddings.
# NOTE(review): the model is not switched to eval mode here; confirm whether
# model.embedding needs it.
with torch.no_grad():
    for a, b, label in tqdm(test_loader):
        output_a = model.embedding(a.cuda())
        output_b = model.embedding(b.cuda())
        distance = l2_distance(output_a, output_b)  # Euclidean distance
        result = torch.eq(distance.cpu() < best_distance, label)
        out += result.sum().item()

print(f"{out / len(test_loader.dataset):4.2%}")

In [None]:
# Run the project's AIHub verification routine on the same loader and log the
# result through the wandb logger.
# NOTE(review): str.capitalize() lower-cases everything after the first
# character, so "CASE1C" is logged as "Case1c" — confirm this is intended.
validate_output = validate_aihub(
    model.embedding, test_loader, "r50", 1, task=validate_task
)
log_verification_output(validate_output, wandb_logger, validate_task.capitalize(), 0)