In [None]:
face = join_face_df(DTFR, "aihub_family")  # 16.1s

In [None]:
import os
import numpy as np

from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR, UTIL


In [None]:
# Pick the rows of `face` that belong to the chosen split.
TASK_CATEGORY = "train"
valid_uuids = read_split(TASK_CATEGORY)
x_valid = (
    face.loc[valid_uuids]
    .reset_index()  # presumably turns the uuid index into a "uuid" column -- verify
    .reset_index()  # adds an "index" column holding each row's position
    .set_index("uuid")
)
x_valid.keys()

In [None]:
# Sample group
# x_valid.groupby('target').index.apply(list).to_frame().head()
# x_valid.groupby(['family_id', 'personal_id', 'category']).index.apply(list).to_frame().head(10)


In [None]:
# Write the pair file for one category: NUM_FOLDS folds, each consisting of
# NUM_PAIRS matched rows followed by NUM_PAIRS mismatched rows.
np.random.seed(22)
NUM_FOLDS = 10
NUM_PAIRS = 300
CATEGORY = "Age"
target_pair = f"pairs/{TASK_CATEGORY}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)

is_family = x_valid.category == CATEGORY
family_valid = x_valid[is_family]
# One row per target; the "index" cell lists the row positions of that target's images.
idx_family_valid = family_valid.groupby("target").index.apply(list).to_frame()

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for n in range(NUM_FOLDS):
        # matched: two distinct images of the same target
        matched_sample = idx_family_valid.sample(
            n=NUM_PAIRS, replace=False, random_state=n
        ).sort_values("target")
        for key, value in matched_sample.iterrows():
            idxs = value.loc["index"]
            # NOTE(review): np.random.choice raises ValueError when a target has
            # fewer than two images.
            selected = np.random.choice(idxs, size=2, replace=False)
            f.write(f"{key:8s}\t{valid_uuids[selected[0]]}\t{valid_uuids[selected[1]]}")
            f.write("\n")

        # mismatched: one image from each of two different targets
        for i in range(NUM_PAIRS):
            mismatched_sample = idx_family_valid.sample(
                n=2, replace=False, random_state=n * 1000 + i
            ).sort_values("target")
            (target_a, row_a), (target_b, row_b) = mismatched_sample.iterrows()
            # Draw order (a, then b) is kept so the seeded NumPy stream is consumed
            # exactly as before.
            idx_a = np.random.choice(row_a.loc["index"], replace=False)
            idx_b = np.random.choice(row_b.loc["index"], replace=False)
            uuid_a, uuid_b = valid_uuids[int(idx_a)], valid_uuids[int(idx_b)]
            f.write(f"{target_a:<8}\t{uuid_a}\t{target_b:<8}\t{uuid_b}")
            f.write("\n")

# Get New Pairs

In [None]:
import pandas as pd
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR
import itertools

In [None]:
# Load the joined face table again (same arguments as the first load in this notebook).
face = join_face_df(DTFR, "aihub_family")

In [None]:
# Switch to the "valid" split and rebuild the lookup table.
TASK_CATEGORY = "valid"
valid_uuids = read_split(TASK_CATEGORY)
x_valid = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()  # adds an "index" column holding each row's position
    .set_index("uuid")
)

In [None]:
# One row per person; the "index" cell is the list of that person's row positions.
df = x_valid.groupby(["family_id", "age_group", "gender", "personal_id"]).agg(list)[
    ["index"]
]

# Collected [family_id, age_group, gender, (pos_a, pos_b)] records.
index_pairs = []

# Within each (family, age group, gender) bucket, pair every image of one
# person with every image of another person.
for _, group in df.groupby(["family_id", "age_group", "gender"]):
    index_list = group["index"].tolist()

    # A bucket holding a single person cannot produce a cross-person pair.
    if len(index_list) < 2:
        continue
    bucket_key = group.index[0][:3]
    for candidate in itertools.combinations(index_list, 2):
        for pair in itertools.product(*candidate):
            index_pairs.append([*bucket_key, pair])

# Show the collected pairs as a table.
pd.DataFrame(index_pairs, columns=["family_id", "age_group", "gender", "pairs"])

## CASE1: 가족 관계에 있는 얼굴쌍

In [None]:
import os
import numpy as np
import pandas as pd
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR
import itertools

In [None]:
# Load the joined face table again (same arguments as the earlier loads in this notebook).
face = join_face_df(DTFR, "aihub_family")

In [None]:
# Seed NumPy's global RNG and set the two numbers written to each pair-file header.
np.random.seed(22)
NUM_FOLDS = 10
NUM_PAIRS = 300

In [None]:
# Output file for the CASE1 pair list; its parent directory is created if missing.
CATEGORY = "CASE1"
TASK_CATEGORY = "test"
target_pair = f"pairs/{TASK_CATEGORY}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# Rows of the split named by TASK_CATEGORY, with a positional "index" column.
valid_uuids = read_split(TASK_CATEGORY)
x_valid = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()  # adds an "index" column holding each row's position
    .set_index("uuid")
)

In [None]:
# One row per (family_id, personal_id); the "index" cell lists that person's row positions.
df = x_valid.groupby(["family_id", "personal_id"]).agg(list)[["index"]]

In [None]:
# Matched candidates: every pair of images that belong to the same person.
total_matched_list = []
for index, value in df.itertuples():
    family_id = index[0]
    total_matched_list.extend(
        [family_id, candidate] for candidate in itertools.combinations(value, 2)
    )
total_matched_pairs = pd.DataFrame(total_matched_list, columns=["family_id", "pairs"])
# The writer cell cuts this selection into chunks of NUM_PAIRS rows, one chunk
# per fold, so it must hold exactly NUM_FOLDS * NUM_PAIRS rows.
selected_matched_pairs = total_matched_pairs.sample(
    n=NUM_FOLDS * NUM_PAIRS, replace=False, random_state=22
)

# Mismatched candidates: two different people from the same family.
total_mismatched_list = []

# Within each family, pair every image of one person with every image of another.
for _, group in df.groupby(["family_id"]):
    index_list = group["index"].tolist()

    # A family with a single person cannot produce a cross-person pair.
    if len(index_list) >= 2:
        for candidate in itertools.combinations(index_list, 2):
            for pair in itertools.product(*candidate):
                total_mismatched_list.append([*group.index[0][:1], pair])

# Collect the candidates into a frame and draw the same number as for matched pairs.
total_mismatched_pairs = pd.DataFrame(
    total_mismatched_list, columns=["family_id", "pairs"]
)
selected_mismatched_pairs = total_mismatched_pairs.sample(
    n=NUM_FOLDS * NUM_PAIRS, replace=False, random_state=22
)

In [None]:
# Cut each selection into consecutive folds of NUM_PAIRS rows.
dfs_matched = [
    selected_matched_pairs.iloc[start : start + NUM_PAIRS]
    for start in range(0, len(selected_matched_pairs), NUM_PAIRS)
]
dfs_mismatched = [
    selected_mismatched_pairs.iloc[start : start + NUM_PAIRS]
    for start in range(0, len(selected_mismatched_pairs), NUM_PAIRS)
]

# Each fold is written as its matched rows followed by its mismatched rows.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for df_matched, df_mismatched in zip(dfs_matched, dfs_mismatched):
        for idx1, idx2 in df_matched["pairs"]:
            row1, row2 = x_valid.iloc[idx1], x_valid.iloc[idx2]
            target = row1.target
            # Matched rows must show the same person twice.
            assert target == row2.target
            f.write(f"{target:8s}\t{row1.name}\t{row2.name}\n")

        for idx1, idx2 in df_mismatched["pairs"]:
            row1, row2 = x_valid.iloc[idx1], x_valid.iloc[idx2]
            target1, target2 = row1.target, row2.target
            # Mismatched rows are two different people of one family.
            assert row1.family_id == row2.family_id
            assert target1 != target2
            f.write(f"{target1:8s}\t{row1.name}\t{target2:8s}\t{row2.name}\n")

## CASE1C

In [None]:
from itertools import combinations
import numpy as np

In [None]:
# Output file for the CASE1C pair list; DIR_CATEGORY names the sub-directory under pairs/.
CATEGORY = "CASE1C"
DIR_CATEGORY = "test"
TASK_CATEGORY = "test"

target_pair = f"pairs/{DIR_CATEGORY}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
np.random.seed(22)


def get_target_and_name(family_id, df=x_valid):
    """Draw one random row of `family_id` from `df`; return its (target, index label)."""
    row = df[df.family_id == family_id].sample()
    target, name = row.target.item(), row.iloc[0].name
    return target, name


# Draw 6000 family pairs (with replacement) out of all unordered pairs of families.
unique_family_id = x_valid.family_id.unique()
unique_family_id_total_pairs = list(combinations(unique_family_id, 2))
index_list = np.random.choice(len(unique_family_id_total_pairs), 6000, replace=True)
selected_family_pairs = np.array(unique_family_id_total_pairs)[index_list]
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for fids in selected_family_pairs:
        # One random image from each of the two families.
        fid1, fid2 = fids
        target1, name1 = get_target_and_name(fid1)
        target2, name2 = get_target_and_name(fid2)
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

`pairs/test/pairs_CASE1C.txt`
```markdown
10 300
F0836-M 	7649eabb-da97-4bd6-9e66-adeee88c69cc	F0900-D 	ed80801a-5c34-4a6a-bb8f-20deb0ca9ca4
F0891-S 	749c628a-cb31-4179-9b26-19ccfa5f6018	F0895-D 	e3cfbe54-71d4-461c-861e-bb058ed594c9
F0804-D 	9d5d61c5-f8d9-485c-a912-f156e27ebc3f	F0867-S3	f94cd228-762b-4e0c-a3ce-a50eff6cb049
F0873-M 	6dc0dfb0-52c6-40ee-9f44-fb7a546560fc	F0889-D 	326b1e97-d796-4af9-982d-7ec73bff9c10
```

## CASE2:

In [None]:
import os
import itertools
import pandas as pd
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR

In [None]:
# Load the joined face table again (same arguments as the earlier loads in this notebook).
face = join_face_df(DTFR, "aihub_family")


In [None]:
# Header constants and output file for the CASE2 pair list.
NUM_FOLDS = 10
NUM_PAIRS = 300
CATEGORY = "CASE2"
# NOTE(review): this section writes under pairs/temp/, unlike the CASE1 cells that
# write under pairs/test/ -- confirm that is intended.
DIR_CATEGORY = "temp"
TASK_CATEGORY = "test"

target_pair = f"pairs/{DIR_CATEGORY}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# Identifiers of the split named by TASK_CATEGORY; used to select rows of `face`.
valid_uuids = read_split(TASK_CATEGORY)

In [None]:
x_valid = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()  # adds an "index" column holding each row's position
    .set_index("uuid")
)
# One row per (target, age_group); the "index" cell lists the matching row positions.
df = x_valid.groupby(["target", "age_group"]).agg(list)[['index']]

In [None]:
# Every pair of images sharing the same (target, age_group) row of `df`.
total_same_age_list = [  # 52310
    [*index, candidate]
    for index, value in df.itertuples()
    for candidate in itertools.combinations(value, 2)
]

# NOTE(review): the first column is labelled "family_id" but is filled from the
# first level of df's index -- confirm the label is intended.
total_same_age_pairs = pd.DataFrame(total_same_age_list, columns=["family_id", "age_group", "pairs"])
selected_same_age_pairs = total_same_age_pairs.sample(
    n=6000, replace=False, random_state=22
)

In [None]:
# SAME personal_id & SAME age_group
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for idx1, idx2 in selected_same_age_pairs["pairs"]:
        row1, row2 = x_valid.iloc[idx1], x_valid.iloc[idx2]
        # Both images must fall into the same age group.
        assert row1.age_group == row2.age_group
        f.write(f"{row1.target:8s}\t{row1.name}\t{row2.name}\n")


## CASE2C:

In [None]:
# Header constants and output file for the CASE2C pair list.
NUM_FOLDS = 10
NUM_PAIRS = 300
CATEGORY = "CASE2C"
# NOTE(review): written under pairs/temp/ rather than pairs/test/ -- confirm intended.
DIR_CATEGORY = "temp"
TASK_CATEGORY = "test"

target_pair = f"pairs/{DIR_CATEGORY}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# Collected [target, (pos_a, pos_b)] records.
total_mismatched_list = []

# For each target, pair every image of one of its `df` rows with every image of another.
for _, group in df.groupby(["target"]):
    index_list = group["index"].tolist()
    # A target with a single row in `df` cannot produce a cross-row pair.
    if len(index_list) < 2:
        continue
    target_key = group.index[0][:1]
    for candidate in itertools.combinations(index_list, 2):
        for pair in itertools.product(*candidate):
            total_mismatched_list.append([*target_key, pair])

In [None]:
# Collect the candidate pairs into a frame and draw 6000 of them without replacement.
total_mismatched_pairs = pd.DataFrame(
    total_mismatched_list, columns=["target", "pairs"]
)
selected_mismatched_pairs = total_mismatched_pairs.sample(
    n=6000, replace=False, random_state=22
)

In [None]:
# Same target, different age group: each row lists the target once and both images.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for idx1, idx2 in selected_mismatched_pairs["pairs"]:
        row1, row2 = x_valid.iloc[idx1], x_valid.iloc[idx2]
        target1, target2 = row1.target, row2.target
        assert row1.age_group != row2.age_group
        assert target1 == target2
        f.write(f"{target1:8s}\t{row1.name}\t{row2.name}\n")

## Basic feature

In [None]:
import os
import random
from tqdm import tqdm
from itertools import combinations
import pandas as pd
import numpy as np
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR

In [None]:
# The two numbers written to the header line of every pair file in this section.
NUM_FOLDS = 10
NUM_PAIRS = 300

### Gender

In [None]:
face = join_face_df(DTFR, "aihub_family")
valid_uuids = read_split("test")
x_test = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()  # adds an "index" column holding each row's position
    .set_index("uuid")
)
# One row per gender; the "index" cell lists the row positions of that gender's images.
df = x_test.groupby(["gender"]).agg(list)[['index']]

#### BASIC-G

In [None]:
# Output file for the BASIC-G pair list; its parent directory is created if missing.
CATEGORY = "BASIC-G"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
np.random.seed(22)
# The pair draws below use the stdlib `random` module, which np.random.seed does
# not touch; seed it too so the file is reproducible.
random.seed(22)
unique_group = ["Male", "Female"]
index_list = np.random.choice(len(unique_group), 6000, replace=True)
selected_group = np.array(unique_group)[index_list]

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for group in tqdm(selected_group):
        # Redraw two images of this gender until they belong to different targets.
        while True:
            pid1, pid2 = random.sample(df.loc[group]["index"], 2)
            row1, row2 = x_test.iloc[pid1], x_test.iloc[pid2]
            target1, name1 = row1.target, row1.name
            target2, name2 = row2.target, row2.name
            if target1 != target2:
                break
        group1, group2 = row1.gender, row2.gender
        assert target1 != target2
        assert group1 == group2
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")


#### BASIC-GC

In [None]:
# Output file for the BASIC-GC pair list; its parent directory is created if missing.
CATEGORY = "BASIC-GC"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
np.random.seed(22)
unique_group_total_pairs = list(combinations(["Male", "Female"], 2))
index_list = np.random.choice(len(unique_group_total_pairs), 6000, replace=True)
selected_group_pairs = np.array(unique_group_total_pairs)[index_list]


def get_target_and_name(group, df=x_test):
    """Draw one random row of gender `group` from `df`; return its (target, index label)."""
    row = df[df.gender == group].sample()
    target, name = row.target.item(), row.iloc[0].name
    return target, name


def get_pairs(group):
    """Draw one image for each of the two genders in `group`."""
    group1, group2 = group
    target1, name1 = get_target_and_name(group1)
    target2, name2 = get_target_and_name(group2)
    return target1, name1, target2, name2


with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for group in tqdm(selected_group_pairs):
        group1, group2 = group
        target1, name1, target2, name2 = get_pairs(group)
        # Redraw while both images come from the same target.
        while target1 == target2:
            target1, name1, target2, name2 = get_pairs(group)
        assert target1 != target2
        assert group1 != group2

        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

### Age

In [None]:
face = join_face_df(DTFR, "aihub_family")
valid_uuids = read_split("test")
x_test = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()  # adds an "index" column holding each row's position
    .set_index("uuid")
)
# One row per age group, listing its row positions; the "above" group is dropped.
df = x_test.groupby(["age_group"]).agg(list)[['index']]
df = df[df.index != "above"]

#### BASIC-A

In [None]:
# Output file for the BASIC-A pair list; its parent directory is created if missing.
CATEGORY = "BASIC-A"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
np.random.seed(22)
# The pair draws below use the stdlib `random` module, which np.random.seed does
# not touch; seed it too so the file is reproducible.
random.seed(22)
unique_age_group = list("abcdefgh")
index_list = np.random.choice(len(unique_age_group), 6000, replace=True)
selected_age_group = np.array(unique_age_group)[index_list]

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for age_group in tqdm(selected_age_group):
        # Redraw two images of this age group until they belong to different targets.
        while True:
            pid1, pid2 = random.sample(df.loc[age_group]["index"], 2)
            row1, row2 = x_test.iloc[pid1], x_test.iloc[pid2]
            target1, name1 = row1.target, row1.name
            target2, name2 = row2.target, row2.name
            if target1 != target2:
                break
        age_group1, age_group2 = row1.age_group, row2.age_group
        assert target1 != target2
        assert age_group1 == age_group2
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

#### BASIC-AC

In [None]:
# Output file for the BASIC-AC pair list; its parent directory is created if missing.
CATEGORY = "BASIC-AC"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
np.random.seed(22)
unique_age_group = [_ for _ in "abcdefgh"]
unique_age_group_total_pairs = list(combinations(unique_age_group, 2))
index_list = np.random.choice(len(unique_age_group_total_pairs), 6000, replace=True)
selected_age_group_pairs = np.array(unique_age_group_total_pairs)[index_list]


def get_target_and_name(age_group, df=x_test):
    """Draw one random row of `age_group` from `df`; return its (target, index label)."""
    row = df[df.age_group == age_group].sample()
    target, name = row.target.item(), row.iloc[0].name
    return target, name


def get_pairs(age_group):
    """Draw one image for each of the two age groups in `age_group`."""
    age_group1, age_group2 = age_group
    target1, name1 = get_target_and_name(age_group1)
    target2, name2 = get_target_and_name(age_group2)
    return target1, name1, target2, name2


with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for age_groups in tqdm(selected_age_group_pairs):
        age_group1, age_group2 = age_groups
        target1, name1, target2, name2 = get_pairs(age_groups)
        # Redraw while both images come from the same target.
        while target1 == target2:
            target1, name1, target2, name2 = get_pairs(age_groups)
        assert target1 != target2
        assert age_group1 != age_group2

        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

### Family

In [None]:
face = join_face_df(DTFR, "aihub_family")
valid_uuids = read_split("test")
x_test = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()  # adds an "index" column holding each row's position
    .set_index("uuid")
)
# One row per family; the "index" cell lists the row positions of that family's images.
df = x_test.groupby(["family_id"]).agg(list)[['index']]

#### BASIC-F

In [None]:
# Output file for the BASIC-F pair list; its parent directory is created if missing.
CATEGORY = "BASIC-F"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# Draw 6000 family ids, with replacement, from the families present in x_test.
np.random.seed(22)
unique_group = x_test.family_id.unique()
index_list = np.random.choice(len(unique_group), 6000, replace=True)
selected_group = np.array(unique_group)[index_list]

In [None]:
# The pair draws below use the stdlib `random` module; seed it here so that
# re-running this cell writes the same file.
random.seed(22)
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for group in tqdm(selected_group):
        # Redraw two images of this family until they belong to different targets.
        # NOTE(review): this never terminates for a family whose images all share one target.
        while True:
            pid1, pid2 = random.sample(df.loc[group]["index"], 2)
            row1, row2 = x_test.iloc[pid1], x_test.iloc[pid2]
            target1, name1 = row1.target, row1.name
            target2, name2 = row2.target, row2.name
            if target1 != target2:
                break
        group1, group2 = row1.family_id, row2.family_id
        assert target1 != target2
        assert group1 == group2
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")


#### BASIC-FC

In [None]:
# Output file for the BASIC-FC pair list; its parent directory is created if missing.
CATEGORY = "BASIC-FC"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
np.random.seed(22)
# `unique_group` is the family-id array built in the BASIC-F selection cell.
unique_group_total_pairs = list(combinations(unique_group, 2))
index_list = np.random.choice(len(unique_group_total_pairs), 6000, replace=True)
selected_group_pairs = np.array(unique_group_total_pairs)[index_list]


def get_target_and_name(group, df=x_test):
    """Draw one random row of family `group` from `df`; return its (target, index label)."""
    row = df[df.family_id == group].sample()
    target, name = row.target.item(), row.iloc[0].name
    return target, name


def get_pairs(group):
    """Draw one image for each of the two families in `group`."""
    group1, group2 = group
    target1, name1 = get_target_and_name(group1)
    target2, name2 = get_target_and_name(group2)
    return target1, name1, target2, name2


with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for group in tqdm(selected_group_pairs):
        group1, group2 = group
        target1, name1, target2, name2 = get_pairs(group)
        # Redraw while both images come from the same target.
        while target1 == target2:
            target1, name1, target2, name2 = get_pairs(group)
        assert target1 != target2
        assert group1 != group2

        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

## Under Family

In [None]:
import os
import random
import itertools
from tqdm import tqdm
from itertools import combinations
import pandas as pd
import numpy as np
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR

In [None]:
# The two numbers written to the header line of every pair file in this section.
NUM_FOLDS = 10
NUM_PAIRS = 300

In [None]:
face = join_face_df(DTFR, "aihub_family")
valid_uuids = read_split("test")
# Drop the "above" age group first, so the "index" column added below counts
# positions within the filtered table.
x_test = (
    face.loc[valid_uuids]
    .loc[lambda rows: rows.age_group != "above"]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)

### Age

In [None]:
# One row per (age_group, family_id, personal_id); the "index" cell lists that person's row positions.
df = x_test.groupby(["age_group", "family_id", "personal_id"]).agg(list)[['index']]

In [None]:
# Candidate pairs: two different people of the same family and age group.
candidates = []
for index, group in df.groupby(['age_group', 'family_id']):
    # A bucket with a single person cannot produce a cross-person pair.
    if len(group) < 2:
        continue
    # Each row of `group` is one person; pair every image of one person with
    # every image of another.
    for index_list1, index_list2 in itertools.combinations(group["index"], 2):
        candidates.extend(itertools.product(index_list1, index_list2))

In [None]:
# No earlier cell of this section seeds the stdlib `random` module; seed it here
# so the 6000 selected pairs are the same on every run.
random.seed(22)
selected_pairs = random.sample(candidates, 6000)

#### FAMILY-A

In [None]:
# Output file for the FAMILY-A pair list; its parent directory is created if missing.
CATEGORY = "FAMILY-A"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# Different people, same age group, same family.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for pid1, pid2 in tqdm(selected_pairs):
        row1, row2 = x_test.iloc[pid1], x_test.iloc[pid2]
        target1, name1 = row1.target, row1.name
        target2, name2 = row2.target, row2.name
        assert target1 != target2
        assert row1.age_group == row2.age_group
        assert row1.family_id == row2.family_id
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

#### FAMILY-CA

In [None]:
# Output file for the FAMILY-CA pair list; its parent directory is created if missing.
CATEGORY = "FAMILY-CA"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# One row per (age_group, family_id); the "index" cell lists the matching row positions.
df = x_test.groupby(["age_group", "family_id"]).agg(list)[['index']]
candidates = []

np.random.seed(22)
random.seed(22)

# Candidate keys: two different families inside the same age group.
for index, group in df.groupby("age_group"):
    all_afid = list(group.index)
    candidates.extend(itertools.combinations(all_afid, 2))
selected_candidates = random.sample(candidates, 6000)
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for dfidx1, dfidx2 in tqdm(selected_candidates):
        # One random image per key, drawn first for key 1 and then for key 2.
        row1 = x_test.iloc[random.sample(df.loc[dfidx1].item(), 1)]
        row2 = x_test.iloc[random.sample(df.loc[dfidx2].item(), 1)]
        target1, name1 = row1.target.item(), row1.index.item()
        target2, name2 = row2.target.item(), row2.index.item()
        assert target1 != target2
        assert row1.age_group.item() == row2.age_group.item()
        assert row1.family_id.item() != row2.family_id.item()
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

### Gender

In [None]:
# One row per (gender, family_id, personal_id); the "index" cell lists that person's row positions.
df = x_test.groupby(["gender", "family_id", "personal_id"]).agg(list)[['index']]
df.head()

In [None]:
# Candidate pairs: two different people of the same family and gender.
candidates = []
for index, group in df.groupby(['gender', 'family_id']):
    # A bucket with a single person cannot produce a cross-person pair.
    if len(group) < 2:
        continue
    # Each row of `group` is one person; pair every image of one person with
    # every image of another.
    for index_list1, index_list2 in itertools.combinations(group["index"], 2):
        candidates.extend(itertools.product(index_list1, index_list2))

In [None]:
# No seed is set in this cell; the draw depends on the `random` state left by
# the cells executed before it.
selected_pairs = random.sample(candidates, 6000)

#### FAMILY-G

In [None]:
# Output file for the FAMILY-G pair list; its parent directory is created if missing.
CATEGORY = "FAMILY-G"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# Different people, same gender, same family.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for pid1, pid2 in tqdm(selected_pairs):
        row1, row2 = x_test.iloc[pid1], x_test.iloc[pid2]
        target1, name1 = row1.target, row1.name
        target2, name2 = row2.target, row2.name
        assert target1 != target2
        assert row1.gender == row2.gender
        assert row1.family_id == row2.family_id
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

#### FAMILY-CG

In [None]:
# Output file for the FAMILY-CG pair list; its parent directory is created if missing.
CATEGORY = "FAMILY-CG"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# One row per (gender, family_id); the "index" cell lists the matching row positions.
df = x_test.groupby(["gender", "family_id"]).agg(list)[['index']]
candidates = []

np.random.seed(22)
random.seed(22)

# Candidate keys: two different families inside the same gender.
for index, group in df.groupby("gender"):
    all_gfid = list(group.index)
    candidates.extend(itertools.combinations(all_gfid, 2))
selected_candidates = random.sample(candidates, 6000)
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for dfidx1, dfidx2 in tqdm(selected_candidates):
        # One random image per key, drawn first for key 1 and then for key 2.
        row1 = x_test.iloc[random.sample(df.loc[dfidx1].item(), 1)]
        row2 = x_test.iloc[random.sample(df.loc[dfidx2].item(), 1)]
        target1, name1 = row1.target.item(), row1.index.item()
        target2, name2 = row2.target.item(), row2.index.item()
        assert target1 != target2
        assert row1.gender.item() == row2.gender.item()
        assert row1.family_id.item() != row2.family_id.item()
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

### Age/Gender

In [None]:
# One row per (gender, age_group, family_id, personal_id); the "index" cell lists that person's row positions.
df = x_test.groupby(["gender", "age_group", "family_id", "personal_id"]).agg(list)[['index']]
df.head()

In [None]:
# Candidate pairs: two different people of the same family, gender and age group.
candidates = []
for index, group in df.groupby(['gender', "age_group", 'family_id']):
    # A bucket with a single person cannot produce a cross-person pair.
    if len(group) < 2:
        continue
    # Each row of `group` is one person; pair every image of one person with
    # every image of another.
    for index_list1, index_list2 in itertools.combinations(group["index"], 2):
        candidates.extend(itertools.product(index_list1, index_list2))

In [None]:
# No seed is set in this cell; the draw depends on the `random` state left by
# the cells executed before it.
selected_pairs = random.sample(candidates, 6000)

#### FAMILY-AG

In [None]:
# Output file for the FAMILY-AG pair list; its parent directory is created if missing.
CATEGORY = "FAMILY-AG"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# Different people, same gender, same age group, same family.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for pid1, pid2 in tqdm(selected_pairs):
        row1, row2 = x_test.iloc[pid1], x_test.iloc[pid2]
        target1, name1 = row1.target, row1.name
        target2, name2 = row2.target, row2.name
        assert target1 != target2
        assert row1.gender == row2.gender
        assert row1.age_group == row2.age_group
        assert row1.family_id == row2.family_id
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

#### FAMILY-CAG

In [None]:
# Output file for the FAMILY-CAG pair list; its parent directory is created if missing.
CATEGORY = "FAMILY-CAG"
dir_category = "test"

target_pair = f"pairs/{dir_category}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)
print(target_pair)

In [None]:
# One row per (gender, age_group, family_id); the "index" cell lists the matching row positions.
df = x_test.groupby(["gender", "age_group", "family_id"]).agg(list)[["index"]]
candidates = []

np.random.seed(22)
random.seed(22)

# Candidate keys: two different families inside the same (gender, age group).
for index, group in df.groupby(["gender", "age_group"]):
    all_gfid = list(group.index)
    candidates.extend(itertools.combinations(all_gfid, 2))
selected_candidates = random.sample(candidates, 6000)
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for dfidx1, dfidx2 in tqdm(selected_candidates):
        # One random image per key, drawn first for key 1 and then for key 2.
        row1 = x_test.iloc[random.sample(df.loc[dfidx1].item(), 1)]
        row2 = x_test.iloc[random.sample(df.loc[dfidx2].item(), 1)]
        target1, name1 = row1.target.item(), row1.index.item()
        target2, name2 = row2.target.item(), row2.index.item()
        assert target1 != target2
        assert row1.gender.item() == row2.gender.item()
        assert row1.age_group.item() == row2.age_group.item()
        assert row1.family_id.item() != row2.family_id.item()
        f.write(f"{target1:8s}\t{name1}\t{target2:8s}\t{name2}\n")

# Validation

In [None]:
# Switch the kernel's working directory to the project checkout (hard-coded absolute path).
%cd "/home/jupyter/family-photo-tree"
%pwd

from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from torch.nn.modules.distance import PairwiseDistance
from fpt.path import DATA
from fpt.model import Model
from fpt.config import cfg
from fpt.dataset import AIHubDataset
from fpt.logger import initialize_wandb
from fpt.utils import log_verification_output
from fpt.transform import aihub_valid_transforms
from facenet.validate_aihub import validate_aihub


In [None]:
# logger -- presumably a Weights & Biases run built from `cfg` (going by the helper's name; verify)
wandb_logger = initialize_wandb(cfg)

In [None]:
# Evaluation settings: the pair file to score, the checkpoint to load, and the
# distance threshold compared against in the accuracy loop.
validate_task = "CASE1C"
checkpoint = "230529_0140"
best_distance = 28.465
# NOTE(review): set after initialize_wandb(cfg) already ran in the previous cell --
# confirm this value still takes effect.
cfg.project_name = "log_test_validation"

# dataloader
aihub_pairs_case1c_dataset = AIHubDataset(
    dir=DATA / "face-image/test_aihub_family",
    pairs_path=DATA / f"pairs/test/pairs_{validate_task.upper()}.txt",
    transform=aihub_valid_transforms,
)
test_loader = DataLoader(aihub_pairs_case1c_dataset, batch_size=32)

# model
model = Model(cfg)
# NOTE(review): absolute path under /home/jongphago, while this section changes
# directory into /home/jupyter -- confirm which checkout is meant.
model_path = f"/home/jongphago/family-photo-tree/work_dirs/aihub_r50_onegpu/{checkpoint}_ArcFace/model.pt"
model.load_embedding(path=model_path)

# distance_metric
l2_distance = PairwiseDistance(p=2)

In [None]:
# Accuracy at a fixed threshold: a pair counts as correct when
# (distance < best_distance) equals its label.
out = 0
# Pure inference: skip autograd bookkeeping so no graph is kept per batch.
# NOTE(review): the model is not switched to eval mode here -- confirm whether
# Model/load_embedding already does that.
with torch.no_grad():
    for a, b, label in tqdm(test_loader):
        output_a = model.embedding(a.cuda())
        output_b = model.embedding(b.cuda())
        distance = l2_distance(output_a, output_b)  # Euclidean distance
        result = torch.eq(distance.cpu() < best_distance, label)
        out += result.sum().item()

print(f"{out / len(test_loader.dataset):4.2%}")

In [None]:
# Run the project's verification routine on the same loader and log its output.
validate_output = validate_aihub(
    model.embedding, test_loader, "r50", 1, task=validate_task
)
# str.capitalize() lower-cases everything after the first character ("CASE1C" -> "Case1c").
log_verification_output(validate_output, wandb_logger, validate_task.capitalize(), 0)

# Visualize

In [None]:
import os
import numpy as np
import pandas as pd
import dataframe_image as dfi
from easydict import EasyDict as edict
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR, DATA

pd.set_option("display.float_format", "{:.4f}".format)

In [None]:
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc

font_path = "/usr/share/fonts/NanumFont/NanumGothicBold.ttf"
fontprop = fm.FontProperties(fname=font_path, size=18)

# Korean font setup: make this font's family name Matplotlib's default family.
font_name = fm.FontProperties(fname=font_path).get_name()
rc("font", family=font_name)

In [None]:
# Set global figure background color
plt.rcParams["figure.facecolor"] = "white"

# Apply seaborn's "white" style.
sns.set(style="white")

In [None]:
from pathlib import Path

ROOT = Path("/home/jongphago/family-photo-tree")


def savefig(target: str, extension=".png"):
    """Save the current Matplotlib figure under ROOT with a white background.

    Args:
        target: path relative to ROOT; `extension` is appended when missing.
        extension: file suffix that `target` must end with.
    """
    filename = target if target.endswith(extension) else target + extension
    image_path = ROOT / filename
    # Create the destination directory on first use.
    parent_dir = os.path.dirname(image_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    plt.savefig(image_path, facecolor="w")

In [None]:
def add_extension(path):
    """Return `path` with the suffix of an existing image file appended.

    ".jpg" is tried before ".png".

    Raises:
        RuntimeError: if neither `path + ".jpg"` nor `path + ".png"` exists.
    """
    for suffix in (".jpg", ".png"):
        candidate = path + suffix
        if os.path.exists(candidate):
            return candidate
    raise RuntimeError('No file "%s" with extension png or jpg.' % path)


def get_distance_list(distance_path, expected_count=6000):
    """Read a distance file: one header line followed by one distance per line.

    Args:
        distance_path: path of the text file to read.
        expected_count: number of distance lines the file must contain. Defaults
            to 6000, the value that used to be hard-coded; pass None to skip the check.

    Returns:
        (distances, best_distance): the body lines parsed as floats, and the float
        parsed from the second ", "-separated field of the header line.

    Raises:
        AssertionError: if the number of distances differs from `expected_count`.
    """
    with open(distance_path, "r") as f:
        header = f.readline().rstrip()
        best_distance = float(header.split(", ")[1])
        distances = [float(line.rstrip()) for line in f]
    if expected_count is not None:
        assert len(distances) == expected_count, (
            f"expected {expected_count} distances, found {len(distances)}"
        )
    return distances, best_distance


def get_lr(df, prop_name, is_sort=False):
    """Extract the left/right values of one property for every pair in `df`.

    Args:
        df: Pair frame holding `{prop_name}_left`, `{prop_name}_right`,
            `distance` and `correct` columns, plus the plain attributes
            `pair_name`, `checkpoint`, `model_type` and `best_distance`.
        prop_name: Property whose left/right columns are selected.
        is_sort: When True, the two property values of each row are sorted so
            the smaller one ends up in the `_left` column.

    Returns:
        A new DataFrame with the four selected columns plus
        `total_list` (sorted [left, right] values per row) and
        `total` (those values joined with ", "). The four attributes of `df`
        are copied onto the result.
    """
    left_col, right_col = f"{prop_name}_left", f"{prop_name}_right"
    new = df[[left_col, right_col, "distance", "correct"]].copy()

    if is_sort:
        # Assign the row-wise sorted values by position. Going through a
        # separate frame with a fresh RangeIndex plus DataFrame.update() only
        # works when `df` itself has a 0..n-1 index; otherwise nothing (or the
        # wrong rows) would be updated.
        new[[left_col, right_col]] = np.sort(
            new[[left_col, right_col]].to_numpy(), axis=1
        )

    # Sorted [left, right] values per row, independent of `is_sort`.
    new["total_list"] = df[[left_col, right_col]].apply(
        lambda x: sorted(x), axis=1
    )

    new["total"] = new["total_list"].apply(lambda x: ", ".join(map(str, x)))

    # Custom attributes do not survive the column selection/copy above, so
    # they are copied over explicitly.
    for attr in ("pair_name", "checkpoint", "model_type", "best_distance"):
        setattr(new, attr, getattr(df, attr))
    return new


def makedirs(table_name, root="/home/jupyter/family-photo-tree/utils/dataset"):
    """Create the directory that will contain `table_name` under `root`.

    Args:
        table_name: File path whose directory part is created below `root`.
        root: Base directory. Defaults to the previously hard-coded location.
    """
    dir_name = os.path.dirname(table_name)
    export_dir = os.path.join(root, dir_name)
    os.makedirs(export_dir, exist_ok=True)

In [None]:
# face dataframe
face = join_face_df(DTFR, "aihub_family")
# Strip all digits from personal_id. Raw string: "\d" in a normal string
# literal is an invalid escape sequence and triggers a warning on newer Pythons.
face["personal_id"] = face["personal_id"].str.replace(r"\d+", "", regex=True)
valid_uuids = read_split("test")

# Test-split rows, without the "above" age group, indexed by uuid.
# The double reset_index keeps the row position as an "index" column.
x_test = face.loc[valid_uuids]
x_test = x_test[x_test.age_group != "above"]
x_test = x_test.reset_index().reset_index().set_index("uuid")

In [None]:
def get_df(pair_name, model_type, checkpoint, x_test=x_test):
    """Build the analysis frame for one pair set evaluated with one checkpoint.

    Reads `DATA/distance/{model_type}/{checkpoint}/{pair_name}.txt` and
    `pairs/test/pairs_{pair_name}.txt`, combines them row by row, and joins the
    metadata in `x_test` for both images of every pair. Metadata columns get
    `_left` / `_right` suffixes.

    Args:
        pair_name: Name of the pair set (used in both file names).
        model_type: Model directory name inside the distance folder.
        checkpoint: Checkpoint directory name inside the distance folder.
        x_test: Metadata frame indexed by uuid with an "index" column. The
            default is bound when this function is defined.

    Returns:
        DataFrame with one row per pair whose two uuids are both present in
        `x_test`. `distance` is float, `correct` is 1 where
        `distance > best_distance` and 0 otherwise. `pair_name`, `checkpoint`,
        `model_type` and `best_distance` are attached as plain attributes.

    Raises:
        RuntimeError: From add_extension when an image file of a pair is missing.
    """

    def get_path(pair_name, model_type, checkpoint):
        distance_path = DATA / f"distance/{model_type}/{checkpoint}/{pair_name}.txt"
        pairs_path = f"pairs/test/pairs_{pair_name}.txt"
        return distance_path, pairs_path

    # get path
    distance_path, pairs_path = get_path(pair_name, model_type, checkpoint)

    # Distance array (n, 2): the raw distance and the `distance > best_distance`
    # flag. The array must be float: with dtype=int every distance was
    # truncated to an integer before any later analysis saw it.
    distances, best_distance = get_distance_list(distance_path)
    distances = np.array(
        [distances, np.array(distances) > best_distance], dtype=float
    ).T

    # pairs data path
    os.makedirs(os.path.dirname(pairs_path), exist_ok=True)
    aihub_dir = DATA / "face-image/test_aihub_family"

    # pairs df (6000, 6)
    # NOTE(review): issame is derived from the token count (3 tokens -> same),
    # but the seven-column layout below only fits lines with four tokens, so
    # issame may always be 0 for these files - verify against the pair format.
    pairs = []
    is_sames = []
    with open(pairs_path, "r") as f:
        for line in f.readlines()[1:]:  # first line is a header
            pair = line.strip().split()
            if not pair:
                # Skip blank lines instead of producing a ragged array.
                continue
            pairs.append(pair)
            is_sames.append(True if len(pair) == 3 else False)

    pairs = np.array(pairs, dtype=object)
    is_sames = np.array(is_sames, dtype=np.int64)[:, np.newaxis]
    columns = ["issame", "ltarget", "luuid", "rtarget", "ruuid", "distance", "correct"]
    pairs_df = pd.DataFrame(
        np.hstack((is_sames, pairs, distances)),
        columns=columns,
    )  # (6000, 6)

    # path_df
    # Resolving the image paths validates that every referenced image exists
    # (add_extension raises otherwise). The resulting frame is not used below.
    nrof_skipped_pairs = 0
    path_list = []
    issame_list = []
    for pair in pairs:
        if len(pair) == 3:
            path0 = add_extension(os.path.join(aihub_dir, pair[0], pair[1]))
            path1 = add_extension(os.path.join(aihub_dir, pair[0], pair[2]))
            issame = True
        elif len(pair) == 4:
            path0 = add_extension(os.path.join(aihub_dir, pair[0], pair[1]))
            path1 = add_extension(os.path.join(aihub_dir, pair[2], pair[3]))
            issame = False
        if os.path.exists(path0) and os.path.exists(
            path1
        ):  # Only add the pair if both paths exist
            path_list.append((path0, path1, issame))
            issame_list.append(issame)
        else:
            nrof_skipped_pairs += 1

    if nrof_skipped_pairs > 0:
        print("Skipped %d image pairs" % nrof_skipped_pairs)

    path_df = pd.DataFrame(
        path_list,
        columns=("image_left", "image_right", "issame"),
    )  # (6000, 3)

    # df
    # Inner joins: pairs whose uuid is absent from x_test are dropped.
    temp_x_test = x_test.reset_index().set_index("index")  # (8147, 15)
    temp_merged = pd.merge(
        pairs_df,
        temp_x_test,
        left_on="luuid",
        right_on="uuid",
    )
    df = pd.merge(
        temp_merged,
        temp_x_test,
        left_on="ruuid",
        right_on="uuid",
        suffixes=["_left", "_right"],
    )
    # The hstack above yields object columns; give the numeric ones real dtypes.
    df.correct = df.correct.astype(int)
    df["distance"] = df["distance"].astype(float)

    # set property
    setattr(df, "pair_name", pair_name)
    setattr(df, "checkpoint", checkpoint)
    setattr(df, "model_type", model_type)
    setattr(df, "best_distance", best_distance)

    return df

In [None]:
def grouped_ratio(lr, prop_name):
    """Return the share of rows of `lr` that fall into each `total` value.

    The result is a one-column frame named `{prop_name}_pair_ratio`, indexed by
    `total`, with `prop_name` attached as a plain attribute.
    """
    counts = lr.groupby("total")["total_list"].count()
    grouped = counts.to_frame() / len(lr)
    grouped.columns = [f"{prop_name}_pair_ratio"]
    grouped.prop_name = prop_name
    return grouped

In [None]:
# capture dataframe
def capture_df(grouped, pair_name, prop_name):
    """Export `grouped` as a PNG table image and print the image path.

    The file name encodes the property name and the frame's shape.
    """
    shape_tag = "{}x{}".format(*grouped.shape)
    file_name = f"lr-{prop_name}_pair_ratio-{shape_tag}.png"
    image_path = f"export/{pair_name}/{prop_name}/{file_name}"
    # NOTE(review): makedirs() creates the folder under its own root while the
    # export below uses this relative path - confirm both point to the same place.
    makedirs(image_path)
    dfi.export(grouped, image_path, table_conversion="matplotlib")
    print(image_path)

In [None]:
# Model and checkpoint identifiers; get_df() uses them to locate the distance files.
model_type = "single-fr-ver-1"
checkpoint = "230529_0140"
# Pair sets to load; get_df() reads pairs/test/pairs_<name>.txt and the
# matching distance file for each name. Commented-out names are not loaded.
pair_names = [
    "BASIC-G", "BASIC-GC",
    "BASIC-A", "BASIC-AC",
    "BASIC-F", "BASIC-FC",
    "FAMILY-A", "FAMILY-CA",
    "FAMILY-G", "FAMILY-CG",
    "FAMILY-AG", "FAMILY-CAG",
    # "BASIC-FN",
    # "PERSONAL-A", "PERSONAL-AC",
]

In [None]:
# Load every pair set into a DataFrame, keyed by the lower-cased pair name
# without hyphens (e.g. "BASIC-G" -> "basicg") for attribute-style access.
df_dict = edict(
    {
        pair_name.replace("-", "").lower(): get_df(pair_name, model_type, checkpoint)
        for pair_name in pair_names
    }
)

In [None]:
# Korean display label for each property name, used in printed captions and
# plot titles (gender, age group, age, image category).
prop_name_dict = edict({
    'gender':'성별',
    'age_group': '나이 그룹',
    'age': '나이',
    'category': '이미지 카테고리'
})

In [None]:
# The two pair frames compared by the tables and figures below.
# Keep exactly one assignment active; the others are alternative comparisons.
# pair_list = [df_dict.basicg, df_dict.basicgc]
pair_list = [df_dict.basica, df_dict.basicac]
# pair_list = [df_dict.basicf, df_dict.basicfc]
# pair_list = [df_dict.familya, df_dict.familyca]
# pair_list = [df_dict.familyg, df_dict.familycg]
# pair_list = [df_dict.familyag, df_dict.familycag]

## EDA

### Ratio table

#### gender

In [None]:
def draw_data_ratio_table(prop_name, pair_list):
    """Tabulate how the values of `prop_name` are distributed.

    The first row is the distribution over `x_test`; it is followed by one row
    for the left and one for the right side of every frame in `pair_list`.
    A caption line is printed before the table is returned.
    """
    kor_prop_name = prop_name_dict[prop_name]

    # Baseline row: share of each property value in the whole test frame.
    baseline = x_test.groupby(prop_name).count().label / len(x_test)
    baseline.name = "x_test"
    rows = [baseline]

    pair_name_list = []
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name)
        n_rows = len(lr)
        for side in ("left", "right"):
            ratio = lr.groupby(f"{prop_name}_{side}").count().total / n_rows
            ratio.name = f"{sub_task.pair_name}-{side}"
            rows.append(ratio)
        pair_name_list.append(sub_task.pair_name)

    joined_names = "와 ".join(pair_name_list)
    print(f"Table | {joined_names}의 {kor_prop_name} 데이터 구성 비율")
    return pd.DataFrame(rows, index=[row.name for row in rows])

In [None]:
# Gender distribution in x_test and on each side of the selected pair sets.
prop_name = 'gender'
draw_data_ratio_table(prop_name, pair_list)

#### age_group

In [None]:
# Age-group distribution in x_test and on each side of the selected pair sets.
prop_name = 'age_group'
draw_data_ratio_table(prop_name, pair_list)

#### category

In [None]:
# Category distribution in x_test and on each side of the selected pair sets.
prop_name = 'category'
draw_data_ratio_table(prop_name, pair_list)

### Heatmap

In [None]:
from matplotlib import font_manager
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def draw_pair_ratio_heatmap(prop_name, pair_list):
    """Draw, per pair set, a heatmap of how often each (left, right) value
    combination of `prop_name` occurs.

    Left/right values are sorted within each row first, so a combination is
    counted once regardless of side. All panels share one color scale.
    The mean and standard deviation of each panel and a caption are printed.

    Args:
        prop_name: Property to analyse; must be a key of `prop_name_dict`.
        pair_list: Non-empty sequence of pair frames; one panel is drawn per frame.

    Returns:
        DataFrame: ratio table of the first frame minus that of the last frame.

    Raises:
        ValueError: If `pair_list` is empty.
    """
    if len(pair_list) == 0:
        raise ValueError("pair_list must contain at least one pair frame")

    kor_prop_name = prop_name_dict[prop_name]
    left_col, right_col = f"{prop_name}_left", f"{prop_name}_right"

    # Korean font setup
    font_name = font_manager.FontProperties(
        fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf"
    ).get_name()
    plt.rc("font", family=font_name)

    # Build every ratio table once; track the global min/max for a shared scale.
    vmin = float("inf")
    vmax = float("-inf")
    pair_name_list = []
    tables = []
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name, is_sort=True)
        ratio = lr.groupby([left_col, right_col]).count().correct / len(lr)
        vmin = min(vmin, ratio.min())
        vmax = max(vmax, ratio.max())
        # Keyword arguments: pandas >= 2.0 no longer accepts them positionally.
        tables.append(
            ratio.reset_index().pivot(
                index=left_col, columns=right_col, values="correct"
            )
        )
        pair_name_list.append(sub_task.pair_name)

    # One panel per pair frame (two frames give the former 20x8 layout).
    n_pairs = len(tables)
    fig, axs = plt.subplots(
        nrows=1, ncols=n_pairs, figsize=(10 * n_pairs, 8), squeeze=False
    )

    for ax, pair_name, heatmap_data in zip(axs[0], pair_name_list, tables):
        # Draw the heatmap
        sns.heatmap(
            heatmap_data,
            cmap="YlGnBu",
            annot=True,
            fmt=".2f",
            ax=ax,
            vmin=vmin,
            vmax=vmax,
        )

        # Axis labels and title
        ax.set_title(f"[{pair_name}] {kor_prop_name} 쌍 비율", fontsize=14)
        ax.set_xlabel(f"오른쪽 {kor_prop_name}", fontsize=12)
        ax.set_ylabel(f"왼쪽 {kor_prop_name}", fontsize=12)

        m = heatmap_data.stack().mean()
        d = heatmap_data.stack().std()
        print(f"평균 {m:.4f}, 표준편차 {d:.4f}")
    print(f'Figure | {"와 ".join(pair_name_list)}의 {kor_prop_name} 데이터 쌍 비율')

    plt.tight_layout()
    plt.show()
    return tables[0] - tables[-1]

def draw_instance_ratio_heatmap(prop_name, pair_list):
    """Draw one heatmap of the `prop_name` value distribution.

    Rows are `x_test` followed by the left and right side of every frame in
    `pair_list`. The mean and standard deviation of each pair-side row and a
    caption are printed.
    """
    kor_prop_name = prop_name_dict[prop_name]

    # Korean font setup
    nanum_family = font_manager.FontProperties(
        fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf"
    ).get_name()
    plt.rc("font", family=nanum_family)

    # Baseline row: distribution over the whole test frame.
    baseline = x_test.groupby(prop_name).count().label / len(x_test)
    baseline.name = "x_test"
    rows = [baseline]

    pair_name_list = []
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name)
        for side in ("left", "right"):
            ratio = lr.groupby(f"{prop_name}_{side}").count().total / len(lr)
            ratio.name = f"{sub_task.pair_name}-{side}"
            rows.append(ratio)
            mean_ratio, std_ratio = ratio.mean(), ratio.std()
            print(f"평균 {mean_ratio:.4f}, 표준편차 {std_ratio:.4f}")
        pair_name_list.append(sub_task.pair_name)

    joined_names = "와 ".join(pair_name_list)
    print(f"Figure | {joined_names}의 {kor_prop_name} 비율 히트맵")

    out = pd.DataFrame(rows, index=[row.name for row in rows])  # (5, 8)

    # Draw with seaborn
    plt.figure(figsize=(10, 5))
    heatmap_options = dict(
        annot=True,
        cmap="YlGnBu",
        fmt=".3g",
        linewidths=0.5,
        cbar_kws={"shrink": 0.5},
    )
    sns.heatmap(out, **heatmap_options)

    # Labels and title
    bracketed_names = ",".join(pair_name_list)
    plt.title(f"[{bracketed_names}] {kor_prop_name} 비율 히트맵", fontsize=16)
    plt.xlabel(kor_prop_name, fontsize=14)
    plt.ylabel("데이터셋", fontsize=14)
    plt.tick_params(axis="y", rotation=0)

    plt.show()

#### Gender

In [None]:
# Heatmap of the gender distribution: x_test vs. each side of the pair sets.
prop_name = "gender"
draw_instance_ratio_heatmap(prop_name, pair_list)

In [None]:
# Share of each (left, right) gender combination; the cell output is the
# difference between the first and the last pair set.
prop_name = "gender"
draw_pair_ratio_heatmap(prop_name, pair_list)

#### age_group

In [None]:
# Heatmap of the age-group distribution: x_test vs. each side of the pair sets.
prop_name = "age_group"
draw_instance_ratio_heatmap(prop_name, pair_list)

In [None]:
# Share of each (left, right) age-group combination; the cell output is the
# difference between the first and the last pair set.
prop_name = "age_group"
draw_pair_ratio_heatmap(prop_name, pair_list)

#### Category

In [None]:
# Heatmap of the category distribution: x_test vs. each side of the pair sets.
prop_name = "category"
draw_instance_ratio_heatmap(prop_name, pair_list)

In [None]:
# Share of each (left, right) category combination; the cell output is the
# difference between the first and the last pair set.
prop_name = "category"
draw_pair_ratio_heatmap(prop_name, pair_list)

### KDE

#### Age diff

In [None]:
def draw_age_diff_kde(pair_list):
    """Plot the distribution of the absolute age difference of each pair set.

    The left panel shows the kernel density estimate, the right panel the
    cumulative estimate. One curve is drawn per frame in `pair_list`, and a
    caption is printed.

    Args:
        pair_list: Pair frames providing `age_left`, `age_right` and a
            `pair_name` attribute.
    """
    # Apply the seaborn style first: sns.set() resets "font.family", so the
    # Korean font has to be registered afterwards to take effect.
    sns.set(style="whitegrid")

    # Korean font setup
    font_name = font_manager.FontProperties(fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf").get_name()
    plt.rc("font", family=font_name)

    plt.figure(figsize=(20, 6))

    # Absolute age difference per pair set, computed once for both panels.
    pair_name_list = [sub_task.pair_name for sub_task in pair_list]
    age_diffs = [abs(sub_task.age_left - sub_task.age_right) for sub_task in pair_list]

    # Left panel: KDE (cut=0 keeps the curve inside the observed range)
    plt.subplot(1, 2, 1)
    for pair_name, age_diff in zip(pair_name_list, age_diffs):
        sns.kdeplot(age_diff, label=pair_name, fill=True, alpha=0.1, cut=0)

    plt.xlabel("나이 그룹 평균 나이 차이(절대값)", fontsize=14, fontproperties=fontprop)
    plt.ylabel("밀도", fontsize=14, fontproperties=fontprop)
    plt.xlim(0)
    plt.title(f"[{','.join(pair_name_list)}] 나이 그룹 평균 나이 차이에 대한 커널 밀도 추정", fontsize=16, fontproperties=fontprop)
    plt.legend(title="pair_name", title_fontsize="13", fontsize=12)

    # Right panel: cumulative KDE
    plt.subplot(1, 2, 2)
    for pair_name, age_diff in zip(pair_name_list, age_diffs):
        sns.kdeplot(age_diff, cumulative=True, label=pair_name, fill=True, alpha=0.1, cut=3)

    plt.xlabel("나이 또는 나이 그룹의 평균 나이 차이(절대값)", fontsize=14, fontproperties=fontprop)
    plt.ylabel("누적 밀도", fontsize=14, fontproperties=fontprop)
    plt.xlim(0)
    plt.title(f"[{','.join(pair_name_list)}] 나이 또는 나이 그룹 평균 나이 차이에 대한 누적 커널 밀도 추정", fontsize=16, fontproperties=fontprop)
    plt.legend(title="pair_name", title_fontsize="13", fontsize=12)

    print(f"Figure | {'와 '.join(pair_name_list)}의 나이 또는 나이 그룹 평균 나이 차이에 대한 누적 커널 밀도 추정")
    # Removing top and right borders
    sns.despine()

    plt.tight_layout()  # for better layout
    plt.show()


In [None]:
# KDE and cumulative KDE of the absolute age difference for the selected pair sets.
draw_age_diff_kde(pair_list)

## Metric

### Accuracy table

In [None]:
def draw_accuracy_table(prop_name, pair_list):
    """Tabulate mean `correct` and mean `distance` per value combination.

    For every frame in `pair_list` the rows are grouped by `total` (the sorted
    left/right values of `prop_name`). A "total" row per pair set holds the
    unweighted mean of that pair set's group means. A caption and the spread of
    the "total" accuracies are printed.

    Args:
        prop_name: Property to analyse; must be a key of `prop_name_dict`.
        pair_list: Pair frames to compare.

    Returns:
        DataFrame indexed by (`pair_name`, `total`) with `correct` and
        `distance` columns.
    """
    kor_prop_name = prop_name_dict[prop_name]
    pair_name_list = []
    dfs = []  # one grouped frame per pair set
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name)
        out = lr.groupby('total')[['correct', 'distance']].mean()
        out['pair_name'] = sub_task.pair_name
        dfs.append(out)
        pair_name_list.append(sub_task.pair_name)
    print(f'Table | {"와 ".join(pair_name_list)}의 {kor_prop_name} 정확도')

    # Stack the per-pair-set frames.
    final_df = pd.concat(dfs)

    # Summary row per pair set.
    temp_total = final_df.groupby("pair_name").mean()
    temp_total["total"] = "total"

    # max - min equals |a - b| for two pair sets and, unlike unpacking the
    # values into np.subtract, also works for one or more than two.
    accuracy_gap = temp_total.correct.max() - temp_total.correct.min()
    print(f"정확도의 차이: {accuracy_gap:.4f}")

    # Combine detail and summary rows; every (pair_name, total) key is unique,
    # so the group-wise sum only builds the sorted two-level index.
    return pd.concat([final_df.reset_index(), temp_total.reset_index()]).groupby(
        ["pair_name", "total"]
    ).sum()

#### Gender

In [None]:
# Mean correctness and distance per gender combination for each pair set.
prop_name = "gender"
draw_accuracy_table(prop_name, pair_list)

#### Age_group

In [None]:
# The ten rows with the lowest mean correctness across age-group combinations
# (the per-pair-set "total" rows are part of the table being sorted).
prop_name = "age_group"
draw_accuracy_table(prop_name, pair_list).sort_values('correct').iloc[:10]

#### Category

In [None]:
# Mean correctness and distance per category combination for each pair set.
prop_name = "category"
draw_accuracy_table(prop_name, pair_list)

### Heatmap

#### Gender

In [None]:
def draw_metric_heatmap(prop_name, pair_list):
    """Draw, per pair set, a heatmap of the mean `correct` value for every
    (left, right) value combination of `prop_name`.

    Left/right values are sorted within each row first. All panels share one
    color scale. A caption is printed.

    Args:
        prop_name: Property to analyse; must be a key of `prop_name_dict`.
        pair_list: Non-empty sequence of pair frames; one panel is drawn per frame.

    Returns:
        DataFrame: table of the first frame minus that of the last frame.

    Raises:
        ValueError: If `pair_list` is empty.
    """
    if len(pair_list) == 0:
        raise ValueError("pair_list must contain at least one pair frame")

    kor_prop_name = prop_name_dict[prop_name]
    left_col, right_col = f"{prop_name}_left", f"{prop_name}_right"

    # Korean font setup
    font_name = font_manager.FontProperties(fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf").get_name()
    plt.rc("font", family=font_name)

    # Build every table once; track the global min/max for a shared scale.
    vmin = float('inf')
    vmax = float('-inf')
    pair_name_list = []
    tables = []
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name, True)
        # Mean of the 0/1 `correct` flag == sum / count, computed on that
        # column only instead of summing every column of the frame.
        accuracy = lr.groupby([left_col, right_col])["correct"].mean()
        vmin = min(vmin, accuracy.min())
        vmax = max(vmax, accuracy.max())
        # Keyword arguments: pandas >= 2.0 no longer accepts them positionally.
        tables.append(
            accuracy.reset_index().pivot(
                index=left_col, columns=right_col, values="correct"
            )
        )
        pair_name_list.append(sub_task.pair_name)

    # One panel per pair frame (two frames give the former 20x8 layout).
    n_pairs = len(tables)
    fig, axs = plt.subplots(
        nrows=1, ncols=n_pairs, figsize=(10 * n_pairs, 8), squeeze=False
    )

    for ax, pair_name, heatmap_data in zip(axs[0], pair_name_list, tables):
        # Draw the heatmap
        sns.heatmap(heatmap_data, cmap="coolwarm", annot=True, fmt=".2f", ax=ax, vmin=vmin, vmax=vmax)
        # Axis labels and title
        ax.set_title(f"[{pair_name}] {kor_prop_name}간 검증 정확도", fontsize=14)
        ax.set_xlabel(f"오른쪽 {kor_prop_name}", fontsize=12)
        ax.set_ylabel(f"왼쪽 {kor_prop_name}", fontsize=12)

    print(f'Figure | {"와 ".join(pair_name_list)}의 {kor_prop_name} 검증 정확도')

    plt.tight_layout()
    plt.show()

    return tables[0] - tables[-1]

In [None]:
# Mean correctness per (left, right) gender combination; the cell output is
# the difference between the first and the last pair set.
prop_name = "gender"
draw_metric_heatmap(prop_name, pair_list)

#### Age group

In [None]:
# Mean correctness per (left, right) age-group combination; the cell output is
# the difference between the first and the last pair set.
prop_name = "age_group"
draw_metric_heatmap(prop_name, pair_list)

#### Category

In [None]:
# Mean correctness per (left, right) category combination; the cell output is
# the difference between the first and the last pair set.
prop_name = "category"
draw_metric_heatmap(prop_name, pair_list)

### KDE

In [None]:
from matplotlib import font_manager
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def draw_metric_kde(prop_name, pair_list, xlim=(-15, 15)):
    """Plot, per pair set, the KDE of `distance - best_distance` for every
    value combination of `prop_name`.

    One panel is drawn per frame in `pair_list`, each titled with its own pair
    name and using the same x-axis limits so the panels are comparable.
    A caption is printed.

    Args:
        prop_name: Property to analyse; must be a key of `prop_name_dict`.
        pair_list: Non-empty sequence of pair frames.
        xlim: (low, high) x-axis limits applied to every panel.

    Raises:
        ValueError: If `pair_list` is empty.
    """
    if len(pair_list) == 0:
        raise ValueError("pair_list must contain at least one pair frame")

    kor_prop_name = prop_name_dict[prop_name]

    # Apply the seaborn style first: sns.set() resets "font.family", so the
    # Korean font has to be registered afterwards to take effect.
    sns.set(style="whitegrid")

    # Korean font setup
    font_name = font_manager.FontProperties(fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf").get_name()
    plt.rc("font", family=font_name)

    # One panel per pair frame (two frames give the former 20x6 layout).
    n_pairs = len(pair_list)
    plt.figure(figsize=(10 * n_pairs, 6))

    pair_name_list = []
    for position, sub_task in enumerate(pair_list, start=1):
        plt.subplot(1, n_pairs, position)
        lr = get_lr(sub_task, prop_name)
        pair_name_list.append(sub_task.pair_name)
        for tag in lr.total.unique():
            sub_frame = lr[lr.total == tag].distance - lr.best_distance
            sns.kdeplot(sub_frame, label=tag, fill=True, alpha=0.1)

        plt.xlabel("거리 차이", fontsize=14, fontproperties=fontprop)
        plt.ylabel("밀도", fontsize=14, fontproperties=fontprop)
        # Title names only the pair set shown in this panel.
        plt.title(f"[{sub_task.pair_name}] {kor_prop_name}에 따른 거리 차이의 커널 밀도 추정", fontsize=16, fontproperties=fontprop)
        plt.legend(title=prop_name.capitalize(), title_fontsize="13", fontsize=12)
        plt.xlim(*xlim)

    print(f'Figure | {"와 ".join(pair_name_list)}의 {kor_prop_name}에 따른 거리 차이의 커널 밀도 추정')

    # Removing top and right borders
    sns.despine()

    plt.tight_layout()  # for better layout
    plt.show()


#### Gender

In [None]:
# KDE of distance minus best_distance, one curve per gender combination.
prop_name = "gender"
draw_metric_kde(prop_name, pair_list)

#### Category

In [None]:
# KDE of distance minus best_distance, one curve per category combination.
prop_name = "category"
draw_metric_kde(prop_name, pair_list)

#### Age_group

In [None]:
def draw_metric_single_kde(prop_name, pair_list):
    """Plot the KDE of `distance - best_distance` for all pair sets on one axes.

    One curve is drawn per pair set and value combination of `prop_name`,
    labelled "[pair_name] combination".
    """
    sns.set(style="whitegrid")  # seaborn style
    plt.figure(figsize=(10, 6))

    pair_name_list = []
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name)
        pair_name_list.append(sub_task.pair_name)
        # Offset of every pair's distance from the best distance.
        offsets = lr.distance - lr.best_distance
        for tag in lr.total.unique():
            curve_label = f"[{sub_task.pair_name}] {tag}"
            sns.kdeplot(offsets[lr.total == tag], label=curve_label, fill=True, alpha=0.1)

    # Labels and title
    joined_names = ",".join(pair_name_list)
    plt.xlabel("거리 차이", fontsize=14, fontproperties=fontprop)
    plt.ylabel("밀도", fontsize=14, fontproperties=fontprop)
    plt.title(f"[{joined_names}] 거리 차이의 커널 밀도 추정", fontsize=16, fontproperties=fontprop)
    plt.legend(title=prop_name.capitalize(), title_fontsize="13", fontsize=12)
    plt.xlim(-15, 15)

    # Drop the top and right borders
    sns.despine()

    plt.show()


In [None]:
# The single-axes KDE for age_group is currently disabled; uncomment the call to draw it.
prop_name = "age_group"
# draw_metric_single_kde(prop_name, pair_list)

### Scatter

#### Age

In [None]:
from matplotlib.colors import LogNorm

In [None]:
def draw_metric_scatter(prop_name, pair_list):
    """Scatter the mean `correct` value against the absolute age difference.

    One panel is drawn per frame in `pair_list`. Each point is one distinct
    age difference and is colored (log scale) by the number of pairs with that
    difference. A caption is printed.

    Args:
        prop_name: Property passed to get_lr(). The age difference is read from
            `age_left` / `age_right`, so this is expected to be "age".
        pair_list: Non-empty sequence of pair frames.

    Raises:
        ValueError: If `pair_list` is empty.
    """
    if len(pair_list) == 0:
        raise ValueError("pair_list must contain at least one pair frame")

    # Korean font setup
    font_name = font_manager.FontProperties(
        fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf"
    ).get_name()
    plt.rc("font", family=font_name)

    # One panel per pair frame (two frames give the former 20x8 layout).
    n_pairs = len(pair_list)
    fig, axs = plt.subplots(
        nrows=1, ncols=n_pairs, figsize=(10 * n_pairs, 8), squeeze=False
    )

    pair_name_list = []
    for ax, sub_task in zip(axs[0], pair_list):
        lr = get_lr(sub_task, prop_name)

        # Accuracy and sample count per age difference, kept in ONE frame so
        # that x, y and color stay aligned row by row. Sorting the accuracies
        # separately from the counts paired points with the wrong colors.
        age_diff = abs(lr.age_left - lr.age_right)
        stats = (
            lr["correct"]
            .groupby(age_diff)
            .agg(["mean", "count"])
            .sort_values(by="mean")
        )

        # Color encodes the number of pairs per age difference (log scale).
        norm = LogNorm(vmin=stats["count"].min(), vmax=stats["count"].max())

        scatter = ax.scatter(
            stats.index,
            stats["mean"],
            c=stats["count"],
            # Colormap by name: plt.cm.get_cmap is gone from recent matplotlib.
            cmap="PRGn",
            norm=norm,
            alpha=0.7,
        )

        # Colorbar
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label("데이터 수", rotation=270, labelpad=15, fontsize=10)

        # Axis labels and title
        ax.set_xlabel("나이 또는 나이 그룹의 평균 나이 차이(절대값)", fontsize=12)
        ax.set_ylabel("검증 정확도", fontsize=12)
        ax.set_title(f"[{sub_task.pair_name}] 나이 차이에 따른 검증 정확도", fontsize=14)
        ax.set_xlim(0)
        ax.set_ylim(0.5)
        pair_name_list.append(sub_task.pair_name)

    print(f'Figure | {"와 ".join(pair_name_list)}의 나이 차이에 따른 검증 정확도')
    plt.tight_layout()
    plt.show()

In [None]:
# Mean correctness versus absolute age difference for the selected pair sets.
prop_name = "age"
draw_metric_scatter(prop_name, pair_list)