In [None]:
import os
import numpy as np

from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR, UTIL


In [None]:
# Load the face table for the "aihub_family" dataset through the project helper.
# NOTE(review): later cells select rows with `face.loc[<uuids>]`, so the result is
# presumably a DataFrame keyed by uuid -- confirm against fpt.data.join_face_df.
face = join_face_df(DTFR, "aihub_family")  # 16.1s (runtime noted by the author)

In [None]:
# Select the rows of the chosen split and attach a positional row number.
# NOTE: the variables are called "valid" but the split selected here is "train".
TASK_CATEGORY = "train"
valid_uuids = read_split(TASK_CATEGORY)

# First reset_index() turns the current index into a column, the second one adds
# an "index" column holding each row's position; "uuid" is then restored as index.
x_valid = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)
x_valid.keys()

In [None]:
# Sample group
# x_valid.groupby('target').index.apply(list).to_frame().head()
# x_valid.groupby(['family_id', 'personal_id', 'category']).index.apply(list).to_frame().head(10)


In [None]:
# Write the pair-list file for one category.
# File layout: a "<NUM_FOLDS> <NUM_PAIRS>" header line, then for every fold
# NUM_PAIRS matched lines (target, uuid, uuid) followed by NUM_PAIRS mismatched
# lines (target_a, uuid_a, target_b, uuid_b), all tab-separated.
np.random.seed(22)  # fixes the np.random.choice draws below
NUM_FOLDS = 10
NUM_PAIRS = 300
CATEGORY = "Age"
target_pair = f"pairs/{TASK_CATEGORY}/pairs_{CATEGORY}.txt"
os.makedirs(os.path.dirname(target_pair), exist_ok=True)

# Keep only rows of the requested category, then collect for every target the
# list of positional row numbers (the "index" column) that belong to it.
is_family = x_valid.category == CATEGORY
family_valid = x_valid[is_family]
idx_family_valid = family_valid.groupby("target")["index"].apply(list).to_frame()

with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for n in range(NUM_FOLDS):
        # matched: NUM_PAIRS distinct targets, two different rows of each target.
        # The sample size follows NUM_PAIRS so it always agrees with the header.
        # NOTE(review): a target with a single row makes the size-2 draw below
        # raise ValueError; nothing here filters such targets out.
        matched_sample = idx_family_valid.sample(
            n=NUM_PAIRS, replace=False, random_state=n
        ).sort_values("target")
        for key, value in matched_sample.iterrows():
            idxs = value.loc["index"]
            selected = np.random.choice(idxs, size=2, replace=False)
            f.write(f"{key:8s}\t{valid_uuids[selected[0]]}\t{valid_uuids[selected[1]]}\n")

        # mismatched: NUM_PAIRS times, draw two distinct targets and one row of each.
        # NOTE(review): the per-draw seed n * 1000 + i repeats across folds once
        # NUM_PAIRS exceeds 1000.
        for i in range(NUM_PAIRS):
            mismatched_sample = idx_family_valid.sample(
                n=2, replace=False, random_state=n * 1000 + i
            ).sort_values("target")
            (target_a, idxs_a), (target_b, idxs_b) = (
                (key, value.loc["index"]) for key, value in mismatched_sample.iterrows()
            )
            # Draw for a first, then for b, so the global RNG stream is consumed
            # in a fixed order.
            idx_a = np.random.choice(idxs_a, replace=False)
            idx_b = np.random.choice(idxs_b, replace=False)
            uuid_a, uuid_b = valid_uuids[int(idx_a)], valid_uuids[int(idx_b)]
            f.write(f"{target_a:<8}\t{uuid_a}\t{target_b:<8}\t{uuid_b}\n")

# Get New Pairs

In [None]:
import pandas as pd
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR
import itertools

In [None]:
# Reload the face table for the "aihub_family" dataset for this section.
# NOTE(review): the result is indexed below with `face.loc[<uuids>]`, so it is
# presumably a DataFrame keyed by uuid -- confirm against fpt.data.join_face_df.
face = join_face_df(DTFR, "aihub_family")

In [None]:
# Select the rows of the "valid" split and attach a positional row number.
TASK_CATEGORY = "valid"
valid_uuids = read_split(TASK_CATEGORY)

# After the double reset_index() the frame carries an "index" column with each
# row's position; "uuid" is then put back as the index.
x_valid = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)

In [None]:
# One row per (family_id, age_group, gender, personal_id); the "index" column
# holds the list of positional row numbers of that person.
df = x_valid.groupby(["family_id", "age_group", "gender", "personal_id"]).agg(list)[["index"]]

# Accumulates [family_id, age_group, gender, (row_a, row_b)] records.
index_pairs = []

# Within each (family_id, age_group, gender) bucket, pair every row of one person
# with every row of each other person in the same bucket.
for _, members in df.groupby(["family_id", "age_group", "gender"]):
    rows_per_person = members["index"].tolist()

    # A bucket with a single person cannot produce a cross-person pair.
    if len(rows_per_person) < 2:
        continue

    bucket_key = members.index[0][:3]
    for rows_a, rows_b in itertools.combinations(rows_per_person, 2):
        index_pairs.extend(
            [*bucket_key, row_pair] for row_pair in itertools.product(rows_a, rows_b)
        )

# Display the collected pairs as a table.
pd.DataFrame(index_pairs, columns=["family_id", "age_group", "gender", "pairs"])

## CASE1: 가족 관계에 있는 얼굴쌍

In [None]:
import os
import numpy as np
import pandas as pd
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR
import itertools

In [None]:
# Reload the face table for the "aihub_family" dataset for the CASE1 section.
# NOTE(review): the result is indexed below with `face.loc[<uuids>]`, so it is
# presumably a DataFrame keyed by uuid -- confirm against fpt.data.join_face_df.
face = join_face_df(DTFR, "aihub_family")

In [None]:
# Seed NumPy's global RNG.
# NOTE(review): the CASE1 cells visible below sample through pandas with an explicit
# random_state, so this seed may not influence them -- confirm before relying on it.
np.random.seed(22)
# Layout of the pair file: NUM_FOLDS folds, each with NUM_PAIRS matched and
# NUM_PAIRS mismatched pairs (both values are written to the file header).
NUM_FOLDS = 10
NUM_PAIRS = 300

In [None]:
# Output location of the CASE1 pair file for the "test" split.
TASK_CATEGORY = "test"
CATEGORY = "CASE1"
target_pair = "pairs/{}/pairs_{}.txt".format(TASK_CATEGORY, CATEGORY)

# Create the parent directory up front so the later open(..., "w") cannot fail on it.
os.makedirs(os.path.split(target_pair)[0], exist_ok=True)
print(target_pair)

In [None]:
# Select the rows of the split named by TASK_CATEGORY and attach a positional
# row number: the double reset_index() leaves an "index" column with each row's
# position, and "uuid" is then put back as the index.
valid_uuids = read_split(TASK_CATEGORY)
x_valid = (
    face.loc[valid_uuids]
    .reset_index()
    .reset_index()
    .set_index("uuid")
)

In [None]:
# One row per (family_id, personal_id); the single kept column "index" holds the
# list of positional row numbers in x_valid that belong to that person.
df = x_valid.groupby(["family_id", "personal_id"]).agg(list)[["index"]]

In [None]:
# Number of pairs to draw per kind. Derived from the fold layout so it always
# matches the file header and the per-fold chunking done when writing the file.
num_selected = NUM_FOLDS * NUM_PAIRS

# Matched candidates: every unordered pair of rows belonging to the same person.
total_matched_list = []
for (family_id, _personal_id), person_rows in df.itertuples():
    for candidate in itertools.combinations(person_rows, 2):
        total_matched_list.append([family_id, candidate])
total_matched_pairs = pd.DataFrame(total_matched_list, columns=["family_id", "pairs"])
selected_matched_pairs = total_matched_pairs.sample(
    n=num_selected, replace=False, random_state=22
)

# Mismatched candidates: within one family, every row of one person paired with
# every row of each other person of that family.
total_mismatched_list = []

for _, group in df.groupby("family_id"):
    index_list = group["index"].tolist()

    # A family with a single person cannot yield a cross-person pair.
    if len(index_list) >= 2:
        family_id = group.index[0][0]
        for candidate in itertools.combinations(index_list, 2):
            for pair in itertools.product(*candidate):
                total_mismatched_list.append([family_id, pair])

# Draw the mismatched pairs that will be written out.
total_mismatched_pairs = pd.DataFrame(
    total_mismatched_list, columns=["family_id", "pairs"]
)
selected_mismatched_pairs = total_mismatched_pairs.sample(
    n=num_selected, replace=False, random_state=22
)

In [None]:
# Cut each selection into consecutive chunks of NUM_PAIRS rows: one chunk per fold.
dfs_matched = [
    selected_matched_pairs.iloc[start : start + NUM_PAIRS]
    for start in range(0, len(selected_matched_pairs), NUM_PAIRS)
]
dfs_mismatched = [
    selected_mismatched_pairs.iloc[start : start + NUM_PAIRS]
    for start in range(0, len(selected_mismatched_pairs), NUM_PAIRS)
]

# File layout: header line, then per fold the matched lines followed by the
# mismatched lines. The stored pair members are positional row numbers of x_valid.
with open(target_pair, "w") as f:
    f.write(f"{NUM_FOLDS} {NUM_PAIRS}\n")
    for df_matched, df_mismatched in zip(dfs_matched, dfs_mismatched):
        # Matched: both rows must carry the same target.
        for pos_a, pos_b in df_matched["pairs"]:
            row_a = x_valid.iloc[pos_a]
            row_b = x_valid.iloc[pos_b]
            target = row_a.target
            assert target == row_b.target
            # .name of a row is its index label, i.e. the uuid.
            f.write(f"{target:8s}\t{row_a.name}\t{row_b.name}\n")

        # Mismatched: same family, different targets.
        for pos_a, pos_b in df_mismatched["pairs"]:
            row_a = x_valid.iloc[pos_a]
            row_b = x_valid.iloc[pos_b]
            target1 = row_a.target
            target2 = row_b.target
            assert row_a.family_id == row_b.family_id
            assert target1 != target2
            f.write(f"{target1:8s}\t{row_a.name}\t{target2:8s}\t{row_b.name}\n")