In [None]:
import os
import numpy as np
import pandas as pd
import dataframe_image as dfi
from easydict import EasyDict as edict
from fpt.data import join_face_df
from fpt.split import read_split
from fpt.path import DTFR, DATA

# Render floats with four decimal places in every displayed DataFrame.
pd.set_option("display.float_format", "{:.4f}".format)

In [None]:
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc

# Font file used for all Korean text in this notebook.
font_path = "/usr/share/fonts/NanumFont/NanumGothicBold.ttf"

# Explicit FontProperties handle for calls that accept `fontproperties=`.
fontprop = fm.FontProperties(fname=font_path, size=18)

# Korean font setup: read the family name stored in the font file and make it
# matplotlib's default font family.
font_name = fm.FontProperties(fname=font_path).get_name()
rc("font", family=font_name)

In [None]:
# Apply the seaborn theme first: sns.set() rewrites matplotlib rcParams
# (font.family and figure.facecolor among them), so values set before the call
# are overwritten. Passing `font=` keeps the Korean font family resolved in the
# font-setup cell instead of falling back to seaborn's default "sans-serif".
sns.set(style="white", font=font_name)

# Set global figure background color
plt.rcParams["figure.facecolor"] = "white"

In [None]:
from pathlib import Path

# Project root; savefig() resolves its target paths relative to this directory.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
ROOT = Path("/home/jongphago/family-photo-tree")


def savefig(target: str, extension=".png"):
    """Save the current matplotlib figure under ``ROOT``.

    Args:
        target: Path relative to ``ROOT``. ``extension`` is appended when the
            path does not already end with it.
        extension: File suffix that selects the output name (default ``.png``).

    The parent directory is created when it does not exist yet, and the figure
    is written with a white face colour.
    """
    filename = target if target.endswith(extension) else target + extension
    image_path = ROOT / filename

    parent_dir = os.path.dirname(image_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    plt.savefig(image_path, facecolor="w")

In [None]:
def add_extension(path):
    """Return ``path`` completed with the image extension that exists on disk.

    ``.jpg`` is tried first, then ``.png``.

    Raises:
        RuntimeError: If neither ``path + ".jpg"`` nor ``path + ".png"`` exists.
    """
    for suffix in (".jpg", ".png"):
        candidate = path + suffix
        if os.path.exists(candidate):
            return candidate
    raise RuntimeError('No file "%s" with extension png or jpg.' % path)


def get_distance_list(distance_path, expected_count=6000):
    """Read the distances stored for one pair set.

    File layout, as parsed here: the first line is split on ``", "`` and its
    second field is the best-distance threshold; every following non-blank
    line holds one float distance.

    Args:
        distance_path: Path of the distance text file.
        expected_count: Number of distances the file must contain
            (default 6000). Pass ``None`` to skip the check.

    Returns:
        Tuple ``(distances, best_distance)`` where ``distances`` is a list of
        floats in file order and ``best_distance`` is a float.

    Raises:
        ValueError: If the number of distances differs from ``expected_count``
            or a line cannot be parsed as a float.
    """
    with open(distance_path, "r") as f:
        header = f.readline().rstrip()
        best_distance = float(header.split(", ")[1])
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped instead of crashing float().
        distances = [float(line) for line in f if line.strip()]

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if expected_count is not None and len(distances) != expected_count:
        raise ValueError(
            "Expected %d distances in %s, found %d"
            % (expected_count, distance_path, len(distances))
        )
    return distances, best_distance


def get_lr(df, prop_name, is_sort=False):
    """Extract the left/right values of one property from a pair frame.

    Args:
        df: Pair frame with columns ``{prop_name}_left``, ``{prop_name}_right``,
            ``distance`` and ``correct``, and with the attributes
            ``pair_name``, ``checkpoint``, ``model_type`` and ``best_distance``.
        prop_name: Property prefix, e.g. ``"gender"``.
        is_sort: When true, each row's (left, right) values are sorted so the
            smaller value ends up in the ``_left`` column.

    Returns:
        A new DataFrame with the four selected columns plus
        ``total_list`` (sorted ``[left, right]`` list per row) and
        ``total`` (those two values joined with ``", "``). The four metadata
        attributes of ``df`` are copied onto the result.
    """
    left_col, right_col = f"{prop_name}_left", f"{prop_name}_right"
    new = df[[left_col, right_col, "distance", "correct"]].copy()

    if is_sort:
        # Assign the row-wise sorted values positionally. DataFrame.update()
        # aligns on index labels and skips NaN, so a frame whose index is not
        # 0..n-1 would not be sorted by it.
        sorted_values = np.sort(new[[left_col, right_col]].to_numpy(), axis=1)
        new[left_col] = sorted_values[:, 0]
        new[right_col] = sorted_values[:, 1]

    # apply sorted function on total_list
    new["total_list"] = df[[left_col, right_col]].apply(lambda x: sorted(x), axis=1)
    new["total"] = new["total_list"].apply(lambda x: ", ".join(map(str, x)))

    # Column selection and copy() drop custom attributes, so carry the
    # metadata over by hand.
    for attr in ("pair_name", "checkpoint", "model_type", "best_distance"):
        setattr(new, attr, getattr(df, attr))
    return new


def makedirs(table_name, root="/home/jupyter/family-photo-tree/utils/dataset"):
    """Create the directory that will hold ``table_name`` below ``root``.

    Args:
        table_name: File path; only its directory part is used. It is joined
            onto ``root`` with ``os.path.join``, so an absolute ``table_name``
            replaces ``root``.
        root: Base directory under which the export directory is created.
            Defaults to the original hard-coded dataset directory.

    Existing directories are left untouched.
    """
    export_dir = os.path.join(root, os.path.dirname(table_name))
    os.makedirs(export_dir, exist_ok=True)

In [None]:
# face dataframe
face = join_face_df(DTFR, "aihub_family")
# Strip every digit run from personal_id. Raw string: "\d" in a normal string
# literal is an invalid escape sequence.
face["personal_id"] = face["personal_id"].str.replace(r"\d+", "", regex=True)
valid_uuids = read_split("test")

# Keep the test split only and drop rows whose age_group is "above".
x_test = face.loc[valid_uuids]
x_test = x_test[x_test.age_group != "above"]
# The second reset_index() adds a positional "index" column (0..n-1) that
# get_df() later uses as the index; the frame itself stays indexed by uuid.
x_test = x_test.reset_index().reset_index().set_index("uuid")

In [None]:
def get_df(pair_name, model_type, checkpoint, x_test=x_test):
    """Build the evaluation frame for one pair set and one model checkpoint.

    Reads ``DATA/distance/{model_type}/{checkpoint}/{pair_name}.txt`` through
    ``get_distance_list`` and ``pairs/test/pairs_{pair_name}.txt`` (relative to
    the working directory), then joins both sides of every pair with the
    face metadata in ``x_test``.

    Pair file layout, as parsed here: the first line is skipped; a line with
    three fields ``target uuid_left uuid_right`` is a same-identity pair, a
    line with four fields ``target_left uuid_left target_right uuid_right`` is
    a different-identity pair.

    Args:
        pair_name: Pair-set name, e.g. ``"BASIC-G"``.
        model_type: Model directory name, e.g. ``"single-fr-ver-1"``.
        checkpoint: Checkpoint directory name.
        x_test: Face metadata indexed by uuid and carrying an ``index``
            column. The default is bound when this function is defined.

    Returns:
        DataFrame with the columns ``issame``, ``ltarget``, ``luuid``,
        ``rtarget``, ``ruuid``, ``distance`` (float), ``correct`` (bool)
        followed by every ``x_test`` column suffixed ``_left`` / ``_right``.
        Pairs whose uuid is missing from ``x_test`` are dropped by the inner
        joins. ``pair_name``, ``checkpoint``, ``model_type`` and
        ``best_distance`` are attached as attributes.

    Raises:
        ValueError: On a pair line with an unexpected number of fields, or
            when the number of pairs and distances differ.
    """
    # get path
    distance_path = DATA / f"distance/{model_type}/{checkpoint}/{pair_name}.txt"
    pairs_path = f"pairs/test/pairs_{pair_name}.txt"

    # Distances stay floats; casting them to int would truncate the values
    # that are compared against best_distance below.
    distances, best_distance = get_distance_list(distance_path)

    # Normalise every pair to (issame, ltarget, luuid, rtarget, ruuid).
    records = []
    with open(pairs_path, "r") as f:
        next(f, None)  # skip the header line
        for line_number, line in enumerate(f, start=2):
            fields = line.split()
            if not fields:
                continue
            if len(fields) == 3:
                target, luuid, ruuid = fields
                records.append((1, target, luuid, target, ruuid))
            elif len(fields) == 4:
                records.append((0, *fields))
            else:
                raise ValueError(
                    "%s line %d: expected 3 or 4 fields, got %d"
                    % (pairs_path, line_number, len(fields))
                )

    pairs_df = pd.DataFrame(
        records, columns=["issame", "ltarget", "luuid", "rtarget", "ruuid"]
    )
    if len(pairs_df) != len(distances):
        raise ValueError(
            "%s has %d pairs but %s has %d distances"
            % (pairs_path, len(pairs_df), distance_path, len(distances))
        )
    pairs_df["distance"] = np.asarray(distances, dtype=float)

    # A same-identity pair is correct when its distance is at or below the
    # threshold; a different-identity pair when it is above.
    pairs_df["correct"] = np.where(
        pairs_df["issame"].to_numpy().astype(bool),
        pairs_df["distance"] <= best_distance,
        pairs_df["distance"] > best_distance,
    )

    # Attach the face metadata of the left and then the right image.
    temp_x_test = x_test.reset_index().set_index("index")
    temp_merged = pd.merge(
        pairs_df,
        temp_x_test,
        left_on="luuid",
        right_on="uuid",
    )
    df = pd.merge(
        temp_merged,
        temp_x_test,
        left_on="ruuid",
        right_on="uuid",
        suffixes=["_left", "_right"],
    )

    # set property
    setattr(df, "pair_name", pair_name)
    setattr(df, "checkpoint", checkpoint)
    setattr(df, "model_type", model_type)
    setattr(df, "best_distance", best_distance)

    return df

In [None]:
def grouped_ratio(lr, prop_name):
    """Return the share of rows per ``total`` value.

    Args:
        lr: Frame with the columns ``total`` and ``total_list``.
        prop_name: Property name; used for the result column name.

    Returns:
        Single-column DataFrame ``{prop_name}_pair_ratio`` indexed by
        ``total``, with ``prop_name`` attached as an attribute.
    """
    pair_counts = lr.groupby("total")["total_list"].count()
    grouped = (pair_counts / len(lr)).to_frame(name=f"{prop_name}_pair_ratio")
    grouped.prop_name = prop_name
    return grouped

In [None]:
# capture dataframe
def capture_df(grouped, pair_name, prop_name):
    """Export ``grouped`` as a PNG table image and print the image path.

    The image is written to
    ``export/{pair_name}/{prop_name}/lr-{prop_name}_pair_ratio-{rows}x{cols}.png``
    relative to the current working directory.

    Args:
        grouped: DataFrame to render.
        pair_name: Pair-set name used as a directory level.
        prop_name: Property name used as a directory level and in the file name.
    """
    grouped_shape = "{}x{}".format(*grouped.shape)
    image_path = f"export/{pair_name}/{prop_name}/lr-{prop_name}_pair_ratio-{grouped_shape}.png"
    # Create the directory of the exact path handed to dfi.export(); a
    # directory created under some other root does not help the export.
    os.makedirs(os.path.dirname(image_path), exist_ok=True)
    dfi.export(grouped, image_path, table_conversion="matplotlib")
    print(image_path)

In [None]:
# Active model/checkpoint: exactly one of the assignments below is uncommented.
# model_type, checkpoint = "single-fr-ver-1", "230529_0140"
# model_type, checkpoint = "dual-frkr-ver-1", "230602_2021"
model_type, checkpoint = "triple-fraekr-ver-1", "230601_1838"
# Pair-set names passed to get_df(), which reads one pairs file and one
# distance file per name.
pair_names = [
    "BASIC-G", "BASIC-GC",
    "BASIC-A", "BASIC-AC",
    "BASIC-F", "BASIC-FC",
    "FAMILY-A", "FAMILY-CA",
    "FAMILY-G", "FAMILY-CG",
    "FAMILY-AG", "FAMILY-CAG",
    "BASIC-FN",
    "PERSONAL-A", "PERSONAL-AC",
]

In [None]:
# Parallel lists: model_types[i] is evaluated with checkpoints[i]
# (they are zipped together when `master` is built).
model_types = [
    "single-fr-ver-1",
    "dual-frkr-ver-1",
    "triple-fraekr-ver-1",
]
checkpoints = [
    "230529_0140",
    "230602_2021",
    "230601_1838",
]

In [None]:
# master[task][pair_key] -> pair frame from get_df().
#   task     : first "-"-separated token of model_type
#   pair_key : pair name lower-cased with "-" removed (e.g. "BASIC-G" -> "basicg")
# Note: model_type, checkpoint and task are module-level names, so this loop
# overwrites the values assigned in the earlier configuration cell.
master = edict()
for model_type, checkpoint in zip(model_types, checkpoints):
    task = model_type.rsplit("-")[0]
    df_dict = edict(
        {
            pair_name.replace("-", "").lower(): get_df(
                pair_name, model_type, checkpoint
            )
            for pair_name in pair_names
        }
    )
    master[task] = df_dict

In [None]:
# Pair frames of the model currently named by `model_type`. They were already
# loaded into `master` above, so reuse them instead of reading every pairs and
# distance file a second time through get_df().
df_dict = master[model_type.rsplit("-")[0]]

In [None]:
# Korean display label for each property name; the draw_* functions use these
# labels in printed captions, figure titles and axis labels.
prop_name_dict = edict({
    'gender':'성별',
    'age_group': '나이 그룹',
    'age': '나이',
    'category': '이미지 카테고리'
})

In [None]:
# Choose what to compare. `pair_list` is the list of pair frames handed to
# every draw_* function below; keep exactly one assignment uncommented.
df_dict = master.triple
# Two pair sets evaluated with the same (triple) model:
# pair_list = [df_dict.basicg, df_dict.basicgc]
# pair_list = [df_dict.basica, df_dict.basicac]
# pair_list = [df_dict.basicf, df_dict.basicfc]
# pair_list = [df_dict.familya, df_dict.familyca]
# pair_list = [df_dict.familyg, df_dict.familycg]
# pair_list = [df_dict.familyag, df_dict.familycag]
# pair_list = [df_dict.personala, df_dict.personalac]

# The same pair set evaluated with the three models:
# pair_list = [master.single.basicg, master.dual.basicg, master.triple.basicg]
pair_list = [master.single.personalac, master.dual.personalac, master.triple.personalac]

# task = sub_task.model_type.rsplit('-')[0].upper()
# Echo the selection as "<first 4 chars of model_type>-<pair_name>".
print(f'[PAIRS]\t{", ".join([f"{pair.model_type[:4]}-{pair.pair_name}" for pair in pair_list])}')

## EDA

### Ratio table

#### gender

In [None]:
def draw_data_ratio_table(prop_name, pair_list):
    """Tabulate how the values of ``prop_name`` are distributed.

    The first row is the distribution over ``x_test``; it is followed by one
    row per pair frame and side (left, right), named
    ``{MODEL}-{pair_name}-{side}``.

    Args:
        prop_name: Key of ``prop_name_dict`` and column name in ``x_test``.
        pair_list: Pair frames as returned by ``get_df``.

    Returns:
        DataFrame with one row per distribution and one column per value.
        A caption line is printed as a side effect.
    """
    kor_prop_name = prop_name_dict[prop_name]

    # Reference distribution over the whole test set.
    baseline = x_test.groupby(prop_name).count().label / len(x_test)
    baseline.name = "x_test"
    rows = [baseline]

    pair_name_list = []
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name)
        task = sub_task.model_type.split('-')[0].upper()
        for side in ("left", "right"):
            ratio = lr.groupby(f"{prop_name}_{side}").count().total / len(lr)
            ratio.name = f"{task}-{sub_task.pair_name}-{side}"
            rows.append(ratio)
        pair_name_list.append(sub_task.pair_name)

    print(f'Table | {"와 ".join(pair_name_list)}의 {kor_prop_name} 데이터 구성 비율')
    return pd.DataFrame(rows, index=[row.name for row in rows])

In [None]:
# Gender distribution: x_test versus the left/right side of each pair frame.
prop_name = 'gender'
draw_data_ratio_table(prop_name, pair_list)

#### age_group

In [None]:
# Age-group distribution: x_test versus the left/right side of each pair frame.
prop_name = 'age_group'
draw_data_ratio_table(prop_name, pair_list)

#### category

In [None]:
# Image-category distribution: x_test versus the left/right side of each pair frame.
prop_name = 'category'
draw_data_ratio_table(prop_name, pair_list)

### Heatmap

In [None]:
from matplotlib import font_manager
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def draw_pair_ratio_heatmap(prop_name, pair_list):
    """Draw one heatmap per pair frame showing the share of each value pair.

    Each cell is the fraction of rows whose (left, right) values of
    ``prop_name`` form that combination after row-wise sorting
    (``get_lr(..., is_sort=True)``). All heatmaps share one colour range.

    Besides the figure, this displays the difference between consecutive
    matrices and between the last and the first matrix, and prints the mean
    and standard deviation of every matrix plus a caption line.

    Args:
        prop_name: Key of ``prop_name_dict`` and property prefix in the frames.
        pair_list: Non-empty list of pair frames as returned by ``get_df``.

    Raises:
        ValueError: If ``pair_list`` is empty.
    """
    if not pair_list:
        raise ValueError("pair_list must contain at least one pair frame")

    kor_prop_name = prop_name_dict[prop_name]
    left_col, right_col = f"{prop_name}_left", f"{prop_name}_right"

    # Korean font setup
    font_name = font_manager.FontProperties(
        fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf"
    ).get_name()
    plt.rc("font", family=font_name)

    # Build each ratio matrix once; it feeds both the shared colour range and
    # the plot.
    matrices = []
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name, is_sort=True)
        ratio = lr.groupby([left_col, right_col])["correct"].count() / len(lr)
        # Keyword arguments: DataFrame.pivot no longer accepts them
        # positionally in pandas 2.x.
        matrices.append(
            ratio.reset_index().pivot(
                index=left_col, columns=right_col, values="correct"
            )
        )

    # Compute the global min and max to use for all heatmaps
    vmin = min(matrix.min().min() for matrix in matrices)
    vmax = max(matrix.max().max() for matrix in matrices)

    # squeeze=False keeps a 2-D axes array even when there is a single pair.
    fig, axs = plt.subplots(
        nrows=1,
        ncols=len(pair_list),
        figsize=(10 * len(pair_list), 8),
        squeeze=False,
    )

    pair_name_list = []
    previous = None
    for ax, sub_task, heatmap_data in zip(axs[0], pair_list, matrices):
        if previous is not None:
            display(previous - heatmap_data)
        previous = heatmap_data

        # Draw the heatmap
        sns.heatmap(
            heatmap_data,
            cmap="YlGnBu",
            annot=True,
            fmt=".2f",
            ax=ax,
            vmin=vmin,
            vmax=vmax,
        )

        # Axis labels and title
        ax.set_title(f"[{sub_task.pair_name}] {kor_prop_name} 쌍 비율", fontsize=14)
        ax.set_xlabel(f"오른쪽 {kor_prop_name}", fontsize=12)
        ax.set_ylabel(f"왼쪽 {kor_prop_name}", fontsize=12)

        stacked = heatmap_data.stack()
        print(f"평균 {stacked.mean():.4f}, 표준편차 {stacked.std():.4f}")
        pair_name_list.append(sub_task.pair_name)

    display(matrices[-1] - matrices[0])
    print(f'Figure | {"와 ".join(pair_name_list)}의 {kor_prop_name} 데이터 쌍 비율')

    plt.tight_layout()
    plt.show()

def draw_instance_ratio_heatmap(prop_name, pair_list):
    """Draw a heatmap of the value distribution of ``prop_name``.

    Rows are ``x_test`` followed by the left and right side of every pair
    frame (``{MODEL}-{pair_name}-{side}``); columns are the property values.
    The mean and standard deviation of every pair-frame row and a caption
    line are printed.

    Args:
        prop_name: Key of ``prop_name_dict`` and column name in ``x_test``.
        pair_list: Pair frames as returned by ``get_df``.
    """
    kor_prop_name = prop_name_dict[prop_name]

    # Korean font setup
    font_name = font_manager.FontProperties(
        fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf"
    ).get_name()
    plt.rc("font", family=font_name)

    # Reference distribution over the whole test set.
    baseline = x_test.groupby(prop_name).count().label / len(x_test)
    baseline.name = "x_test"
    rows = [baseline]

    pair_name_list = []
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name)
        task = sub_task.model_type.split('-')[0].upper()
        for side in ("left", "right"):
            ratio = lr.groupby(f"{prop_name}_{side}").count().total / len(lr)
            ratio.name = f"{task}-{sub_task.pair_name}-{side}"
            rows.append(ratio)
            print(f"평균 {ratio.mean():.4f}, 표준편차 {ratio.std():.4f}")
        pair_name_list.append(sub_task.pair_name)
    print(f'Figure | {"와 ".join(pair_name_list)}의 {kor_prop_name} 비율 히트맵')

    out = pd.DataFrame(rows, index=[row.name for row in rows])

    # Draw with seaborn
    plt.figure(figsize=(10, 5))
    sns.heatmap(
        out,
        annot=True,
        cmap="YlGnBu",
        fmt=".3g",
        linewidths=0.5,
        cbar_kws={"shrink": 0.5},
    )

    # Setting labels and title
    title = f"[{','.join(pair_name_list)}] {kor_prop_name} 비율 히트맵"
    plt.title(title, fontsize=16)
    plt.xlabel(f"{kor_prop_name}", fontsize=14)
    plt.ylabel("데이터셋", fontsize=14)
    plt.tick_params(axis="y", rotation=0)

    plt.show()

#### Gender

In [None]:
# Gender distribution heatmap: x_test and both sides of every pair frame.
prop_name = "gender"
draw_instance_ratio_heatmap(prop_name, pair_list)

In [None]:
# Share of each (left, right) gender combination, one heatmap per pair frame.
prop_name = "gender"
draw_pair_ratio_heatmap(prop_name, pair_list)

#### age_group

In [None]:
# Age-group distribution heatmap: x_test and both sides of every pair frame.
prop_name = "age_group"
draw_instance_ratio_heatmap(prop_name, pair_list)

In [None]:
# Share of each (left, right) age-group combination, one heatmap per pair frame.
prop_name = "age_group"
draw_pair_ratio_heatmap(prop_name, pair_list)

#### Category

In [None]:
# Image-category distribution heatmap: x_test and both sides of every pair frame.
prop_name = "category"
draw_instance_ratio_heatmap(prop_name, pair_list)

In [None]:
# Share of each (left, right) category combination, one heatmap per pair frame.
prop_name = "category"
draw_pair_ratio_heatmap(prop_name, pair_list)

### KDE

#### Age diff

In [None]:
def draw_age_diff_kde(pair_list):
    """Plot the distribution of absolute age differences per pair frame.

    The left panel shows one KDE curve per frame (``cut=0``), the right panel
    the cumulative KDE (``cut=3``). Curves are labelled with the frame's
    ``pair_name``. A caption line is printed before the figure is shown.

    Args:
        pair_list: Pair frames with ``age_left`` / ``age_right`` columns and a
            ``pair_name`` attribute.
    """
    # Korean font setup
    font_name = font_manager.FontProperties(fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf").get_name()
    plt.rc("font", family=font_name)

    sns.set(style="whitegrid")  # setting seaborn style
    plt.figure(figsize=(20, 6))  # specifying figure size

    # Both panels plot the same series, so compute them once.
    pair_name_list = [sub_task.pair_name for sub_task in pair_list]
    age_gaps = [abs(sub_task.age_left - sub_task.age_right) for sub_task in pair_list]

    # Left panel: KDE
    plt.subplot(1, 2, 1)
    for name, gap in zip(pair_name_list, age_gaps):
        sns.kdeplot(gap, label=name, fill=True, alpha=0.1, cut=0)

    plt.xlabel("나이 그룹 평균 나이 차이(절대값)", fontsize=14, fontproperties=fontprop)
    plt.ylabel("밀도", fontsize=14, fontproperties=fontprop)
    plt.xlim(0)
    plt.title(f"[{','.join(pair_name_list)}] 나이 그룹 평균 나이 차이에 대한 커널 밀도 추정", fontsize=16, fontproperties=fontprop)
    plt.legend(title="pair_name", title_fontsize="13", fontsize=12)

    # Right panel: cumulative KDE
    plt.subplot(1, 2, 2)
    for name, gap in zip(pair_name_list, age_gaps):
        sns.kdeplot(gap, cumulative=True, label=name, fill=True, alpha=0.1, cut=3)

    plt.xlabel("나이 또는 나이 그룹의 평균 나이 차이(절대값)", fontsize=14, fontproperties=fontprop)
    plt.ylabel("누적 밀도", fontsize=14, fontproperties=fontprop)
    plt.xlim(0)
    plt.title(f"[{','.join(pair_name_list)}] 나이 또는 나이 그룹 평균 나이 차이에 대한 누적 커널 밀도 추정", fontsize=16, fontproperties=fontprop)
    plt.legend(title="pair_name", title_fontsize="13", fontsize=12)

    print(f"Figure | {'와 '.join(pair_name_list)}의 나이 또는 나이 그룹 평균 나이 차이에 대한 누적 커널 밀도 추정")

    # Removing top and right borders
    sns.despine()

    plt.tight_layout()  # for better layout
    plt.show()

In [None]:
# KDE and cumulative KDE of |age_left - age_right| for every frame in pair_list.
draw_age_diff_kde(pair_list)

## Metric

### Accuracy table

In [None]:
def draw_accuracy_table(prop_name, pair_list):
    """Tabulate mean ``correct`` and mean ``distance`` per value pair.

    For every frame in ``pair_list`` the rows are grouped by ``total`` (the
    sorted left/right value pair produced by ``get_lr``). An extra ``"total"``
    row per frame holds the unweighted mean over that frame's groups.

    Frames are labelled ``{MODEL}-{pair_name}`` in the ``pair_name`` index
    level, so the same pair set evaluated with several models stays on
    separate rows instead of being merged by the final group-by.

    Args:
        prop_name: Key of ``prop_name_dict`` and property prefix in the frames.
        pair_list: Pair frames as returned by ``get_df``.

    Returns:
        DataFrame indexed by (``pair_name``, ``total``) with the columns
        ``correct`` and ``distance``. A caption line is printed as a side effect.
    """
    kor_prop_name = prop_name_dict[prop_name]
    pair_name_list = []
    dfs = []  # one aggregated frame per entry of pair_list
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name)
        out = lr.groupby("total")[["correct", "distance"]].mean()
        # Include the model prefix: pair_name alone is not unique when the
        # same pair set is compared across models.
        task = sub_task.model_type.split("-")[0].upper()
        out["pair_name"] = f"{task}-{sub_task.pair_name}"
        dfs.append(out)
        pair_name_list.append(sub_task.pair_name)
    print(f'Table | {"와 ".join(pair_name_list)}의 {kor_prop_name} 정확도')

    # Stack the per-frame tables.
    final_df = pd.concat(dfs)

    # Summary row per frame: mean over that frame's groups.
    temp_total = final_df.groupby("pair_name").mean()
    temp_total["total"] = "total"

    # Combine group rows and summary rows under a (pair_name, total) index.
    # Every key is unique as long as each frame appears once, so sum() only
    # reshapes.
    return pd.concat([final_df.reset_index(), temp_total.reset_index()]).groupby(
        ["pair_name", "total"]
    ).sum()

#### Gender

In [None]:
prop_name = "gender"

# Accuracy table for gender, then a one-line summary of the "total" accuracy
# of every pair_name group in the table.
out = draw_accuracy_table(prop_name, pair_list)
pairs_list, accs_list = [], []
for index, row in out.groupby("pair_name"):
    pairs_list.append(index)
    # The ("<pair_name>", "total") row holds the summary accuracy of the group.
    accs_list.append(row.loc[(index, "total")].correct)
for _p, _a in zip(pairs_list, accs_list):
    print(f"{_p}({_a:.4f})", end=" ")
# print(f"보다 검증 정확도가 {np.subtract(*accs_list):.4f} 낮게 나타났다.")

out

#### Age_group

In [None]:
# The ten (pair_name, age-group pair) rows with the lowest accuracy.
prop_name = "age_group"
draw_accuracy_table(prop_name, pair_list).sort_values('correct').iloc[:10]

#### Category

In [None]:
# Accuracy and mean distance per image-category pair.
prop_name = "category"
draw_accuracy_table(prop_name, pair_list)

### Heatmap

#### Gender

In [None]:
def draw_metric_heatmap(prop_name, pair_list):
    """Draw one accuracy heatmap per pair frame.

    Each cell is the mean of ``correct`` over the rows whose (left, right)
    values of ``prop_name`` form that combination after row-wise sorting
    (``get_lr(..., is_sort=True)``). All heatmaps share one colour range.

    Besides the figure, this displays the difference between consecutive
    matrices and between the last and the first matrix, and prints a caption
    line.

    Args:
        prop_name: Key of ``prop_name_dict`` and property prefix in the frames.
        pair_list: Non-empty list of pair frames as returned by ``get_df``.

    Raises:
        ValueError: If ``pair_list`` is empty.
    """
    if not pair_list:
        raise ValueError("pair_list must contain at least one pair frame")

    kor_prop_name = prop_name_dict[prop_name]
    left_col, right_col = f"{prop_name}_left", f"{prop_name}_right"

    # Korean font setup
    font_name = font_manager.FontProperties(fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf").get_name()
    plt.rc("font", family=font_name)

    # Build each accuracy matrix once; it feeds both the shared colour range
    # and the plot.
    matrices = []
    for sub_task in pair_list:
        lr = get_lr(sub_task, prop_name, True)
        # Aggregate only `correct`: sum/count of that column is its mean, and
        # the list-valued helper columns stay out of the aggregation.
        accuracy = lr.groupby([left_col, right_col])["correct"].mean()
        # Keyword arguments: DataFrame.pivot no longer accepts them
        # positionally in pandas 2.x.
        matrices.append(
            accuracy.reset_index().pivot(
                index=left_col, columns=right_col, values="correct"
            )
        )

    # Compute the global min and max to use for all heatmaps
    vmin = min(matrix.min().min() for matrix in matrices)
    vmax = max(matrix.max().max() for matrix in matrices)

    # squeeze=False keeps a 2-D axes array even when there is a single pair.
    fig, axs = plt.subplots(
        nrows=1,
        ncols=len(pair_list),
        figsize=(10 * len(pair_list), 8),
        squeeze=False,
    )

    pair_name_list = []
    previous = None
    for ax, sub_task, heatmap_data in zip(axs[0], pair_list, matrices):
        if previous is not None:
            display(previous - heatmap_data)
        previous = heatmap_data

        task = sub_task.model_type.split('-')[0].upper()

        # Draw the heatmap
        sns.heatmap(heatmap_data, cmap="coolwarm", annot=True, fmt=".2f", ax=ax, vmin=vmin, vmax=vmax)
        # Axis labels and title
        ax.set_title(f"[{task} | {sub_task.pair_name}] {kor_prop_name}간 검증 정확도", fontsize=14)
        ax.set_xlabel(f"오른쪽 {kor_prop_name}", fontsize=12)
        ax.set_ylabel(f"왼쪽 {kor_prop_name}", fontsize=12)

        pair_name_list.append(f"{task}-{sub_task.pair_name}")

    display(matrices[-1] - matrices[0])
    print(f'Figure | {"와 ".join(pair_name_list)}의 {kor_prop_name} 검증 정확도')

    plt.tight_layout()
    plt.show()

In [None]:
# Accuracy per (left, right) gender combination, one heatmap per pair frame.
prop_name = "gender"
draw_metric_heatmap(prop_name, pair_list)

#### Age group

In [None]:
# Accuracy per (left, right) age-group combination, one heatmap per pair frame.
prop_name = "age_group"
draw_metric_heatmap(prop_name, pair_list)

#### Category

In [None]:
# Accuracy per (left, right) category combination, one heatmap per pair frame.
prop_name = "category"
draw_metric_heatmap(prop_name, pair_list)

In [None]:
from matplotlib import font_manager
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def draw_metric_kde(prop_name, pair_list):
    """Plot, per pair frame, the KDE of ``distance - best_distance`` by value pair.

    One subplot is drawn per frame; inside it there is one curve per distinct
    ``total`` value (the sorted left/right pair produced by ``get_lr``). Every
    subplot uses the x-range [-15, 15]. A caption naming all pair sets is
    printed before the figure is shown.

    Args:
        prop_name: Key of ``prop_name_dict`` and property prefix in the frames.
        pair_list: Pair frames as returned by ``get_df``.
    """
    kor_prop_name = prop_name_dict[prop_name]

    # Korean font setup. The family is passed through sns.set() because that
    # call rewrites font.family and would undo an earlier plt.rc("font", ...).
    font_name = font_manager.FontProperties(fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf").get_name()
    sns.set(style="whitegrid", font=font_name)  # setting seaborn style
    plt.figure(figsize=(20, 6))  # specifying figure size

    # Collected across the whole loop so the caption lists every pair set.
    pair_name_list = []
    for index, sub_task in enumerate(pair_list):
        plt.subplot(1, len(pair_list), index + 1)
        lr = get_lr(sub_task, prop_name)
        pair_name_list.append(sub_task.pair_name)
        for tag in lr.total.unique():
            sub_frame = lr[lr.total == tag].distance - lr.best_distance
            sns.kdeplot(sub_frame, label=tag, fill=True, alpha=0.1)

        task = sub_task.model_type.rsplit('-')[0].upper()
        plt.xlabel("거리 차이", fontsize=14, fontproperties=fontprop)
        plt.ylabel("밀도", fontsize=14, fontproperties=fontprop)
        plt.title(f"[{task} | {sub_task.pair_name}] {kor_prop_name}에 따른 거리 차이의 커널 밀도 추정", fontsize=16, fontproperties=fontprop)
        plt.legend(title=prop_name.capitalize(), title_fontsize="13", fontsize=12)
        # Set inside the loop: plt.xlim only affects the current subplot.
        plt.xlim(-15, 15)

    print(f'Figure | {"와 ".join(pair_name_list)}의 {kor_prop_name}에 따른 거리 차이의 커널 밀도 추정')

    # Removing top and right borders
    sns.despine()

    plt.tight_layout()  # for better layout
    plt.show()


### KDE

#### Gender

In [None]:
# Distribution of distance - best_distance per gender pair, one subplot per frame.
prop_name = "gender"
draw_metric_kde(prop_name, pair_list)

#### Category

In [None]:
# Distribution of distance - best_distance per category pair, one subplot per frame.
prop_name = "category"
draw_metric_kde(prop_name, pair_list)

#### Age_group

In [None]:
def draw_metric_single_kde(prop_name, pair_list):
    """Overlay one KDE curve of ``distance - best_distance`` per pair frame.

    For every frame only the first value returned by ``lr.total.unique()`` is
    plotted; the curve is labelled ``[{MODEL} | {pair_name}] {tag}``.

    Args:
        prop_name: Property prefix handed to ``get_lr``; also used (capitalised)
            as the legend title.
        pair_list: Pair frames as returned by ``get_df``.
    """
    sns.set(style="whitegrid")  # setting seaborn style
    plt.figure(figsize=(10, 6))  # specifying figure size

    pair_name_list = []
    for sub_task in pair_list:
        task = sub_task.model_type.rsplit('-')[0].upper()
        lr = get_lr(sub_task, prop_name)
        pair_name_list.append(f"{task} | {sub_task.pair_name}")

        # Only the first distinct `total` value is drawn for each frame.
        tags = lr.total.unique()
        if len(tags) > 0:
            tag = tags[0]
            sub_frame = lr[lr.total == tag].distance - lr.best_distance
            sns.kdeplot(sub_frame, label=f"[{task} | {sub_task.pair_name}] {tag}", fill=True, alpha=0.1)

    # Setting labels and title
    plt.xlabel("거리 차이", fontsize=14, fontproperties=fontprop)
    plt.ylabel("밀도", fontsize=14, fontproperties=fontprop)
    plt.title(f"[{', '.join(pair_name_list)}] 거리 차이의 커널 밀도 추정", fontsize=16, fontproperties=fontprop)
    plt.legend(title=prop_name.capitalize(), title_fontsize="13", fontsize=12)
    plt.xlim(-15, 15)

    # Removing top and right borders
    sns.despine()

    plt.show()


In [None]:
# One distance-difference KDE curve per frame in pair_list, drawn on shared axes.
prop_name = "gender"
draw_metric_single_kde(prop_name, pair_list)

### Scatter

#### Age

In [None]:
from matplotlib.colors import LogNorm

In [None]:
def draw_metric_scatter(prop_name, pair_list):
    """Scatter accuracy against the absolute left/right difference of a property.

    One subplot is drawn per pair frame. Each point is one distinct value of
    ``|{prop_name}_left - {prop_name}_right|``; its height is the mean of
    ``correct`` for that difference and its colour encodes, on a log scale,
    how many pairs have that difference. Titles and labels are worded for
    age, and the notebook calls this with ``prop_name="age"``.

    Args:
        prop_name: Numeric property prefix in the frames.
        pair_list: Non-empty list of pair frames as returned by ``get_df``.
    """
    # Korean font setup
    font_name = font_manager.FontProperties(
        fname="/usr/share/fonts/NanumFont/NanumGothicBold.ttf"
    ).get_name()
    plt.rc("font", family=font_name)

    # squeeze=False keeps a 2-D axes array even when there is a single pair.
    fig, axs = plt.subplots(
        nrows=1,
        ncols=len(pair_list),
        figsize=(10 * len(pair_list), 8),
        squeeze=False,
    )

    # plt.get_cmap is used because matplotlib.cm.get_cmap is not available in
    # recent matplotlib releases.
    cmap = plt.get_cmap("PRGn")

    pair_name_list = []
    for ax, sub_task in zip(axs[0], pair_list):
        task = sub_task.model_type.rsplit('-')[0].upper()
        lr = get_lr(sub_task, prop_name)

        # Accuracy and pair count per absolute difference, kept in ONE frame so
        # that x, y and colour stay row-aligned after sorting.
        age_diff = (lr[f"{prop_name}_left"] - lr[f"{prop_name}_right"]).abs()
        stats = (
            lr.groupby(age_diff.rename("age_diff"))["correct"]
            .agg(["mean", "count"])
            .sort_values(by="mean")
        )

        # Colour scale follows the number of pairs behind each point.
        norm = LogNorm(vmin=stats["count"].min(), vmax=stats["count"].max())

        scatter = ax.scatter(
            stats.index,
            stats["mean"],
            c=stats["count"],
            cmap=cmap,
            norm=norm,
            alpha=0.7,
        )

        # Add the colorbar
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label("데이터 수", rotation=270, labelpad=15, fontsize=10)

        # Axis labels and title
        ax.set_xlabel("나이 또는 나이 그룹의 평균 나이 차이(절대값)", fontsize=12)
        ax.set_ylabel("검증 정확도", fontsize=12)
        ax.set_title(f"[{task} | {sub_task.pair_name}] 나이 차이에 따른 검증 정확도", fontsize=14)
        ax.set_xlim(0)
        ax.set_ylim(0.5)
        pair_name_list.append(sub_task.pair_name)

    print(f'Figure | {"와 ".join(pair_name_list)}의 나이 차이에 따른 검증 정확도')
    plt.tight_layout()
    plt.show()

In [None]:
# Accuracy versus absolute age difference, one scatter plot per frame.
prop_name = "age"
draw_metric_scatter(prop_name, pair_list)