In [42]:
import itertools
import math
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import polars as pl
import scipy.stats as stats
import seaborn as sns
from scipy.stats import mannwhitneyu, ttest_ind
from statsmodels.stats.multitest import multipletests

# Global seaborn theme: white-grid style with fonts scaled up by 2x.
# Alternative kept for reference: style="whitegrid", palette="gray", font_scale=1.4
sns.set_theme(font_scale=2.0, style="whitegrid")

# Home-relative directory, expanded to an absolute path.
# Presumably the figure output folder; it is not referenced in the cells shown here.
# Alternative kept for reference: ~/class/research/master_thesis/figure/sec4
save_dir = Path("~/dev/nextjs/subjective-evaluation-test-2/src_py/figs")
save_dir = save_dir.expanduser()

In [43]:
# Load the three exported tables. The generic "id" columns of the sample and
# respondent tables are renamed to "sample_meta_data_id" / "respondent_id".
df_answer = pl.read_csv("./result/Answers_rows.csv")
df_sample = pl.read_csv("./result/SampleMetaData_rows.csv").rename(
    {"id": "sample_meta_data_id"}
)
df_respondent = (
    pl.read_csv("./result/Respondents_rows.csv")
    .rename({"id": "respondent_id"})
    .with_columns(
        # Japanese labels -> short codes: 男性 = male, 女性 = female, 無回答 = no answer.
        pl.col("sex").replace({"男性": "M", "女性": "F", "無回答": "N"}),
        # イヤホン = earphones, ヘッドホン = headphones.
        pl.col("audio_device").replace(
            {"イヤホン": "Earphone", "ヘッドホン": "Headphone"}
        ),
    )
)

# Relabel the numeric model ids with the condition labels shown on the plots.
#
# The column is cast to String before replacing so the mapping is string -> string.
# polars >= 1.0 makes `replace` preserve the input dtype, so mapping an integer
# column straight to string labels raises there, while earlier releases promoted
# the column to String. Casting first gives the same labels on both.
# Ids that are not listed in the mapping are kept as their string form.
# NOTE(review): ids 7 and 8 map to "(3-c)" and "(3-b)" (not in label order) — confirm.
df_sample = df_sample.with_columns(
    pl.col("model_id")
    .cast(pl.String)
    .replace(
        {
            "0": "(4)",  # analysis-synthesis
            "1": "(5)",  # original speech
            "2": "(1)",  # baseline
            "3": "(2-a)",  # without HuBERT: mel + continuous
            "4": "(2-b)",  # without HuBERT: mel + discrete
            "5": "(2-c)",  # without HuBERT: mel + continuous + discrete
            "6": "(3-a)",  # with HuBERT: mel + continuous
            "7": "(3-c)",  # with HuBERT: mel + continuous + discrete
            "8": "(3-b)",  # with HuBERT: mel + discrete
        }
    )
)

# Attach sample metadata and respondent attributes to every answer. Left joins
# keep answers that have no match on the right-hand side. Afterwards keep only
# rows that are neither dummy nor invalid and whose page is "eval_1".
df_answer = (
    df_answer.join(df_sample, on="sample_meta_data_id", how="left", coalesce=True)
    .join(df_respondent, on="respondent_id", how="left", coalesce=True)
    .filter(
        ~pl.col("is_dummy")
        & ~pl.col("is_invalid")
        & (pl.col("sample_page_name") == "eval_1")
    )
)

In [44]:
# Name of the rating column whose per-model distribution is tabulated below.
colname = "intelligibility_id"

# Full (model, score) grid for scores 1..5, built with a cross join so that
# combinations nobody selected still appear (with a count of 0) and so the
# model_id dtype is taken directly from df_answer, even when it is empty.
score_levels = pl.DataFrame({colname: list(range(1, 6))})
model_score_grid = df_answer.select(pl.col("model_id").unique()).join(
    score_levels, how="cross"
)

# Number of answers per (model, score) and per model.
score_counts = df_answer.group_by(["model_id", colname]).len()
model_totals = df_answer.group_by("model_id").len().rename({"len": "total"})

df_answer_int_len = (
    model_score_grid.join(
        score_counts, on=["model_id", colname], how="left", coalesce=True
    )
    # Grid cells without any answer have no count after the left join.
    .with_columns(pl.col("len").fill_null(0))
    .join(model_totals, on="model_id", how="left", coalesce=True)
    # Share of each score within its model, in percent.
    .with_columns((pl.col("len") / pl.col("total") * 100).alias("pct"))
    # Sort while the score is still numeric, then stringify it so that the
    # plotting code treats it as a discrete category.
    .sort(["model_id", colname])
    .cast({colname: pl.String})
)

# Bar chart of the rating distribution: one bar per model id, coloured by score.
fig = px.bar(
    df_answer_int_len,
    x="model_id",
    y="pct",
    color="intelligibility_id",
    labels={
        "model_id": "ID",
        "pct": "評価割合 [%]",  # "share of ratings [%]"
        "intelligibility_id": "明瞭性",  # "intelligibility"
    },
    width=800,
    height=400,
)
# Larger font and fully transparent plot/paper backgrounds, set in a single call.
fig.update_layout(
    font={"size": 20},
    plot_bgcolor="rgba(0, 0, 0, 0)",
    paper_bgcolor="rgba(0, 0, 0, 0)",
)

fig.show()

# The export fails when the target folder is missing, so create it first
# (a no-op when it already exists).
out_file = "./figs_2/test.png"
Path(out_file).parent.mkdir(parents=True, exist_ok=True)
fig.write_image(out_file)