In [None]:
import os
import glob
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib.backends.backend_pdf import PdfPages

## VAD Summary

In [None]:
# Load the VAD summary and add, per row, the fraction of `audio_len` covered
# by `vad` and by `osd` (plain ratios, not multiplied by 100). `sid` is cast
# to str.
vad_df = pd.read_csv("../summary_vad.csv").assign(
    vad_percent=lambda d: d["vad"] / d["audio_len"],
    osd_percent=lambda d: d["osd"] / d["audio_len"],
    sid=lambda d: d["sid"].astype(str),
)

In [None]:
# Set the seaborn style BEFORE creating the figure: set_style only changes the
# defaults picked up by axes created afterwards, and this is the first
# plotting cell of the notebook.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(20, 10))
# size/sizes pin every marker to the same size; hue colours points by `sid`.
sns.scatterplot(
    data=vad_df,
    x="vad_percent",
    y="osd_percent",
    size="sid",
    sizes=(70, 70),
    hue="sid",
    palette="husl",
    ax=ax,
)
fig.savefig("../vad_vs_osd.png")

In [None]:
# Load ../all_vad.csv, cast `sid` to str, then split the rows by their
# `speaker` label: "OVERLAP" rows go to osd_all_df, "SPEECH" rows stay in
# vad_all_df (which is rebound to the filtered frame).
vad_all_df = pd.read_csv("../all_vad.csv")
vad_all_df["sid"] = vad_all_df["sid"].astype(str)
is_overlap = vad_all_df["speaker"] == "OVERLAP"
is_speech = vad_all_df["speaker"] == "SPEECH"
osd_all_df = vad_all_df[is_overlap]
vad_all_df = vad_all_df[is_speech]

In [None]:
# Log-log histogram of `duration` for the OVERLAP rows of sid "798".
fig, ax = plt.subplots(figsize=(20, 10))
sns.set_style("whitegrid")
# Both axes are switched to log scale before the histogram is drawn.
ax.set_yscale("log")
ax.set_xscale("log")
osd_798 = osd_all_df[osd_all_df["sid"] == "798"]
sns.histplot(data=osd_798, x="duration", hue="sid", palette="husl", bins=100, ax=ax)
fig.savefig("../798_osd.png")

## Diarization

In [None]:
# Diarization summary; `sid` is cast to str.
dia_df = pd.read_csv("../summary_dia.csv").astype({"sid": str})
dia_df

In [None]:
# `utt_num` against `audio_len`, one colour per sid.
fig, ax = plt.subplots(figsize=(20, 10))
sns.set_style("whitegrid")
# Identical min/max in `sizes` gives every point the same marker size.
point_style = dict(size="sid", sizes=(70, 70), hue="sid", palette="husl")
sns.scatterplot(data=dia_df, x="audio_len", y="utt_num", ax=ax, **point_style)
fig.savefig("../speaker_utt_num.png")

In [None]:
# `speaker_len` against `audio_len`, one colour per sid.
fig, ax = plt.subplots(figsize=(20, 10))
sns.set_style("whitegrid")
# Identical min/max in `sizes` gives every point the same marker size.
point_style = dict(size="sid", sizes=(70, 70), hue="sid", palette="husl")
sns.scatterplot(data=dia_df, x="audio_len", y="speaker_len", ax=ax, **point_style)
fig.savefig("../speaker_utt_len.png")

In [None]:
# Histogram of `speaker_len` per sid on a log-scaled x axis.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_xscale("log")
sns.set_style("whitegrid")
sns.histplot(data=dia_df, x="speaker_len", hue="sid", palette="husl", bins=100, ax=ax)
fig.savefig("../utt_len_hist.png")

In [None]:
# Histogram of `utt_num` per sid on a log-scaled x axis.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_xscale("log")
sns.set_style("whitegrid")
sns.histplot(data=dia_df, x="utt_num", hue="sid", palette="husl", bins=100, ax=ax)
fig.savefig("../utt_num_hist.png")

## Speaker Embeddings

In [None]:
# Speaker summary table. Its `emb` column arrives as text and is converted to
# lists of floats by process_emb below.
speaker_df = pd.read_csv("../summary_speaker.csv")

def process_emb(embs):
    """Parse stringified embedding vectors into lists of floats.

    Each element of ``embs`` is one vector written as text, for example
    ``"[0.1, -0.2]"`` or ``"[ 0.1 -0.2]"``. Square brackets and commas are
    removed from every whitespace-separated token and the remainder is
    converted with ``float``.

    Args:
        embs: Iterable of strings, one per embedding.

    Returns:
        list[list[float]]: One list of floats per input string, in input order.

    Raises:
        ValueError: If a token is still not a valid float after brackets and
            commas have been removed.
    """
    strip_delims = str.maketrans("", "", "[],")
    parsed = []
    for emb in embs:
        tokens = (tok.translate(strip_delims) for tok in emb.split())
        # A bracket separated from its number by whitespace ("[ 0.1 ...")
        # becomes an empty token; skip it instead of failing in float("").
        parsed.append([float(tok) for tok in tokens if tok])
    return parsed

# Replace the text embeddings with parsed float lists.
speaker_df["emb"] = process_emb(speaker_df["emb"].tolist())

# One frame per speaker id. Explicit copies are taken because speaker_798 is
# modified in place further down (reset_index / new t-SNE columns); doing that
# on a plain slice of speaker_df triggers SettingWithCopyWarning.
speaker_625 = speaker_df[speaker_df["sid"] == 625].copy()
speaker_676 = speaker_df[speaker_df["sid"] == 676].copy()
speaker_798 = speaker_df[speaker_df["sid"] == 798].copy()
speaker_7170 = speaker_df[speaker_df["sid"] == 7170].copy()

In [None]:
from sklearn.manifold import TSNE
def do_tsne(df, col, perplexity=50, random_state=329):
    """Project the vectors stored in ``df[col]`` to two dimensions with t-SNE.

    Args:
        df: DataFrame whose ``col`` column holds one equal-length vector per row.
        col: Name of the column containing the vectors.
        perplexity: t-SNE perplexity. Defaults to 50, the value that used to be
            hard-coded; scikit-learn expects it to be smaller than the number
            of rows.
        random_state: Seed passed to TSNE. Defaults to 329, the previously
            hard-coded value.

    Returns:
        pandas.DataFrame: Two columns (``0`` and ``1``) with the projected
        coordinates. The index is a fresh 0..n-1 range, so callers must reset
        ``df``'s index before assigning the columns back.
    """
    print(f"Doing t-SNE on {col}")
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=random_state)
    # Stack the per-row vectors into one (n_rows, dim) matrix.
    embs = pd.DataFrame(np.vstack(df[col]))
    projections = pd.DataFrame(tsne.fit_transform(embs))
    return projections

# plot_tsne is called further down on all four speaker frames and reads
# tsne_x / tsne_y from each, so every frame needs a projection (previously
# only speaker_798 got one and the other calls failed on a fresh kernel).
# 798 is processed last so `tsne_speaker` ends up holding its projection, as
# before.
# NOTE(review): do_tsne uses perplexity 50, so each speaker presumably needs
# more than 50 rows - confirm for 625 / 676 / 7170.
for sid, frame in (
    (625, speaker_625),
    (676, speaker_676),
    (7170, speaker_7170),
    (798, speaker_798),
):
    tsne_speaker = do_tsne(frame, "emb")
    # do_tsne returns a 0..n-1 index; align the frame to it before assigning.
    frame.reset_index(drop=True, inplace=True)
    frame["tsne_x"] = tsne_speaker[0]
    frame["tsne_y"] = tsne_speaker[1]
    frame.to_csv(f"../speaker_{sid}.csv", index=False)

In [None]:
def plot_tsne(df, x, y, freq, color, title):
    """Scatter-plot a 2-D projection coloured by group and save it as SVG.

    Args:
        df: DataFrame containing the columns named by ``x``, ``y`` and ``color``.
            It is not modified.
        x: Name of the column with the horizontal coordinate.
        y: Name of the column with the vertical coordinate.
        freq: Minimum number of rows a ``color`` group must have to be plotted.
        color: Name of the column whose values define the colour groups.
        title: Used both as the plot title and as the output path stem; the
            figure is written to ``f"{title}.svg"``.

    Returns:
        None. The figure is saved to disk and closed.

    Note:
        Only the first ``n_groups`` of the 64 built-in colours are passed as
        palette. NOTE(review): with more than 64 groups the palette is shorter
        than the number of hue levels, which seaborn is expected to reject.
    """
    colors_distinct = [
        "#000000", "#00FF00", "#0000FF", "#FF0000",
        "#01FFFE", "#FFA6FE", "#774D00", "#006401",
        "#010067", "#95003A", "#007DB5", "#FF00F6",
        "#FFEEE8", "#FFDB66", "#90FB92", "#0076FF",
        "#D5FF00", "#FF937E", "#6A826C", "#FF029D",
        "#FE8900", "#7A4782", "#7E2DD2", "#85A900",
        "#FF0056", "#A42400", "#00AE7E", "#683D3B",
        "#BDC6FF", "#263400", "#BDD393", "#00B917",
        "#9E008E", "#001544", "#C28C9F", "#FF74A3",
        "#01D0FF", "#004754", "#E56FFE", "#788231",
        "#0E4CA1", "#91D0CB", "#BE9970", "#968AE8",
        "#BB8800", "#43002C", "#DEFF74", "#00FFC6",
        "#FFE502", "#620E00", "#008F9C", "#98FF52",
        "#7544B1", "#B500FF", "#00FF78", "#FF6E41",
        "#005F39", "#6B6882", "#5FAD4E", "#A75740",
        "#A5FFD2", "#FFB167", "#009BFF", "#E85EBE",
    ]
    # Keep only groups with at least `freq` rows. The explicit copy makes the
    # column assignments below act on a frame this function owns.
    df2 = df.groupby(df[color]).filter(lambda grp: len(grp) >= freq).copy()
    # Sort by group size, largest first; this fixes the order in which rows
    # are handed to the scatter plot.
    df2["freq"] = df2.groupby(df2[color])[color].transform("count")
    df2 = df2.sort_values("freq", ascending=False)
    # Constant style column + a single-entry marker list: every point is drawn
    # with the "o" marker.
    df2["marker"] = 1

    # Draw on a dedicated figure instead of whatever figure happens to be current.
    fig, ax = plt.subplots()
    sns.scatterplot(
        data=df2,
        x=df2[x],
        y=df2[y],
        hue=df2[color],
        palette=colors_distinct[0 : len(df2[color].unique())],
        linewidth=0,
        style=df2["marker"],
        s=10,
        markers=["o"],
        ax=ax,
    )
    ax.set_title(f"{title}")
    fig.savefig(f"{title}.svg")
    plt.close(fig)
    return

# One t-SNE figure per speaker, coloured by conversation index; freq=1 keeps
# every conversation. Output goes to ../results/<sid>_tsne.svg.
tsne_inputs = {
    625: speaker_625,
    676: speaker_676,
    7170: speaker_7170,
    798: speaker_798,
}
for speaker_id, speaker_frame in tsne_inputs.items():
    plot_tsne(speaker_frame, "tsne_x", "tsne_y", 1, "conv_idx", f"../results/{speaker_id}_tsne")

In [None]:
# Rows of speaker 625 for five individual conversations (conv_idx 2, 5, 6, 10, 51).
conv_of_625 = speaker_625["conv_idx"]
speaker_625_2 = speaker_625[conv_of_625 == 2]
speaker_625_5 = speaker_625[conv_of_625 == 5]
speaker_625_6 = speaker_625[conv_of_625 == 6]
speaker_625_10 = speaker_625[conv_of_625 == 10]
speaker_625_51 = speaker_625[conv_of_625 == 51]

# Embeddings of each conversation as plain lists (one vector per row).
emb2 = speaker_625_2["emb"].tolist()
emb5 = speaker_625_5["emb"].tolist()
emb6 = speaker_625_6["emb"].tolist()
emb10 = speaker_625_10["emb"].tolist()
emb51 = speaker_625_51["emb"].tolist()

In [None]:
cmap = "viridis_r"


def _save_similarity_heatmap(embs_a, embs_b, name):
    """Draw the cosine-similarity matrix of two embedding lists on its own
    figure and save it as ``{name}.png`` in the working directory."""
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.heatmap(cosine_similarity(embs_a, embs_b), cmap=cmap, ax=ax)
    fig.savefig(f"{name}.png")


conv_embs = {2: emb2, 5: emb5, 6: emb6, 10: emb10}
# Every pair gets a fresh figure and a file name derived from the pair itself.
# Previously the conv10 self-similarity was written to "conv5 vs conv5.png"
# (overwriting that file) and conv2-vs-conv6 was drawn on top of the conv10
# figure because no new figure was created for it.
for conv_a, conv_b in [(2, 2), (5, 5), (6, 6), (10, 10), (2, 6), (5, 10), (2, 10)]:
    _save_similarity_heatmap(
        conv_embs[conv_a], conv_embs[conv_b], f"conv{conv_a} vs conv{conv_b}"
    )


## Data Loading

In [None]:
sids = [625,676,7170,798]
model = "large-v2-x"

def load_results(sids, model, data_dir):
    """Load the per-speaker result CSVs of one run into a single DataFrame.

    For every id in ``sids`` the file ``<data_dir>/<sid>-<model>.csv`` is read
    and tagged with a ``sid`` column holding that id.

    Args:
        sids: Iterable of speaker ids.
        model: Model name used in the file names.
        data_dir: Directory containing the CSV files.

    Returns:
        pandas.DataFrame: All files stacked in ``sids`` order. Row labels are
        kept from the individual files, so the index is not unique. An empty
        DataFrame is returned when ``sids`` is empty.

    Raises:
        FileNotFoundError: If one of the expected CSV files does not exist.
    """
    frames = []
    for sid in sids:
        sid_df = pd.read_csv(os.path.join(data_dir, f"{sid}-{model}.csv"))
        sid_df["sid"] = sid
        frames.append(sid_df)
    # pd.concat rejects an empty list, so keep the "no ids" case explicit.
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames)

# Results of two preprocessing runs, loaded for the same speakers and model.
data_dir = "../data/preprocessing/20231219-second/"
df1 = load_results(sids, model, data_dir)
data_dir = "../data/preprocessing/20240115-third"
df2 = load_results(sids, model, data_dir)
# Later cells refer to this frame both as `df2` and as `df`. Only `df` used to
# be defined, so the `df2` cells failed on a fresh kernel. Keep both names
# bound to the same frame.
# NOTE(review): assumes `df2` was always meant to be the third-run results - confirm.
df = df2

In [None]:
# For every speaker: report how many rows of each run have WER <= 2, then plot
# WER against `score_75` for that speaker's rows and save it as <sid>.jpeg.
sns.set_style('whitegrid')
for sid in sids:
    df3 = df1[(df1["sid"] == sid) & (df1["wer"] <= 2)]
    # `df` holds the third-run results; the name `df2` used here before is
    # never defined at notebook level.
    df4 = df[(df["sid"] == sid) & (df["wer"] <= 2)]
    print(len(df3), len(df4))

    fig, ax = plt.subplots(1, 1)
    # Plot this speaker's rows only. The whole frame used to be plotted in
    # every iteration, so all per-speaker files showed the same points. The
    # WER <= 2 filter matches the y-limit set below.
    ax.scatter(df4["score_75"], df4["wer"], s=5)
    ax.set_ylim(0, 2)
    ax.set_xlim(0.5, 1)
    ax.set_xlabel("Accuracy_75")
    ax.set_ylabel("WER")
    fig.savefig(f"{sid}.jpeg")

In [None]:
# Four stacked panels: per-sid distributions of gt_word_num, pr_word_num,
# gt_speaker and pr_speaker, taken from df1.
fig, axes = plt.subplots(4, 1, figsize=(20, 30))
ax_gt_words, ax_pr_words, ax_gt_speaker, ax_pr_speaker = axes

sns.boxplot(x="sid", y="gt_word_num", data=df1, ax=ax_gt_words)
sns.boxplot(x="sid", y="pr_word_num", data=df1, ax=ax_pr_words)
sns.boxplot(x="sid", y="gt_speaker", data=df1, ax=ax_gt_speaker)
sns.boxplot(x="sid", y="pr_speaker", data=df1, ax=ax_pr_speaker)


In [None]:
# Preview the results loaded into `df` in the data-loading cell. The name
# `df2` that was displayed here is never defined at notebook level.
# NOTE(review): assumes `df` is the frame that was meant - confirm.
df.head()

## Compare with Human Transcript

For the Whisper paper.

In [None]:
# Table used for the paper comparison; the cells below read its `hu_wer` and
# `wx_wer` columns.
paper_results = "../data/preprocessing/paper_results.csv"
paper_df = pd.read_csv(paper_results)

In [None]:
# Summary statistics of the two WER columns, printed one after the other.
for wer_column in ("hu_wer", "wx_wer"):
    print(paper_df[wer_column].describe())

In [None]:
# Overlaid histograms of hu_wer and wx_wer on one axes.
plt.close()
fig, ax = plt.subplots(1, 1)
sns.set_style('whitegrid')
# Both histograms use the same rows: those whose hu_wer is at most 2.
wer_subset = paper_df[paper_df["hu_wer"] <= 2]
for wer_column in ("hu_wer", "wx_wer"):
    sns.histplot(wer_subset, x=wer_column, ax=ax)
# A third column, "huwx_wer", is deliberately not plotted.
plt.show()

## Select Chunk Pilot

In [None]:
import whisper
import scipy.io.wavfile as wavfile

# Pilot selection: among chunks with at least 800 ground-truth words, take for
# every sid the row with the lowest WER (rows are sorted by ascending `wer`,
# then the first row per sid is kept).
enough_words = df["gt_word_num"] >= 800
df_chunk = df[enough_words].sort_values(by=["wer"], ascending=True)
selected = df_chunk.groupby(df_chunk.sid).first().reset_index()

## Select Chunk Mturk

In [None]:
# df_chunk = df[df.chunk == "(0.0, 300.0]"]
# df_chunk = df_chunk[(df_chunk.gt_word_num >= 207) & (df_chunk.gt_word_num <= 622)]
# df_chunk = df_chunk[df_chunk.sid != 798]
# df_chunk = df_chunk.groupby("sid").apply(lambda x: x.sample(n=6)).reset_index(drop = True)
# df_chunk.wer.describe()

In [None]:
# For every speaker, sample 15 conversations and look up where "Speaker1"
# first starts an utterance in the conversation's transcript. That start
# becomes the left edge of the conversation's `chunk` label "(start,3000]".
# Conversations without a usable start get the sentinel start -1.
sid_frames = []
for sid in [625,676,7170,798]:
    # One row per conversation (the first one found for each).
    sid_df = df[df.sid == sid].groupby("conversation").first().reset_index()
    # Fixed seed so that re-running the notebook selects the same conversations.
    sid_df = sid_df.sample(n=15, random_state=0)
    transcript_dir = f"/scratch/gpfs/kw1166/whisper-transcribe/data/tfs/{sid}/"
    starts = []
    for conversation in sid_df.conversation.tolist():
        trans_file = glob.glob(os.path.join(transcript_dir, f"{conversation}*"))
        if not trans_file:
            raise FileNotFoundError(
                f"No transcript matching {conversation}* in {transcript_dir}"
            )
        conv_df = pd.read_csv(
            trans_file[0],
            sep=" ",
            header=None,
            names=["word", "onset", "offset", "accuracy", "speaker"],
        )
        # NOTE(review): presumably converts onsets to seconds (3000-sample
        # shift, 512 Hz) - confirm the units.
        conv_df["start"] = (conv_df.onset + 3000) / 512
        # True on the first word of each run of consecutive same-speaker words.
        conv_df["utt_start"] = conv_df.speaker.ne(conv_df.speaker.shift())
        print(conversation)
        print(len(conv_df))
        speaker1_starts = conv_df.loc[
            conv_df.utt_start & (conv_df.speaker == "Speaker1"), "start"
        ]
        # Explicit checks replace the former bare `except:`. No Speaker1
        # utterance, or a missing onset, yields the sentinel -1.
        if speaker1_starts.empty or pd.isna(speaker1_starts.iloc[0]):
            starts.append(-1)
        else:
            starts.append(math.floor(speaker1_starts.iloc[0]))
    # Item assignment always creates or overwrites the column; attribute
    # assignment only does so when the column already exists.
    sid_df["chunk"] = [f"({start},3000]" for start in starts]
    sid_frames.append(sid_df)
# Concatenate once instead of growing a frame inside the loop.
df_chunk = pd.concat(sid_frames)

In [None]:
# Drop conversations whose start is the sentinel -1, i.e. labels beginning
# with "(-1,". The pattern is matched literally: the old pattern "(-1)" was
# interpreted as a regex group, so it matched any label containing "-1"
# (for example "(-12,3000]") and made pandas warn about match groups.
df_chunk = df_chunk[~df_chunk.chunk.str.contains("(-1,", regex=False)]

In [None]:
# selected = pd.concat((df_chunk, df_798))
# `selected` is bound to the same frame as df_chunk (no copy is made) and is
# written to ../mturk_chunk2.csv without the index column.
selected = df_chunk
selected.to_csv("../mturk_chunk2.csv",index=False)

In [None]:
def load_audio(filepath):
    """Read a WAV file and report its sampling rate and duration.

    Args:
        filepath: Path of the WAV file to read.

    Returns:
        tuple: ``(sampling_rate, samples)`` exactly as returned by
        ``wavfile.read``.
    """
    sampling_rate, samples = wavfile.read(filepath)
    # Duration in seconds = number of samples / samples per second.
    duration_s = len(samples) / sampling_rate
    print(f"Sampling rate: {sampling_rate}")
    print(f"Audio Length (s): {duration_s}")
    return sampling_rate, samples

In [None]:
# Cut a 30-second clip out of every selected conversation, starting at the
# onset stored in its `chunk` label, and save it as ../<sid>_<conversation>.wav
# at the recording's own sampling rate.
conv_dir = "/projects/HASSON/247/data/conversations-car/"
clip_seconds = 30

conversations = selected.conversation.tolist()
chunks = selected.chunk.tolist()
# Use a separate name here: rebinding `sids` would replace the notebook-wide
# list of speaker ids with one entry per selected row.
chunk_sids = selected.sid.tolist()
for sid, conv, chunk in zip(chunk_sids, conversations, chunks):
    # Labels look like "(onset,offset]" or "(onset, offset]". Splitting on the
    # comma handles both; the old fixed "+2" offset dropped the first digit of
    # the offset when there was no space after the comma.
    onset_text, offset_text = chunk.strip("(]").split(",")
    chunk_onset = float(onset_text)
    chunk_offset = float(offset_text)

    print(f"{sid} {conv} {chunk_onset} s to {chunk_offset} s")
    audio_path = os.path.join(conv_dir, str(sid), conv, "audio", f"{conv}_deid.wav")

    sampling_rate, full_audio = load_audio(audio_path)
    # Seconds are converted to sample indices with the file's sampling rate.
    first_sample = int(chunk_onset * sampling_rate)
    last_sample = int((chunk_onset + clip_seconds) * sampling_rate)
    chunk_data2 = full_audio[first_sample:last_sample]
    chunk_name = f"../{sid}_{conv}.wav"
    wavfile.write(chunk_name, sampling_rate, chunk_data2)

## Plot mturk results

In [None]:
# MTurk evaluation table, split by the value of its `attempt` column.
datafile = "/projects/HASSON/247/data/mturk/2024-02-09-wer-results/Alltask_eval.csv"
mturk_df = pd.read_csv(datafile)
attempt = mturk_df["attempt"]
# reset_index() renumbers both halves from 0 (the old labels are kept in an
# `index` column).
first_df = mturk_df.loc[attempt == "first"].reset_index()
second_df = mturk_df.loc[attempt == "second"].reset_index()
# Multi-page PDF that the following cells append figures to.
pdf = PdfPages("../mturk_results.pdf")

### Histograms

In [None]:
# Long-format table of second-attempt WERs: one row per (item, source), where
# `type` names the source column it came from.
wer_sources = {"whisper": "whisper", "hu1": "human 1", "hu2": "human 2"}
df_plot = pd.concat(
    [
        pd.DataFrame({"WER": second_df[column], "type": label})
        for column, label in wer_sources.items()
    ],
    # Give the stacked rows unique labels. Without this each label appears
    # three times, which some seaborn versions reject when `hue` is used.
    ignore_index=True,
)

# The same histogram at two bin resolutions, one PDF page each.
for n_bins in (20, 50):
    fig, ax = plt.subplots()
    sns.histplot(df_plot, x="WER", hue="type", palette="hls", bins=n_bins, ec=None, ax=ax)
    pdf.savefig(fig)
    plt.close(fig)

### Scatter plots

In [None]:
# Second-attempt WER of Whisper against each human transcriber, one PDF page
# per transcriber.
for human_column, plot_title, y_label in (
    ("hu1", "Whisper vs Hu1", "Human 1 WER"),  # was mislabelled "Human 2 WER"
    ("hu2", "Whisper vs Hu2", "Human 2 WER"),
):
    fig, ax = plt.subplots()
    # Reference line y = x in data coordinates. The former corner-to-corner
    # line in axes coordinates only matched y = x when both axes happened to
    # share the same limits.
    ax.axline((0, 0), slope=1)
    ax.scatter(second_df["whisper"], second_df[human_column], s=10)
    ax.set_title(plot_title)
    ax.set_xlabel("Whisper WER")
    ax.set_ylabel(y_label)
    pdf.savefig(fig)
    plt.close(fig)

In [None]:
def _attempt_table(rater):
    """Side-by-side first/second-attempt values of one rater's WER, `_acc` and
    `_comp` columns (rows are paired through the 0..n-1 index of first_df and
    second_df)."""
    return pd.DataFrame(
        {
            "WER1": first_df[rater],
            "WER2": second_df[rater],
            "certainty1": first_df[f"{rater}_acc"],
            "certainty2": second_df[f"{rater}_acc"],
            "comp1": first_df[f"{rater}_comp"],
            "comp2": second_df[f"{rater}_comp"],
        }
    )


df_plot_hu1 = _attempt_table("hu1")
df_plot_hu2 = _attempt_table("hu2")
# Both raters stacked into one table.
df_plot = pd.concat((df_plot_hu1, df_plot_hu2))

# Second-attempt WER against the second-attempt rating, one PDF page each.
for rating_column, plot_title, x_label in (
    ("certainty2", "WER vs certainty", "Certainty"),
    ("comp2", "WER vs comprehension", "Comprehension"),
):
    fig, ax = plt.subplots()
    ax.scatter(df_plot[rating_column], df_plot["WER2"], s=10)
    ax.set_title(plot_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Human WER")
    pdf.savefig(fig)
    plt.close(fig)

In [None]:
# Change between the two attempts (first minus second): WER change against the
# change in each rating, one PDF page each.
wer_decrease = df_plot["WER1"] - df_plot["WER2"]
for first_column, second_column, plot_title, x_label in (
    ("certainty1", "certainty2", "WER diff vs certainty diff", "Certainty decrease over two trials"),
    ("comp1", "comp2", "WER diff vs comprehension diff", "Comprehension decrease over two trials"),
):
    fig, ax = plt.subplots()
    rating_decrease = df_plot[first_column] - df_plot[second_column]
    ax.scatter(rating_decrease, wer_decrease, s=10)
    ax.set_title(plot_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Human WER decrease over two trials")
    pdf.savefig(fig)
    plt.close(fig)

In [None]:
# Close the PdfPages object opened for ../mturk_results.pdf once all figures
# have been passed to pdf.savefig.
pdf.close()