In [1]:
import os
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

from tfsplt_utils import load_pickle

In [2]:
def tsne_draw_plotly(df, x, y, color, color_label, title):
    """Build an interactive scatter figure of ``df`` coloured by one column.

    Args:
        df: Table holding the coordinate columns, the colouring column and
            every column named in the hover list below.
        x: Column name handed to ``px.scatter`` as the x coordinate.
        y: Column name handed to ``px.scatter`` as the y coordinate.
        color: Column whose values (``df[color]``) pick each point's colour.
        color_label: Display text mapped to the ``"color"`` key of ``labels``.
        title: Figure title.

    Returns:
        Whatever ``px.scatter`` returns for these arguments.

    NOTE(review): ``px`` is not bound by the imports cell visible in this
    notebook -- presumably ``plotly.express``; confirm it is imported before
    this function is called.
    """
    # Extra columns shown in the tooltip of every point.
    hover_columns = [
        "word",
        "word_index",
        "pho",
        "pho_idx",
        "place_artic",
        "manner_artic",
        "voice",
        "function_content",
        "part_of_speech",
    ]
    # 64 colours, eight per row; categories are coloured in this order.
    palette = [
        "#000000", "#00FF00", "#0000FF", "#FF0000", "#01FFFE", "#FFA6FE", "#FFDB66", "#006401",
        "#010067", "#95003A", "#007DB5", "#FF00F6", "#FFEEE8", "#774D00", "#90FB92", "#0076FF",
        "#D5FF00", "#FF937E", "#6A826C", "#FF029D", "#FE8900", "#7A4782", "#7E2DD2", "#85A900",
        "#FF0056", "#A42400", "#00AE7E", "#683D3B", "#BDC6FF", "#263400", "#BDD393", "#00B917",
        "#9E008E", "#001544", "#C28C9F", "#FF74A3", "#01D0FF", "#004754", "#E56FFE", "#788231",
        "#0E4CA1", "#91D0CB", "#BE9970", "#968AE8", "#BB8800", "#43002C", "#DEFF74", "#00FFC6",
        "#FFE502", "#620E00", "#008F9C", "#98FF52", "#7544B1", "#B500FF", "#00FF78", "#FF6E41",
        "#005F39", "#6B6882", "#5FAD4E", "#A75740", "#A5FFD2", "#FFB167", "#009BFF", "#E85EBE",
    ]
    scatter_kwargs = {
        "x": x,
        "y": y,
        "color": df[color],
        "labels": {"color": color_label},
        "hover_data": hover_columns,
        "color_discrete_sequence": palette,
        "title": title,
    }
    return px.scatter(df, **scatter_kwargs)

def tsne_draw(
    df,
    x,
    y,
    color,
    title,
    min_count=100,
    out_dir="../results/20230612-whisper-tsne-no-filter",
    style_path="/scratch/gpfs/ln1144/247-plotting/scripts/paper.mlpstyle",
):
    """Draw a static scatter plot coloured by ``color`` and save it as SVG.

    Rows whose ``color`` category has fewer than ``min_count`` members are
    left out of the plot. The remaining rows are ordered so that the most
    frequent categories come first. The caller's ``df`` is not modified.

    Args:
        df: Table with the ``x``, ``y`` and ``color`` columns plus a
            ``"marker"`` column, which is passed to seaborn as ``style``.
        x: Name of the column plotted on the x axis.
        y: Name of the column plotted on the y axis.
        color: Name of the column used for grouping and as the hue.
        title: Plot title; also the stem of the saved file name.
        min_count: Smallest category size that is still plotted.
        out_dir: Directory receiving ``<title>.svg``; created if missing.
        style_path: Matplotlib style sheet activated via ``plt.style.use``
            (this changes the global pyplot style, as before).

    Returns:
        None. The figure is written to disk and then closed.
    """
    colors_distinct = [
        "#000000", "#00FF00", "#0000FF", "#FF0000", "#01FFFE", "#FFA6FE", "#774D00", "#006401",
        "#010067", "#95003A", "#007DB5", "#FF00F6", "#FFEEE8", "#FFDB66", "#90FB92", "#0076FF",
        "#D5FF00", "#FF937E", "#6A826C", "#FF029D", "#FE8900", "#7A4782", "#7E2DD2", "#85A900",
        "#FF0056", "#A42400", "#00AE7E", "#683D3B", "#BDC6FF", "#263400", "#BDD393", "#00B917",
        "#9E008E", "#001544", "#C28C9F", "#FF74A3", "#01D0FF", "#004754", "#E56FFE", "#788231",
        "#0E4CA1", "#91D0CB", "#BE9970", "#968AE8", "#BB8800", "#43002C", "#DEFF74", "#00FFC6",
        "#FFE502", "#620E00", "#008F9C", "#98FF52", "#7544B1", "#B500FF", "#00FF78", "#FF6E41",
        "#005F39", "#6B6882", "#5FAD4E", "#A75740", "#A5FFD2", "#FFB167", "#009BFF", "#E85EBE",
    ]

    # Keep only sufficiently populated categories. The explicit copy makes
    # the result an independent frame, so adding a column below never
    # touches (or warns about) the caller's data.
    plot_df = (
        df.groupby(df[color])
        .filter(lambda rows: len(rows) >= min_count)
        .copy()
    )
    # Per-row size of the row's category, used purely as a sort key so the
    # largest categories are listed first.
    plot_df["freq"] = plot_df.groupby(plot_df[color])[color].transform("count")
    plot_df = plot_df.sort_values("freq", ascending=False)

    # One palette entry per plotted category. The palette has 64 entries, so
    # with more categories than that the slice is shorter than the number of
    # hue levels.
    n_categories = len(plot_df[color].unique())

    plt.style.use(style_path)
    sns.scatterplot(
        data=plot_df,
        x=plot_df[x],
        y=plot_df[y],
        hue=plot_df[color],
        palette=colors_distinct[:n_categories],
        linewidth=0,
        style=plot_df["marker"],
        s=5,
        markers=["o"],
    )
    plt.title(f"{title}")
    # plt.show()
    if out_dir:
        # savefig does not create missing directories itself.
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(os.path.join(out_dir, f"{title}.svg"))
    plt.close()
    return None

In [3]:
# Embedding variant for this run; the other value used so far is "1st".
# NOTE(review): nothing visible in this notebook reads emb_type -- the pickle
# path below spells out "ave" itself -- so keep the two in sync by hand.
emb_type = "ave"

# An empty string selects the per-variable defaults below; any other value is
# used for both layer_en and layer_de, and is also appended to the pickle file
# name. NOTE(review): layer_en / layer_de are not read by any visible cell.
layer = ""
layer_en = layer if layer != "" else "4"
layer_de = layer if layer != "" else "3"

# df = load_pickle(f"/scratch/gpfs/kw1166/247-plotting/results/20230607-whisper-tsne/all4-whisper-tsne-ave{layer}.pkl")
df = load_pickle(f"/scratch/gpfs/kw1166/247-plotting/results/20230612-whisper-tsne-no-filter/all4-whisper-tsne-pca-ave{layer}.pkl")
# df = load_pickle(f"/scratch/gpfs/kw1166/247-plotting/results/20230613-whisper-medium-podcast/777-whisper-tsne-all{layer}.pkl")

# Constant column: tsne_draw passes df["marker"] to seaborn as `style`, so
# every point gets the same marker shape.
df["marker"] = 1

Loading /scratch/gpfs/kw1166/247-plotting/results/20230612-whisper-tsne-no-filter/all4-whisper-tsne-pca-ave.pkl


In [None]:
def filter_df(df):
    """Drop rows that belong to sparsely populated categories.

    Two passes are applied in order: first every ``pho`` group with fewer
    than 100 rows is removed, then every ``part_of_speech`` group with fewer
    than 100 rows is removed from what is left. The second pass can shrink a
    ``pho`` group below 100 again; that is not re-checked.

    The row count is printed before and after filtering.

    Args:
        df: Table with ``pho`` and ``part_of_speech`` columns.

    Returns:
        The filtered table; the argument itself is not modified.
    """
    print(len(df))
    for column in ("pho", "part_of_speech"):
        df = df.groupby(df[column]).filter(lambda rows: len(rows) >= 100)
    print(len(df))
    return df

# Rebinds the notebook-level `df` to its filtered version, so every later
# cell works on the reduced table. Re-running this cell without reloading
# filters the already-filtered frame again.
df = filter_df(df)


Plotly in HTML

In [None]:
# Column to colour by -> label used for the legend and in the figure title.
plot_dict = dict(
    pho="phoneme",
    place_artic="place_of_articulation",
    manner_artic="manner_of_articulation",
    part_of_speech="part_of_speech",
    # voice="voice_or_voiceless",
    # function_content="function_or_content",
)

# For every colouring column, build one figure from the en_* coordinates and
# one from the de_* coordinates and add both to a single HTML file. The file
# is opened in append mode, so running this cell again adds the figures a
# second time instead of replacing them.
for plots, plot_label in plot_dict.items():
    fig1 = tsne_draw_plotly(df, "en_x", "en_y", plots, plot_label, f"speech-{plot_label}")
    fig2 = tsne_draw_plotly(df, "de_x", "de_y", plots, plot_label, f"language-{plot_label}")
    with open('tsnes_filtered.html', 'a') as f:
        for fig in (fig1, fig2):
            f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))

Matplotlib

In [4]:
# Column to colour by -> label used in the plot title / output file name.
plot_dict = dict(
    pho="phoneme",
    place_artic="place_of_articulation",
    manner_artic="manner_of_articulation",
    part_of_speech="part_of_speech",
    # voice="voice_or_voiceless",
    # function_content="function_or_content",
)

# (title prefix, x column, y column) for the two coordinate sets in `df`.
coordinate_sets = (
    ("speech", "en_x", "en_y"),
    ("language", "de_x", "de_y"),
)

for plots, plot_label in plot_dict.items():
    for prefix, x_col, y_col in coordinate_sets:
        tsne_draw(df, x_col, y_col, plots, f"{prefix}{layer}-{plot_label}")
    # tsne_draw filters its own copy, so this is the size of the frame that
    # was passed in, not the number of points drawn.
    print(len(df))

13347
13347
13347
13347


Classifier Bar Plot and T-test

In [None]:
# Classifier accuracies. After the column slice below, each row is one
# (category, group) combination; the group / category labels are attached by
# position, so the CSV must hold exactly len(groups) rows in that order.
df = pd.read_csv("../results/20230612-whisper-tsne-no-filter/classifier_pca50_filter-sep_ave_L.csv")
groups = [
    "speech",
    "language",
    "control",
    "uniform",
    "strat",
] * 4
groups_needed = ["speech", "language", "control"]
color = ["red", "blue", "grey", "black", "brown"] * 4
cats = ["Phoneme", "PoA", "MoA", "PoS"]
# np.repeat needs an integer repeat count; true division would hand it a
# float (5.0), which NumPy rejects.
cats_dup = np.repeat(cats, len(groups) // len(cats))
df = df.iloc[:, 1:12]  # keep 11 value columns, dropping the first CSV column
df = df.assign(groups=groups)
df = df.assign(cats=cats_dup)
df = df[df.groups.isin(groups_needed)]

for cat in cats:
    print(cat)
    # Rows of this category in the order speech, language, control (the
    # order in which `groups` was laid out above).
    df_test = df[df.cats == cat]
    # A row slice of a frame that also holds string columns comes back with
    # object dtype; cast to float so scipy receives plain numeric arrays.
    # NOTE(review): 0:9 uses value columns 0-8 and column 10 is read as the
    # control mean, so column 9 is never used -- confirm this is intended.
    speech_acc = df_test.iloc[0, 0:9].astype(float)
    lang_acc = df_test.iloc[1, 0:9].astype(float)
    control_acc = df_test.iloc[2, 0:9].astype(float)
    control_mean = df_test.iloc[2, 10]
    print("Speech vs Language", stats.ttest_ind(speech_acc, lang_acc))
    print("Speech vs Control", stats.ttest_ind(speech_acc, control_acc))
    print("Language vs Control", stats.ttest_ind(lang_acc, control_acc))
    print("Speech 1 sample", stats.ttest_1samp(speech_acc, popmean=control_mean))
    print("Language 1 sample", stats.ttest_1samp(lang_acc, popmean=control_mean))