# Analyse von Wort-Embeddings

- Lade Wortlisten ein
- Berechne für jedes Wort das zugehörige Embedding


In [1]:
import os
import pprint

from dotenv import load_dotenv
from openai import AzureOpenAI

from data.word_collection import statistik_woerter, andere_woerter
from text_analysis.embeddings import get_embedding


def _embed_words(words):
    """Request one embedding response per word, preserving the order of `words`.

    Uses the module-level `client` and `model_name` defined in this cell.
    """
    return [
        get_embedding(user_input=word, client=client, model_name=model_name)
        for word in words
    ]


# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Fail early with a clear message instead of passing an empty endpoint to the
# client, which would only surface as an opaque error on the first request.
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
if not azure_endpoint:
    raise RuntimeError(
        "Environment variable AZURE_OPENAI_ENDPOINT is not set (check your .env file)."
    )

client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=azure_endpoint,
)
# NOTE(review): an unset deployment name is passed on as "" — confirm how
# get_embedding treats an empty model name.
model_name = os.getenv("AZURE_DEPLOYMENT_NAME", "")

pprint.pp(statistik_woerter, compact=True)

pprint.pp(andere_woerter, compact=True)

statistik_embeddings = _embed_words(statistik_woerter)
andere_embeddings = _embed_words(andere_woerter)



['Mittelwert', 'Median', 'Varianz', 'Standardabweichung', 'Regression',
 'Korrelation', 'Hypothese', 'Konfidenzintervall', 'Signifikanzniveau',
 'P-Wert', 'Stichprobe', 'Zufallsvariable', 'Histogramm', 'Verteilung',
 'Wahrscheinlichkeitsdichte', 'Teststatistik', 'Schätzer', 'Normalverteilung',
 'Streuung', 'Bias', 'Residuum', 'Likelihood', 'Bayes-Theorem', 'Bootstrap',
 'Monte-Carlo-Simulation']
['Baum', 'Katze', 'Energie', 'Sonne', 'Freiheit', 'Buch', 'Wasser', 'Zeit',
 'Musik', 'Liebe', 'Stadt', 'Regen', 'Planet', 'Sprache', 'Technologie', 'Berg',
 'Traum', 'Blume', 'Freundschaft', 'Wissen', 'Licht', 'Straße', 'Natur',
 'Kunst', 'Abenteuer']


Anzeigen der erzeugten Embeddings:

In [2]:
# Pretty-print the complete response object obtained for the first statistics word.
# NOTE(review): this prints every component of the embedding vector; consider
# showing only a slice (e.g. `.data[0].embedding[:10]`) to keep the output short.
pprint.pp(statistik_embeddings[0], width=80, compact=True)

CreateEmbeddingResponse(data=[Embedding(embedding=[0.017367351800203323, 0.003417022991925478, 0.002336001256480813, 0.006550033576786518, -0.02570311352610588, -0.014555630274116993, 0.012042121961712837, 0.08878224343061447, -0.01082086842507124, -0.04771406576037407, -0.03550153970718384, 0.04396510496735573, -0.034053076058626175, -0.02135772444307804, -0.027691200375556946, 0.009841026738286018, -0.008854083716869354, 0.013007763773202896, -0.009230399504303932, -0.00976292323321104, -0.0299064964056015, -0.029310069978237152, -0.022337567061185837, 0.02397063933312893, -0.0317525751888752, -0.013242073357105255, 0.0011857514036819339, -0.006684939377009869, -0.0008294046856462955, -0.0011893014889210463, 0.00601396057754755, 0.01802058145403862, -0.0017892775358632207, -0.0038767680525779724, -0.03513232246041298, 0.00773223489522934, -0.015152056701481342, -0.0202642772346735, 0.016444312408566475, 0.00944340880960226, -0.006262471433728933, 0.012901258654892445, -0.038029246032

## Visualisiere Embeddings mit trunkierter SVD



In [4]:
import numpy as np


def _stack_embeddings(responses):
    """Stack the first embedding vector of each response into a 2-D array.

    Returns one row per response, in the order of `responses`.
    """
    return np.vstack([response.data[0].embedding for response in responses])


# Arrange the embeddings of each word group as one matrix (one row per word).
statistik_matrix = _stack_embeddings(statistik_embeddings)
print(f"Shape statistik_matrix: {statistik_matrix.shape}")

andere_woerter_matrix = _stack_embeddings(andere_embeddings)
print(f"Shape andere_woerter_matrix: {andere_woerter_matrix.shape}")

# Statistics words come first, the other words second; later cells rely on this order.
embedding_matrix = np.vstack([statistik_matrix, andere_woerter_matrix])
print(f"Shape embedding_matrix: {embedding_matrix.shape}")

Shape statistik_matrix: (25, 3072)
Shape andere_woerter_matrix: (25, 3072)
Shape embedding_matrix: (50, 3072)


In [5]:
# Singular value decomposition of the stacked embedding matrix.
# With NumPy's default full_matrices=True, Vh is square in the number of columns
# of the input (see the printed shapes).
# NOTE(review): np.linalg.svd(..., full_matrices=False) would skip the rows of Vh
# beyond the number of singular values; the printed Vh shape would then differ.
U, S, Vh = np.linalg.svd(embedding_matrix)

print(f"Shapes: U:{U.shape}, S:{S.shape}, Vh:{Vh.shape}")

Shapes: U:(50, 50), S:(50,), Vh:(3072, 3072)


In [6]:
import pandas as pd
import plotly.express as px

# Coordinates of every word along the first three singular directions:
# column k of U scaled by the k-th singular value.
svd_coordinates = U[:, :3] * S[:3]

data_frame = pd.DataFrame(
    index=statistik_woerter + andere_woerter,
    data={
        "dim_1": svd_coordinates[:, 0].tolist(),
        "dim_2": svd_coordinates[:, 1].tolist(),
        "dim_3": svd_coordinates[:, 2].tolist(),
        # Group flags follow the list lengths instead of a hard-coded 25, so the
        # cell keeps working when the word lists change size.
        "ist_statistik": (
            len(statistik_woerter) * [True] + len(andere_woerter) * [False]
        ),
    },
)

data_frame.head()

Unnamed: 0,dim_1,dim_2,dim_3,ist_statistik
Mittelwert,-0.610292,-0.226675,0.17482,True
Median,-0.400897,-0.301508,-0.304325,True
Varianz,-0.707379,-0.290686,0.21037,True
Standardabweichung,-0.618274,-0.397856,0.204031,True
Regression,-0.389747,-0.317693,-0.395534,True


In [7]:
# Interactive 3-D scatter plot of the dim_1/dim_2/dim_3 columns of data_frame,
# coloured by the ist_statistik flag; hovering a point shows its index label.
px.scatter_3d(
    data_frame=data_frame,
    x="dim_1",
    y="dim_2",
    z="dim_3",
    color="ist_statistik",
    hover_name=data_frame.index,
)

Verlauf der Singulärwerte

In [8]:
import plotly.graph_objects as go

# Plot every singular value against its position in S.
singular_value_trace = go.Scatter(
    x=list(range(len(S))),
    y=S,
    mode="lines+markers",
)
fig = go.Figure(data=singular_value_trace)
# update_layout returns the figure, so it is displayed as the cell result.
fig.update_layout(
    title="Entries of the vector S over its index",
    xaxis_title="Index",
    yaxis_title="Singulärwert",
)

Visualisierung der Punkte mit Multidimensionaler Skalierung

In [9]:
from sklearn.manifold import MDS

# MDS starts from a random configuration; a fixed seed makes the layout
# reproducible across kernel restarts.
mds_embedding = MDS(n_components=2, normalized_stress="auto", random_state=42)
embeddings_mds = mds_embedding.fit_transform(embedding_matrix)


data_frame = pd.DataFrame(
    index=statistik_woerter + andere_woerter,
    data={
        "dim_1": embeddings_mds[:, 0].tolist(),
        "dim_2": embeddings_mds[:, 1].tolist(),
        # Group flags follow the list lengths instead of a hard-coded 25.
        "ist_statistik": (
            len(statistik_woerter) * [True] + len(andere_woerter) * [False]
        ),
    },
)

data_frame.head()

Unnamed: 0,dim_1,dim_2,ist_statistik
Mittelwert,-0.8394,0.185592,True
Median,-0.843115,0.606719,True
Varianz,-0.375959,0.244741,True
Standardabweichung,-0.602052,0.520881,True
Regression,-0.047969,1.022679,True


In [10]:
# Interactive 2-D scatter plot of the dim_1/dim_2 columns of data_frame,
# coloured by the ist_statistik flag; hovering a point shows its index label.
px.scatter(
    data_frame=data_frame,
    x="dim_1",
    y="dim_2",
    color="ist_statistik",
    hover_name=data_frame.index,
)

## Untersuchung der Cosinusähnlichkeit

In [11]:
from text_analysis.embeddings import calculate_cosine_similarity

total_words = len(statistik_woerter) + len(andere_woerter)
cosine_similarities = np.empty([total_words, total_words])

# Fill the square matrix entry by entry: position (row, col) holds the
# similarity between the embeddings of word `row` and word `col`.
for row, col in np.ndindex(total_words, total_words):
    cosine_similarities[row, col] = calculate_cosine_similarity(
        a=embedding_matrix[row, :], b=embedding_matrix[col, :]
    )


In [12]:
# Label both axes of the similarity matrix with the words (statistics words first).
word_labels = statistik_woerter + andere_woerter
data_frame = pd.DataFrame(
    data=cosine_similarities,
    index=word_labels,
    columns=word_labels,
)

data_frame.head()

Unnamed: 0,Mittelwert,Median,Varianz,Standardabweichung,Regression,Korrelation,Hypothese,Konfidenzintervall,Signifikanzniveau,P-Wert,...,Berg,Traum,Blume,Freundschaft,Wissen,Licht,Straße,Natur,Kunst,Abenteuer
Mittelwert,1.0,0.48651,0.550001,0.537452,0.19204,0.359557,0.226172,0.44631,0.421048,0.451753,...,0.201883,0.293373,0.238535,0.293821,0.339462,0.301089,0.29672,0.252053,0.255793,0.232665
Median,0.48651,1.0,0.335187,0.353867,0.332256,0.20643,0.154558,0.284113,0.203732,0.24102,...,0.252263,0.174474,0.189562,0.094973,0.149454,0.138279,0.162668,0.175058,0.147255,0.104737
Varianz,0.550001,0.335187,1.0,0.686182,0.279379,0.485599,0.262218,0.528257,0.476291,0.498156,...,0.220394,0.308623,0.289982,0.324965,0.349889,0.284114,0.343491,0.302251,0.284328,0.315432
Standardabweichung,0.537452,0.353867,0.686182,1.0,0.280407,0.391371,0.218271,0.526076,0.473744,0.46762,...,0.1525,0.226499,0.242741,0.248225,0.259426,0.200908,0.327487,0.241152,0.200268,0.257947
Regression,0.19204,0.332256,0.279379,0.280407,1.0,0.372446,0.205279,0.281032,0.247072,0.271767,...,0.196234,0.162352,0.160588,0.129388,0.10391,0.118172,0.165187,0.163776,0.082135,0.121326


In [13]:
# Render the labelled similarity matrix as a heatmap.
px.imshow(data_frame)

Verteilung der erhaltenen Cosinusähnlichkeiten (ohne die Diagonale)

In [14]:
# Boolean mask that keeps every entry except the diagonal, i.e. drops the
# similarity of each word with itself.
off_diagonal = ~np.eye(cosine_similarities.shape[0], dtype=bool)
cosine_similarities_vector = cosine_similarities[off_diagonal].flatten()
px.histogram(cosine_similarities_vector)

## Clustering der Embeddings (K-Means)

In [15]:
from sklearn.cluster import KMeans

# K-Means initialisation is random; a fixed seed makes the cluster assignment
# reproducible across kernel restarts.
k_means_full = KMeans(n_clusters=2, random_state=42)
k_means_3 = KMeans(n_clusters=2, random_state=42)

k_means_labels_full = k_means_full.fit_predict(embedding_matrix)
# Cluster the rank-3 approximation of embedding_matrix built from the first
# three singular values and vectors.
k_means_labels_3 = k_means_3.fit_predict(np.dot(U[:, :3] * S[:3], Vh[:3, :]))

# Cluster ids are arbitrary, so the two runs may name the same two groups with
# swapped ids. With two clusters, flip the second labelling when that makes it
# agree better with the first, so the plotted lines are directly comparable.
if np.mean(k_means_labels_full == k_means_labels_3) < 0.5:
    k_means_labels_3 = 1 - k_means_labels_3

data_frame = pd.DataFrame(data={'full': k_means_labels_full, 'SVD_3': k_means_labels_3})

px.line(data_frame, x=data_frame.index, y=['full', 'SVD_3'], title="K-Means Labels over Index")
