In [None]:
from config import settings
from pgvector.psycopg2 import register_vector
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tqdm import tqdm
import chromadb
import gc
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import psycopg2
import time
import torch
import xgboost as xgb
import create_db
import json

In [23]:
DIMENSION_REDUCTION = True # Ob PCA zur Reduktion auf 128 Dimensionen genutzt werden soll
PRINT_RESULT = False # Ob die Ergebnisse waehrend des Trainings ausgegeben werden sollen
VISUALIZE_EMBEDDINGS = False # Die Embeddings mittels PCA im 3D-Plot anzeigen

SAVE_EMBEDDINGS_TO_DB = True # Die Embeddings in die Datenbank speichern
SAVE_RAW_TEXT_TO_DB = True # Den rohen Text in die Datenbank speichern

In [None]:
# Erstellt bemoetigte Tabellen + loescht gleichnamige, wenn bereits vorhanden
create_db.create_tables() 

In [25]:
df = pl.read_parquet("../data/amazon.parquet")

In [26]:
df

review_text,rating_score
str,i64
"""I registered on the website, t…",1
"""Had multiple orders one turned…",1
"""I informed these reprobates th…",1
"""I have bought from Amazon befo…",1
"""If I could give a lower rate I…",1
…,…
"""I have had perfect order fulfi…",5
"""I have had perfect order fulfi…",5
"""I always find myself going bac…",3
"""I have placed an abundance of …",5


In [27]:
#df = df.with_columns((pl.col("rating_score") >= 3).alias("satisfied").cast(pl.Int32)) # Binary Classification

# multi class classification
df = df.with_columns(
    pl.when(pl.col("rating_score") == 1)
    .then(0)
    .when(pl.col("rating_score").is_between(2, 3))
    .then(1)
    .when(pl.col("rating_score").is_between(4, 5))
    .then(2)
    .otherwise(None)
    .alias("satisfied")
)

In [28]:
df.select("satisfied").to_series().value_counts()

satisfied,count
i32,u32
1,2112
2,5820
0,13123


In [29]:
def connect_to_db() -> None:
    return psycopg2.connect(
        dbname=settings.db_name,
        user=settings.user_name,
        password=settings.password,
        host=settings.host_ip,
        port=settings.host_port
    )

In [None]:
def get_embeddings_from_db_by_modelname(model_name: str) -> np.ndarray:
    conn = connect_to_db()
    cur = conn.cursor()
    
    cur.execute(f"SELECT embedding FROM model_embeddings WHERE model_name='{model_name}'")
    records = cur.fetchall()
    conn.close()

    embeddings = [np.array(json.loads(record[0])) for record in records]
    
    embeddings = np.stack(embeddings)
    
    return embeddings


In [None]:
# legacy not used, because of curse of dimensionality
def evaluate_logistic_regression(embeddings : np.ndarray, time_result : list) -> None:    

    pca = PCA(n_components=6)
    embeddings_d = pca.fit_transform(embeddings)
    
    labels = df.select('satisfied').to_series().to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(embeddings_d, labels, test_size=0.2, random_state=42)
    
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    precision_scores.append(precision_score(y_test, y_pred, average='macro'))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    confusion_matricies.append(confusion_matrix(y_test, y_pred))
    times.append(time_result)

    if PRINT_RESULT:
        print(f"Precision: {precision_score(y_test, y_pred, average='macro')}")
        print(f"Recall: {recall_score(y_test, y_pred, average='macro')}")
        print(f"F1-Score: {f1_score(y_test, y_pred, average='macro')}")
        print(f"Time: {time_result} seconds")

In [32]:
def reduce_embeddings_with_pca(embeddings : np.ndarray) -> np.ndarray:
    pca = PCA(n_components=128)
    reduced_embeddings = pca.fit_transform(embeddings)
    return reduced_embeddings

In [33]:
def evaluate_xgb(embeddings : np.ndarray) -> None:    
    
    labels = df.select('satisfied').to_series().to_numpy()

    if DIMENSION_REDUCTION:
        embeddings = reduce_embeddings_with_pca(embeddings)
    
    X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)



    classes_weights = class_weight.compute_sample_weight(
        class_weight='balanced',
        y=y_train
    )

    

    model = xgb.XGBClassifier(n_jobs=-1)
    model.fit(X_train, y_train, sample_weight=classes_weights)

    y_pred = model.predict(X_test)

    precision_scores.append(precision_score(y_test, y_pred, average='macro'))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    confusion_matricies.append(confusion_matrix(y_test, y_pred))

In [None]:
texts = df.select('review_text').to_series().to_list()

precision_scores = []
recall_scores = []
f1_scores = []
confusion_matricies = []
times = []
memory_usages = []

model_names = [
    'sentence-transformers/all-mpnet-base-v2',
    'thenlper/gte-small',
    'sentence-transformers/paraphrase-MiniLM-L3-v2',
    'intfloat/multilingual-e5-large-instruct',
    'sentence-transformers/all-MiniLM-L6-v2',
  
]

embedding_list = []
start = 0
end = 0

for model_name in tqdm(model_names):
    
    model = SentenceTransformer(model_name)
    start = time.time()
    # to embeddings from text data
    embeddings = model.encode(texts)
    end = time.time()
    
    embedding_list.append(embeddings)
    
    

    result = end - start
    times.append(result)

    memory = torch.cuda.mem_get_info()
    
    #Converting to GiB
    memory_usages.append((memory[1]-memory[0])/1024**3) 

    # del model, garbage collection and clearing mem from gpu
    del model
    gc.collect()
    torch.cuda.empty_cache()

    if VISUALIZE_EMBEDDINGS:
        pca = PCA(n_components=3)
        embeddings_3d = pca.fit_transform(embeddings)
        
        df_embeddings = pd.DataFrame(embeddings_3d, columns=['PCA1', 'PCA2', 'PCA3'])
        df_embeddings['Model'] = name 
        
        fig = px.scatter_3d(
            df_embeddings,
            x='PCA1',
            y='PCA2',
            z='PCA3',
            color='Model',
            title=f'3D PCA Visualisierung der Embeddings ({name})',
            opacity=0.7,
            labels={'PCA1': 'Hauptkomponente 1', 'PCA2': 'Hauptkomponente 2', 'PCA3': 'Hauptkomponente 3'}
        )
        
        fig.show()

100%|██████████| 5/5 [12:55<00:00, 155.05s/it]


In [35]:
if SAVE_RAW_TEXT_TO_DB:
    conn = connect_to_db()
    register_vector(conn)  
    
    cur = conn.cursor()
    for sentence in texts:
        cur.execute(
            "INSERT INTO raw_text (text) VALUES (%s)",
            (sentence,) 
        )
    conn.commit()
    cur.close()
    conn.close()

In [36]:
if SAVE_EMBEDDINGS_TO_DB:
    conn = connect_to_db()
    register_vector(conn)  

    cur = conn.cursor()
    for i, name in enumerate(model_names):
        for j, embedding in enumerate(embedding_list[i]):
            cur.execute(
                "INSERT INTO model_embeddings (embedding, model_name, embedding_id) VALUES (%s, %s, %s)",
                (embedding, name, j+1)
            )
    conn.commit()
    cur.close()
    conn.close()


In [37]:
for model_name in tqdm(model_names): 
    embeddings = get_embeddings_from_db_by_modelname(model_name)
    evaluate_xgb(embeddings)

df_results = pl.DataFrame({"model":model_names, "precision":precision_scores, "recall":recall_scores, "f1":f1_scores,"time":times, "memory":memory_usages})

100%|██████████| 5/5 [00:45<00:00,  9.01s/it]


In [38]:
df_results

model,precision,recall,f1,time,memory
str,f64,f64,f64,f64,f64
"""sentence-transformers/all-mpne…",0.348863,0.351164,0.345397,143.05407,1.497314
"""thenlper/gte-small""",0.428814,0.425977,0.424747,46.448403,0.772705
"""sentence-transformers/paraphra…",0.418115,0.421572,0.409677,9.14269,0.307861
"""intfloat/multilingual-e5-large…",0.760764,0.723434,0.733396,537.928978,3.286377
"""sentence-transformers/all-Mini…",0.680932,0.645701,0.645301,22.116468,0.425049


In [39]:
metrics = ["precision", "recall", "f1"]

In [40]:
data=[]
for row in df_results.iter_rows(named=True):
    data.append(go.Bar(name=row["model"], x=metrics, y=[row["precision"], row["recall"], row["f1"]], text=[row["precision"], row["recall"], row["f1"]]))

fig = go.Figure(data, layout_yaxis_range=[0.5,1.0])
fig.update_traces(texttemplate='%{text:.3}', textposition='outside')
if DIMENSION_REDUCTION:
    fig.update_layout(barmode='group', title="Performance der Embedding Modelle (PCA - 128)")
else:
    fig.update_layout(barmode='group', title="Performance der Embedding Modelle")
fig.show()

In [41]:
data=[]
for row in df_results.iter_rows(named=True):
    data.append(go.Bar(name=row["model"], x=["memory"], y=[row["memory"]], text=row["memory"]))

fig = go.Figure(data)
fig.update_traces(texttemplate='%{text:.2}', textposition='outside')
fig.update_layout(barmode='group', xaxis_title="Model", yaxis_title="Speicherauslastung (GiB)", title="VRAM-Verbrauch der Modelle")
fig.show()

In [42]:
data=[]
for row in df_results.iter_rows(named=True):
    data.append(go.Bar(name=row["model"], x=["time"], y=[row["time"]], text=row["time"]))

fig = go.Figure(data)
fig.update_traces(texttemplate='%{text:.4}', textposition='outside')
fig.update_layout(barmode='group', xaxis_title="model", yaxis_title="Zeit in Sekunden", title="Zeit für die Umformung in Vektoren")
fig.show()