In this analysis, I apply the same evaluation methods as described in **embeddings_effectivity_final.ipynb**, but to a corpus in which all documents were first **machine-translated into English**.  


In [10]:
import os
import requests
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from google import genai
from google.genai import types
from mistralai import Mistral
import voyageai
import numpy as np
from dotenv import load_dotenv
import time
import itertools, numpy as np, pandas as pd
from sklearn.metrics import (
    v_measure_score, normalized_mutual_info_score,
    adjusted_rand_score, silhouette_score, accuracy_score,
    precision_recall_fscore_support
)
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
import umap
import plotly.express as px
import plotly.graph_objects as go
from kneed import KneeLocator
import seaborn as sns
from google.cloud import translate_v3 as translate
from google.api_core.exceptions import GoogleAPICallError


In [11]:
# Load the working DataFrame from parquet (read with the fastparquet engine).
# Later cells expect it to contain the `truncated_prepis` text column, the
# `Predkladateľ` label column and the `core_*` mask columns.
df = pd.read_parquet(r"..\data\df_to_app_with_openAI_S_L_voyage_gdoogle_mistral_embeddings_navrh_zakona_obdobie_8_core_clear.parquet", engine="fastparquet")

In [58]:
# Load the variables stored in keys.env into the process environment so the
# os.getenv() calls further down can read the provider API keys.
load_dotenv(r"..\keys.env")

True

In [13]:

def save_slovak_embedding_metrics_table(metrics_df, save_path=r"..\graphs\slovak_translate_embedding_effectiveness_metrics.png"):
    """
    Render the clustering metrics as a colour-coded table and save it as a PNG.

    Args:
        metrics_df: DataFrame of metrics. Its values (rounded to 3 decimals)
            fill the table cells, and its ``f1`` column drives the row colour.
            The header row uses eight fixed labels, so the frame is expected
            to have eight columns.
        save_path: Destination of the PNG file (written at 300 dpi).

    The figure is also shown with ``plt.show()`` and a confirmation line is
    printed once the file has been written.
    """
    header_labels = ['Embedding Model', 'Clustering Method', 'K-Clusters',
                     'Precision', 'Recall', 'F1-Score', 'V-Measure', 'ARI']
    column_count = len(metrics_df.columns)

    # A bare axes object is used purely as a canvas for the table.
    fig, ax = plt.subplots(figsize=(16, 8))
    ax.axis('tight')
    ax.axis('off')

    table = ax.table(
        cellText=metrics_df.round(3).values,
        colLabels=header_labels,
        cellLoc='center',
        loc='center',
        bbox=[0, 0, 1, 1],
    )

    # Fixed font size, slightly stretched cells.
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 2.0)

    # Header row: dark background, bold white text.
    for col in range(column_count):
        header_cell = table[(0, col)]
        header_cell.set_facecolor('#2E4057')
        header_cell.set_text_props(weight='bold', color='white')
        header_cell.set_height(0.1)

    # Body rows: green / yellow / red depending on the row's F1 value.
    for position in range(len(metrics_df)):
        score = float(metrics_df.iloc[position]['f1'])
        if score > 0.7:
            shade = '#E8F5E8'   # light green: F1 above 0.7
        elif score > 0.5:
            shade = '#FFF8DC'   # light yellow: F1 above 0.5
        else:
            shade = '#FFE4E1'   # light red: everything else
        for col in range(column_count):
            table[(position + 1, col)].set_facecolor(shade)

    plt.title('Embedding Model Effectiveness on Slovak Parliamentary Transcriptions\n'
              'Clustering Performance Analysis (2010-2023 Corpus)',
              fontsize=16, fontweight='bold', pad=20)

    # Italic footer line centred under the table.
    plt.figtext(0.5, 0.02, 'Novel NLP Procedures for Slovak Language Analysis | Ministry-based Semantic Clustering',
                ha='center', fontsize=10, style='italic')

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"✅ Slovak embedding metrics table saved to: {save_path}")

In [59]:
# Provider credentials and the Google Cloud project id, read from the
# environment. os.getenv returns None for any variable that is not set.
openai_api_key = os.getenv("OPENAI_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")
Mistral_API_KEY = os.getenv("MISTRAL_API_KEY")
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY")
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")  # passed to translate_dataframe_column below

In [60]:
# One notebook-level client per embedding provider.
vo = voyageai.Client(api_key=VOYAGE_API_KEY)
client_mistral = Mistral(api_key=Mistral_API_KEY)  # read as a global by embed_single_mistral
client_google = genai.Client(api_key=google_api_key)
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))  # same variable as openai_api_key above

In [35]:
def translate_dataframe_column(df, column_to_translate, target_language, project_id):
    """
    Translate one text column with the Google Cloud Translation API.

    Each row is sent as its own request. The result is written to a new
    column named ``<column_to_translate>_en`` on the DataFrame that was
    passed in (the frame is modified in place and also returned).

    Args:
        df (pd.DataFrame): Frame holding the text to translate.
        column_to_translate (str): Name of the source text column.
        target_language (str): Target language code, e.g. 'en'.
        project_id (str): Google Cloud project id used to build the request parent.

    Returns:
        pd.DataFrame: ``df`` with the translated column added. Empty or
        non-string cells become ``""``; rows whose request raised become the
        literal string ``"Translation Error"``.
    """
    translation_client = translate.TranslationServiceClient()

    # Resource path of the project in the "global" location.
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    def _translate_one(value):
        # Nothing to send for empty / non-string cells.
        if not value or not isinstance(value, str):
            return ""
        try:
            reply = translation_client.translate_text(
                parent=parent,
                contents=[value],
                target_language_code=target_language,
            )
            # One input text was sent, so only the first translation is relevant.
            return reply.translations[0].translated_text
        except Exception as exc:
            print(f"An error occurred during translation: {exc}")
            return "Translation Error"

    print(f"Translating column '{column_to_translate}'...")
    translated = [_translate_one(cell) for cell in df[column_to_translate]]

    df[f"{column_to_translate}_en"] = translated
    return df


In [75]:
def embed_single_mistral(text, model="mistral-embed", lock=None, max_retries=5, base_wait=5):
    """
    Return the Mistral embedding vector for a single text, or None on failure.

    Args:
        text: Text to embed. Non-string, empty or whitespace-only values are
            skipped and yield None without calling the API.
        model: Mistral embedding model name.
        lock: Unused; kept so existing callers that pass it keep working.
        max_retries: Maximum number of attempts when the API reports a rate limit.
        base_wait: Seconds waited after the first rate-limit error; the wait
            doubles on every further attempt.

    Returns:
        The embedding vector taken from the first item of the response, or
        None when the client is missing, the response has no usable
        embedding, a non-rate-limit error occurs, or all attempts are used up.
    """
    if not isinstance(text, str) or not text.strip():
        return None

    # The notebook-level client is only read here, so no `global` statement is needed.
    if client_mistral is None:
        print("❌ Mistral client not initialized for Slovak parliamentary analysis")
        return None

    for attempt in range(max_retries):
        try:
            response = client_mistral.embeddings.create(
                model=model,
                inputs=[text]
            )

            has_items = (
                hasattr(response, "data")
                and isinstance(response.data, list)
                and len(response.data) > 0
            )
            if not has_items:
                print("⚠️ No embeddings in Mistral response for Slovak parliamentary text")
                return None

            # The item may expose the vector as an attribute or as a dict key.
            embedding_obj = response.data[0]
            if hasattr(embedding_obj, 'embedding'):
                return embedding_obj.embedding
            if isinstance(embedding_obj, dict) and 'embedding' in embedding_obj:
                return embedding_obj['embedding']

            print(f"⚠️ Unexpected Mistral response structure: {type(embedding_obj)}")
            return None

        except Exception as e:
            message = str(e)
            # Rate limits are recognised from the error text; anything else is final.
            if "rate limit" not in message.lower() and "429" not in message:
                print(f"Mistral embedding error: {e}")
                return None

            # No attempt is left: sleeping again would only delay the failure.
            if attempt + 1 >= max_retries:
                break

            wait = base_wait * (2 ** attempt)
            print(f"Rate limit hit. Waiting {wait}s before retrying (attempt {attempt+1}/{max_retries})...")
            time.sleep(wait)

    print("Max retries reached for Mistral embedding. Skipping.")
    return None

In [18]:
def embed_truncated_prepis_voyage(df, model, input_type, output_dimension, text_column, out_column, batch_size=25, delay=60, max_retries=1):
    """
    Embed a text column with the Voyage AI API, batch by batch.

    Args:
        df: DataFrame holding the texts; it is modified in place.
        model: Voyage embedding model name.
        input_type: Value forwarded as ``input_type`` to ``vo.embed``.
        output_dimension: Value forwarded as ``output_dimension`` to ``vo.embed``.
        text_column: Name of the column with the texts to embed.
        out_column: Name of the column that receives the embeddings.
        batch_size: Number of texts sent per request.
        delay: Seconds to wait after a rate-limit error before retrying.
        max_retries: How many times a rate-limited batch is retried. The
            default of 1 matches the previous behaviour (one retry, then the
            error propagates).

    Returns:
        ``df`` with ``out_column`` added.

    Raises:
        voyageai.error.RateLimitError: If a batch is still rate limited after
            the last retry. Other exceptions from the client are not caught.
    """
    vo = voyageai.Client()
    texts = df[text_column].tolist()
    retries = max(0, max_retries)
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # One request plus up to `retries` repeats of the same batch.
        for attempt in range(retries + 1):
            try:
                result = vo.embed(
                    batch,
                    model=model,
                    input_type=input_type,
                    output_dimension=output_dimension
                )
                break
            except voyageai.error.RateLimitError:
                if attempt == retries:
                    raise
                print("Rate limit hit, waiting before retrying...")
                time.sleep(delay)
        embeddings.extend(result.embeddings)

    df[out_column] = embeddings
    return df

In [19]:
def embed_gemini_google(df, text_column, out_column="gemini-embedding-exp-03-07", output_dim=3072, batch_size=16):
    """
    Embed texts from a DataFrame column using the Gemini (Google) embedding API.

    Args:
        df: DataFrame holding the texts; it is modified in place.
        text_column: Name of the column with the texts to embed.
        out_column: Name of the column that receives the embeddings.
        output_dim: Requested embedding dimensionality.
        batch_size: Kept for interface compatibility. Texts are sent one per
            request, so this value does not change the requests made.

    Returns:
        ``df`` with ``out_column`` added. Rows whose request failed hold None.
    """
    client = genai.Client()
    embeddings = []
    for text in df[text_column].tolist():
        try:
            result = client.models.embed_content(
                model="gemini-embedding-001",
                contents=text,
                config=types.EmbedContentConfig(output_dimensionality=output_dim)
            )
            # The vector lives in result.embeddings[0].values; the response has
            # no `embedding` attribute, so reading it made every row None.
            embeddings.append(result.embeddings[0].values)
        except Exception as e:
            print(f"Embedding error: {e}")
            embeddings.append(None)
    df[out_column] = embeddings
    return df

In [20]:
def embed_openai_batch(df, text_column, out_column, model="text-embedding-3-large", batch_size=100, client=None):
    """
    Embed texts in batches using the OpenAI API (works for both small and large models).

    Args:
        df: DataFrame holding the texts; it is modified in place.
        text_column: Name of the column with the texts to embed.
        out_column: Name of the column that receives the embeddings.
        model: OpenAI embedding model name.
        batch_size: Number of texts sent per request.
        client: Object exposing ``embeddings.create(input=..., model=...)``.
            When omitted, an ``OpenAI`` client is built from the
            ``OPENAI_API_KEY`` environment variable.

    Returns:
        ``df`` with ``out_column`` added. If a request fails, every row of
        that batch holds None.
    """
    if client is None:
        # Imported lazily so the function also works when used on its own.
        import os
        from openai import OpenAI
        # Same variable name the rest of the notebook reads; 'open_API_KEY'
        # is not set anywhere.
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    texts = df[text_column].tolist()
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        try:
            response = client.embeddings.create(input=batch, model=model)
            batch_embeddings = [item.embedding for item in response.data]
            embeddings.extend(batch_embeddings)
        except Exception as e:
            print(f"OpenAI embedding error: {e}")
            # Keep row alignment: one placeholder per text of the failed batch.
            embeddings.extend([None] * len(batch))
    df[out_column] = embeddings
    return df

In [76]:
def embed_mistral_rowwise(df, model="mistral-embed", text_column="truncated_prepis_en", out_column="mistral_embedings_eng"):
    """
    Embed every row of ``text_column`` with Mistral, one request per row.

    Each text is passed to ``embed_single_mistral`` under a tqdm progress
    bar. The resulting vectors (None for rows that could not be embedded)
    are stored in ``out_column`` and a success count is printed.

    Returns:
        ``df`` with ``out_column`` added (the frame is modified in place).
    """
    vectors = [
        embed_single_mistral(speech, model)
        for speech in tqdm(df[text_column], desc="Mistral embedding for Slovak parliament")
    ]
    ok_count = sum(1 for vec in vectors if vec is not None)

    df[out_column] = vectors
    print(f"✅ Mistral embeddings: {ok_count}/{len(vectors)} Slovak parliamentary speeches processed")
    return df



In [68]:
def embed_gemini_google(df, text_column, out_column="gemini-embedding-exp-03-07_eng", output_dim=3072, batch_size=16):
    """
    Embed a text column with the Gemini embedding API and report progress.

    Texts are walked in slices of ``batch_size`` (this only sets the
    granularity of the progress bar); every text is sent in its own request.
    The vector is read from ``result.embeddings[0].values``. Rows whose
    request failed or returned no embeddings hold None.

    Returns:
        ``df`` with ``out_column`` added (the frame is modified in place).
    """
    gemini = genai.Client()
    all_texts = df[text_column].tolist()
    vectors = []

    slice_starts = range(0, len(all_texts), batch_size)
    for start in tqdm(slice_starts, desc="Gemini embedding for Slovak parliament"):
        for speech in all_texts[start:start + batch_size]:
            try:
                reply = gemini.models.embed_content(
                    model="gemini-embedding-001",
                    contents=speech,
                    config=types.EmbedContentConfig(output_dimensionality=output_dim)
                )

                # Guard: a response without any embedding yields a placeholder.
                if not hasattr(reply, 'embeddings') or len(reply.embeddings) <= 0:
                    print("No embeddings found in response for Slovak parliamentary text")
                    vectors.append(None)
                    continue

                vectors.append(reply.embeddings[0].values)

            except Exception as exc:
                print(f"Gemini embedding error for Slovak parliamentary analysis: {exc}")
                vectors.append(None)

    df[out_column] = vectors
    ok_count = sum(1 for vec in vectors if vec is not None)
    print(f"✅ Gemini embeddings generated for {ok_count}/{len(vectors)} Slovak parliamentary speeches")
    return df

In [23]:
# --- OpenAI embedding function (works for both small and large) ---
def embed_openai_batch(df, text_column, out_column, model="text-embedding-3-large", batch_size=100, client=None):
    """
    Embed a text column in batches with the OpenAI API, showing a progress bar.

    Args:
        df: DataFrame holding the texts; it is modified in place.
        text_column: Name of the column with the texts to embed.
        out_column: Name of the column that receives the embeddings.
        model: OpenAI embedding model name (also shown in the progress bar).
        batch_size: Number of texts sent per request.
        client: Object exposing ``embeddings.create(input=..., model=...)``.
            When omitted, an ``OpenAI`` client is built from the
            ``OPENAI_API_KEY`` environment variable.

    Returns:
        ``df`` with ``out_column`` added. If a request fails, every row of
        that batch holds None.
    """
    if client is None:
        # Same variable name the rest of the notebook reads; 'open_API_KEY'
        # is not set anywhere.
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    texts = df[text_column].tolist()
    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc=f"OpenAI {model} embedding"):
        batch = texts[start:start + batch_size]
        try:
            response = client.embeddings.create(input=batch, model=model)
            batch_embeddings = [item.embedding for item in response.data]
            embeddings.extend(batch_embeddings)
        except Exception as e:
            print(f"OpenAI embedding error: {e}")
            # Keep row alignment: one placeholder per text of the failed batch.
            embeddings.extend([None] * len(batch))
    df[out_column] = embeddings
    return df

In [36]:
# Translate the `truncated_prepis` column to English; the result is stored in
# the new column `truncated_prepis_en`.
df = translate_dataframe_column(df, column_to_translate="truncated_prepis", target_language="en", project_id=PROJECT_ID)

Translating column 'truncated_prepis'...


In [40]:
# Display the translated column as a quick visual check.
df['truncated_prepis_en']

191      32.. Mr. Fico, I am the one who removed Harabi...
197      96.Good afternoon. Dear Minister, Dear Preside...
234      10..I am speaking in the debate on this agenda...
244      66..Before I say when this will be voted on, I...
332      253.. Mr. Minister, I am not that fast. Dear M...
                               ...                        
19888    34., Mr. Chairman, colleagues, I also consider...
19890    18.. Dear Mr. Chairman, Dear Mr. Minister, col...
19892    113..Dear Mr. Chairman, Mr. Minister, esteemed...
19893    69., Mr. Chairman, esteemed colleagues, collea...
19915    162.. In the first reading, I spoke at length,...
Name: truncated_prepis_en, Length: 522, dtype: object

In [77]:
# Mistral embeddings, using the function defaults: reads `truncated_prepis_en`
# and writes `mistral_embedings_eng`.
df = embed_mistral_rowwise(df)


Mistral embedding for Slovak parliament: 100%|██████████| 522/522 [02:03<00:00,  4.24it/s]

✅ Mistral embeddings: 522/522 Slovak parliamentary speeches processed





In [None]:
# Voyage embeddings of the translated text (2048-dimensional output requested).
df = embed_truncated_prepis_voyage(df, model="voyage-3-large", input_type="document", output_dimension=2048, text_column="truncated_prepis_en", out_column="voyage-3-large_embeddings_eng")

In [69]:
# Gemini (Google) embedding of the translated text; output_dim is left at the
# function default (3072).
df = embed_gemini_google(df, text_column="truncated_prepis_en", out_column="gemini-embedding-exp-03-07_eng")

Gemini embedding for Slovak parliament: 100%|██████████| 33/33 [02:53<00:00,  5.26s/it]

✅ Gemini embeddings generated for 522/522 Slovak parliamentary speeches





In [None]:

# OpenAI large embedding of the translated text. No client is passed, so
# embed_openai_batch builds its own OpenAI client.
df = embed_openai_batch(df, text_column="truncated_prepis_en", out_column="openAI_embedding_3076_eng", model="text-embedding-3-large")

OpenAI text-embedding-3-large embedding: 100%|██████████| 6/6 [00:54<00:00,  9.11s/it]


In [73]:
# OpenAI small embedding of the translated text. No client is passed, so
# embed_openai_batch builds its own OpenAI client.
df = embed_openai_batch(df, text_column="truncated_prepis_en", out_column="openAI_embedding_small_eng", model="text-embedding-3-small")

OpenAI text-embedding-3-small embedding: 100%|██████████| 6/6 [00:11<00:00,  1.91s/it]


In [79]:
# Persist the frame, now including the translated text and the English
# embedding columns, to a separate "_eng" parquet file (pyarrow engine).
df.to_parquet(r"..\data\df_to_app_with_openAI_S_L_voyage_gdoogle_mistral_embeddings_navrh_zakona_obdobie_8_core_clear_eng.parquet", engine="pyarrow")

In [104]:
# Configuration for English embedding evaluation
# Embedding columns to evaluate; each name ends in "_eng", and the main loop
# strips that suffix to find the matching `core_*` mask column.
EMB_COLS_ENG = [
    "gemini-embedding-exp-03-07_eng",
    "openAI_embedding_3076_eng", 
    "openAI_embedding_small_eng",
    "mistral_embedings_eng",  
    "voyage-3-large_embeddings_eng"   
]

LABEL_COL = "Predkladateľ"  # ground-truth label column used for the clustering metrics
# NOTE(review): the translation cell writes its output to "truncated_prepis_en";
# no "translated_text" column is created in the visible cells. Confirm before
# using TEXT_COL.
TEXT_COL = "translated_text"  # English translated text column
SEED = 0            # random_state for the KMeans runs in evaluate_and_write
K_ELBOW_MAX = 15    # largest k tried when searching for the best silhouette
UMAP_NN = 15        # not used in the visible cells; presumably UMAP n_neighbors
UMAP_DIST = 0.05    # not used in the visible cells; presumably UMAP min_dist

In [81]:
def cluster_metrics(y_true, y_pred):
    """
    Score a clustering against true labels.

    Every cluster is mapped to the true label that occurs most often inside
    it (majority vote); macro precision / recall / F1 are computed on that
    mapped prediction. V-measure and ARI are computed on the raw cluster ids.
    Both inputs may be array-likes or pandas Series.

    Returns:
        dict with keys ``precision``, ``recall``, ``f1``, ``v_measure``, ``ari``.
    """
    # Work on Series; a non-Series prediction takes over the index of the labels.
    truth = y_true if isinstance(y_true, pd.Series) else pd.Series(y_true)
    clusters = y_pred if isinstance(y_pred, pd.Series) else pd.Series(y_pred, index=truth.index)

    # Rows are clusters, columns are true labels.
    contingency = pd.crosstab(clusters, truth)

    # Cluster id -> most frequent true label in that cluster.
    cluster_to_label = contingency.idxmax(axis=1).to_dict()
    voted = clusters.map(cluster_to_label)

    precision, recall, f1, _support = precision_recall_fscore_support(
        truth, voted, average="macro", zero_division=0
    )
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "v_measure": v_measure_score(truth, clusters),
        "ari": adjusted_rand_score(truth, clusters),
    }

In [82]:

def elbow_k(X, k_max=K_ELBOW_MAX):
    """
    Pick the number of clusters with the highest cosine silhouette score.

    Despite the name, no elbow detection is done: KMeans is fitted for every
    k from 2 to ``k_max`` and the k with the best silhouette wins.

    Args:
        X: Feature matrix passed to KMeans and silhouette_score.
        k_max: Largest k to try. None falls back to ``K_ELBOW_MAX`` (callers
            such as evaluate_and_write default to None).

    Returns:
        Tuple ``(k_star, ks, sils)``: the selected k, the array of tried k
        values, and the array of silhouette scores in the same order.

    Raises:
        ValueError: If ``k_max`` is smaller than 2, so there is no k to try.
    """
    if k_max is None:
        k_max = K_ELBOW_MAX
    if k_max < 2:
        raise ValueError(f"k_max must be at least 2, got {k_max}")

    sil = []
    for k in range(2, k_max + 1):
        lab = KMeans(k, n_init=20, random_state=0).fit_predict(X)
        sil.append(silhouette_score(X, lab, metric="cosine"))

    # Index 0 of `sil` belongs to k=2, hence the offset.
    k_star = np.argmax(sil) + 2
    return k_star, np.arange(2, k_max + 1), np.array(sil)

In [83]:
def evaluate_and_write(df, emb_col, label_col="Predkladateľ",
                       k_fixed=None, k_max=None,
                       write_prefix="cluster"):
    """
    Cluster the embeddings of one column with KMeans and score the result.

    Two clusterings are produced: one with the silhouette-selected k
    ("elbow") and one with a fixed k. Both label vectors are written into
    ``df`` as ``<write_prefix>_<emb_col>_elbow`` / ``_fixed`` (the frame is
    modified in place).

    Args:
        df: DataFrame with the embedding and label columns.
        emb_col: Column holding one embedding vector per row.
        label_col: Column holding the ground-truth labels.
        k_fixed: Number of clusters for the fixed variant; None uses the
            number of distinct labels in ``label_col``.
        k_max: Largest k tried by the silhouette search; None uses
            ``K_ELBOW_MAX``.
        write_prefix: Prefix of the cluster columns written into ``df``.

    Returns:
        dict with the metrics of both variants (``elbow``, ``fixed``, each
        including the k used), the silhouette curve (``elbow_curve``), both
        label arrays (``lab_elbow``, ``lab_fixed``) and the updated frame
        (``df_updated``).
    """
    # The default None would otherwise reach elbow_k and fail on `None + 1`.
    if k_max is None:
        k_max = K_ELBOW_MAX

    X = np.vstack(df[emb_col]).astype("float32")
    y = df[label_col].astype("category").cat.codes

    # Silhouette-selected k.
    k_star, ks, sils = elbow_k(X, k_max)
    lab_elbow = KMeans(k_star, n_init=20, random_state=SEED).fit_predict(X)
    df[f"{write_prefix}_{emb_col}_elbow"] = lab_elbow

    # Fixed k (defaults to the real number of labels).
    if k_fixed is None:
        k_fixed = df[label_col].nunique()
    lab_fixed = KMeans(k_fixed, n_init=20, random_state=SEED).fit_predict(X)
    df[f"{write_prefix}_{emb_col}_fixed"] = lab_fixed

    # Same metric set for both variants, plus the k that was used.
    m_elbow = cluster_metrics(y, lab_elbow) | {"k": k_star}
    m_fixed = cluster_metrics(y, lab_fixed) | {"k": k_fixed}

    return {
        "elbow": m_elbow,
        "fixed": m_fixed,
        "elbow_curve": (ks, sils),
        "lab_elbow": lab_elbow,
        "lab_fixed": lab_fixed,
        "df_updated": df,
    }

In [None]:
# Run the clustering evaluation for every English embedding column and
# collect the metrics in one table.
k_real = df[LABEL_COL].nunique()  # number of distinct true labels, used as the fixed k
store = {}  # embedding column -> result dict returned by evaluate_and_write
df_cores_with_clusters = {}  # embedding column -> filtered frame with the cluster columns added

print(f"🏛️ SLOVAK PARLIAMENTARY CLUSTERING ANALYSIS")
print(f"📊 Real ministries (Predkladateľ): {k_real}")
print(f"🔍 Search range: k=2 to k={K_ELBOW_MAX}")
print("="*60)

for emb in EMB_COLS_ENG:
    # Drop the last four characters ("_eng") to get the name of the boolean
    # mask column that belongs to this embedding.
    mask_col = f"core_{emb[:-4]}"
    if mask_col not in df.columns:
        raise ValueError(f"Missing mask column '{mask_col}' – "
                         "run add_core_masks() first.")
    
    # Work on a copy of the masked rows so the cluster columns are not written into df.
    df_core = df[df[mask_col]].copy()  # Keep only core speeches
    print(f"▶ {emb}  |  rows kept: {len(df_core)}")

    # Get clustering results
    result = evaluate_and_write(
        df_core,
        emb_col=emb,
        label_col=LABEL_COL,
        k_fixed=k_real,
        k_max=K_ELBOW_MAX,
        write_prefix="cluster"
    )
    
    store[emb] = result
    df_cores_with_clusters[emb] = result["df_updated"]  # Store updated df_core
    

# Flatten the results into one row per (embedding, mode); "k" is moved into
# the explicit `k_used` column and excluded from the unpacked metrics.
rows = []
for emb, d in store.items():
    rows.append({
        "embedding": emb, 
        "mode": "elbow", 
        "k_used": d["elbow"]["k"],     # Show actual k found
        **{k:v for k,v in d["elbow"].items() if k != "k"}
    })
    rows.append({
        "embedding": emb, 
        "mode": "fixed", 
        "k_used": d["fixed"]["k"],     # Show actual k used
        **{k:v for k,v in d["fixed"].items() if k != "k"}
    })

metrics_df = pd.DataFrame(rows).round(3)

print("\n📈 CLUSTERING EFFECTIVENESS RESULTS:")
print("(k_used = actual number of clusters found/used)")
display(metrics_df)

# Compare the reported k with the peak of the stored silhouette curve.
# NOTE(review): elbow_k derives the reported k from the argmax of this same
# curve, so the two values always agree and this check cannot fail.
print("\n🔍 ELBOW METHOD VALIDATION:")
for emb, d in store.items():
    ks, sils = d["elbow_curve"]
    peak_k = ks[np.argmax(sils)]
    reported_k = d["elbow"]["k"]
    match = "✅" if peak_k == reported_k else "❌"
    print(f"{match} {emb}: peak at k={peak_k}, reported k={reported_k}")
