In [1]:
import argparse
import os
import time
import numpy as np
import pandas as pd

from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.cluster import KMeans
from itertools import permutations

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Twitter-Roberta in eval mode
tokenizer = RobertaTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model = RobertaModel.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [3]:
device

device(type='cuda')

In [5]:
def get_cls_embedding(tweet):
    """
    Return the CLS embedding from the last hidden state of RoBERTa
    """
    encoded_input = tokenizer(tweet, 
                              return_tensors='pt', 
                              truncation=True, 
                              max_length=512)
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    
    with torch.no_grad():
        output = model(**encoded_input, output_hidden_states=True)
        last_layer_hidden_states = output.hidden_states[-1]  # shape: (batch, seq_len, hidden_dim)
        cls_embedding = last_layer_hidden_states[:, 0, :]    # shape: (batch, hidden_dim)
    return cls_embedding.cpu().numpy().squeeze()


In [6]:
data_path = "/home/path1/path2/ssbrl/data/learning-to-slice/US_election_dataset.csv"
output_path = "labled_data/kmeans_labelled.csv"

In [12]:
df = pd.read_csv(data_path)
df = df.drop_duplicates(subset="index_text", keep="first").reset_index(drop=True)

# Subset of labeled rows for discovering the best mapping
df_labeled = df[df["is_gt"] == 1].copy()
if len(df_labeled) == 0:
    raise ValueError("No labeled rows found (is_gt==1). Cannot find best mapping!")

df_labeled = df_labeled[df_labeled["manual_label"] != "neutral"]

# We assume df_labeled["manual_label_full"] has exactly 4 possible classes
unique_labels = df_labeled["manual_label_full"].unique()
unique_labels = sorted(unique_labels)  # ensure consistent order
if len(unique_labels) != 4:
    print("Warning: found these ground-truth labels in the labeled set:", unique_labels)
    print("But we are expecting exactly 4 distinct classes. Proceeding anyway...")

# Map label string -> integer index (0..3)
label2idx = {lab: i for i, lab in enumerate(unique_labels)}
idx2label = {v: k for k, v in label2idx.items()}

In [13]:
# Inspections:
print(label2idx)
print(idx2label)

{'opposing-Candidate_Advocacy': 0, 'opposing-Election_Legitimacy': 1, 'supportive-Candidate_Advocacy': 2, 'supportive-Election_Legitimacy': 3}
{0: 'opposing-Candidate_Advocacy', 1: 'opposing-Election_Legitimacy', 2: 'supportive-Candidate_Advocacy', 3: 'supportive-Election_Legitimacy'}


In [14]:
# --------------------------
# Embed ALL TWEETS
# --------------------------
print(f"Embedding all {len(df)} tweets...")
start_time = time.time()
embeddings = []
for text in df["text"]:
    emb = get_cls_embedding(text)
    embeddings.append(emb)
embeddings_array = np.vstack(embeddings)  # shape: (N, hidden_dim)

Embedding all 4159 tweets...


In [15]:
# --------------------------
# KMeans (4 clusters)
# --------------------------
print("Running KMeans(n_clusters=4)...")
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(embeddings_array)
end_time = time.time()
print(f"Done embedding. Time: {end_time - start_time:.2f}s")
cluster_ids = kmeans.labels_  # cluster assignments for each row in df

Running KMeans(n_clusters=4)...
Done embedding. Time: 22.62s


In [16]:
# Store cluster in df
df["cluster_id"] = cluster_ids
np.unique(cluster_ids)

array([0, 1, 2, 3], dtype=int32)

In [19]:
# --------------------------
#  Find best permutation using ONLY the labeled subset
# --------------------------
# (1) We'll extract the cluster assignments for the labeled portion
labeled_indices = df_labeled.index  # row indices in df
labeled_clusters = df.loc[labeled_indices, "cluster_id"].values

# (2) Convert ground-truth label -> label_idx
df_labeled["label_idx"] = df_labeled["manual_label_full"].map(label2idx)
labeled_gt = df_labeled["label_idx"].values  # ground truth array (0..3)

best_accuracy = -1.0
best_mapping = {}

# Permutations of [0,1,2,3]
for perm in permutations(range(4)):
    # clusterID -> label_idx
    cluster2label = {cid: perm[cid] for cid in range(4)}

    # Predicted label for the labeled portion
    pred_label_idx = [cluster2label[c] for c in labeled_clusters]
    
    # Compute accuracy among labeled data
    correct = sum(
        1 for gt, pred in zip(labeled_gt, pred_label_idx) if gt == pred
    )
    accuracy = correct / len(labeled_gt)
    print(f"perm: {perm}")
    print(f"----acc: {accuracy}")
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_mapping = cluster2label

print(f"Best accuracy among labeled data = {best_accuracy:.4f}")
print(f"Best cluster->label mapping: {best_mapping}")

perm: (0, 1, 2, 3)
----acc: 0.24
perm: (0, 1, 3, 2)
----acc: 0.22
perm: (0, 2, 1, 3)
----acc: 0.30333333333333334
perm: (0, 2, 3, 1)
----acc: 0.24
perm: (0, 3, 1, 2)
----acc: 0.29
perm: (0, 3, 2, 1)
----acc: 0.24666666666666667
perm: (1, 0, 2, 3)
----acc: 0.29
perm: (1, 0, 3, 2)
----acc: 0.27
perm: (1, 2, 0, 3)
----acc: 0.2866666666666667
perm: (1, 2, 3, 0)
----acc: 0.22666666666666666
perm: (1, 3, 0, 2)
----acc: 0.2733333333333333
perm: (1, 3, 2, 0)
----acc: 0.23333333333333334
perm: (2, 0, 1, 3)
----acc: 0.3
perm: (2, 0, 3, 1)
----acc: 0.23666666666666666
perm: (2, 1, 0, 3)
----acc: 0.23333333333333334
perm: (2, 1, 3, 0)
----acc: 0.17333333333333334
perm: (2, 3, 0, 1)
----acc: 0.24
perm: (2, 3, 1, 0)
----acc: 0.24333333333333335
perm: (3, 0, 1, 2)
----acc: 0.29333333333333333
perm: (3, 0, 2, 1)
----acc: 0.25
perm: (3, 1, 0, 2)
----acc: 0.22666666666666666
perm: (3, 1, 2, 0)
----acc: 0.18666666666666668
perm: (3, 2, 0, 1)
----acc: 0.24666666666666667
perm: (3, 2, 1, 0)
----acc: 0.25
B

In [20]:
# --------------------------
#  Final predictions for ALL data
# --------------------------
final_pred_idx = [best_mapping[c] for c in df["cluster_id"]]
pred_labels_str = [idx2label[i] for i in final_pred_idx]
df["pred_label"] = pred_labels_str

# (Optional) measure final accuracy on the labeled subset (should match best_accuracy)
labeled_pred_idx = [best_mapping[c] for c in labeled_clusters]
correct_final = sum(1 for gt, pred in zip(labeled_gt, labeled_pred_idx) if gt == pred)
final_accuracy = correct_final / len(labeled_gt)
print(f"Final accuracy on labeled data = {final_accuracy:.4f}")

Final accuracy on labeled data = 0.3033


In [21]:
# --------------------------
#  Save results
# --------------------------
if output_path is not None:
    df.to_csv(output_path, index=False)
    print(f"Saved final results (including pred_label) to {output_path}")
else:
    print("No output_path given. Not saving CSV.")

Saved final results (including pred_label) to labled_data/kmeans_labelled.csv
