In [1]:
import os

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Normalise tweet text: mentions and links become placeholder tokens
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and
    is longer than one character becomes ``@user``; a token that starts
    with ``http`` becomes ``http``. All other tokens are kept unchanged and
    the tokens are re-joined with single spaces.
    """
    def _placeholder(token):
        # A lone "@" is not treated as a mention.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))


# Hugging Face checkpoint shared by the tokenizer, config and model below.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Config and tokenizer are loaded independently from the same checkpoint.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# PyTorch sequence-classification model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
def get_cls_embedding(tweet, max_length=512):
    """Return the last-layer hidden state of the first token for ``tweet``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        tweet: Text to embed; passed to the tokenizer as-is.
        max_length: Maximum number of tokens fed to the model; longer inputs
            are truncated. The default of 512 is assumed to match this
            checkpoint's positional limit -- confirm before reusing with a
            different model.

    Returns:
        A 2-D numpy array (batch, hidden size); one row for a single string.
    """
    # NOTE(review): preprocess() defined in this notebook is not applied
    # here; confirm whether tweets should be normalised before embedding.
    encoded_input = tokenizer(
        tweet,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Inference only: skip autograd bookkeeping so no graph is retained for
    # every tweet that gets embedded.
    with torch.no_grad():
        output = model(**encoded_input, output_hidden_states=True)

    last_layer_hidden_states = output.hidden_states[-1]
    # Use the embedding of the first token (index 0) of each sequence.
    cls_embedding = last_layer_hidden_states[:, 0, :]
    # .cpu() keeps the numpy conversion valid if the model is moved to a GPU.
    return cls_embedding.detach().cpu().numpy()




Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
from sklearn.cluster import KMeans

import os
# Presumably set to avoid the tokenizers fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

csv_path = "/Users/ruipenghan/projects/research/ssbrl/data/paper_data_merged/EDCA_filtered_data_10_5_5_20000_tree_vis_edca_labeled_merged.csv"

# Read the CSV, then keep only the first row for each distinct index_text
# and, after that, for each distinct text.
data = (
    pd.read_csv(csv_path)
    .drop_duplicates(subset="index_text", keep="first")
    .drop_duplicates(subset="text", keep="first")
)

# Parallel lists: tweets[k] is the text whose identifier is index_text[k].
index_text = data["index_text"].tolist()
tweets = data["text"].tolist()
assert len(tweets) == len(index_text)

# One embedding per remaining tweet, in the same order as `tweets`.
embeddings = [get_cls_embedding(tweet) for tweet in tweets]


# Run k-means on the per-tweet embeddings.
# Stack the list of per-tweet arrays into one array, then flatten each
# tweet's entry into a single row: (n_tweets, 1, hidden) -> (n_tweets, hidden).
# An explicit reshape (instead of np.squeeze) never drops the sample axis.
embeddings_array = np.array(embeddings)
embeddings_array = embeddings_array.reshape(len(embeddings), -1)

# Two clusters. random_state makes the cluster assignment (and therefore
# which cluster gets id 0) reproducible across runs; n_init is pinned so the
# result does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

# Fit the KMeans model to the embeddings
kmeans.fit(embeddings_array)

# Cluster id for each embedding, in the same order as `embeddings`
cluster_labels = kmeans.labels_

# Print the cluster labels
# print(cluster_labels)
# Write the cluster-derived label back onto the full (non-deduplicated) CSV.
clustered_data = pd.read_csv(csv_path)

# NOTE(review): cluster id 0 is mapped to "supportive" and every other id to
# "opposing"; k-means ids carry no inherent meaning, so confirm this mapping
# against known examples before trusting the label names.
label_by_index_text = {
    idx_text: ("supportive" if label == 0 else "opposing")
    for idx_text, label in zip(index_text, cluster_labels)
    # A missing index_text never compares equal to any row, so skip it.
    if pd.notna(idx_text)
}

# One dictionary lookup per row instead of a full-column comparison per tweet.
mapped_labels = clustered_data['index_text'].map(label_by_index_text)

# Only rows whose index_text was clustered are written; other rows keep
# whatever 'gpt_label' value they already had (if any).
has_label = mapped_labels.notna()
clustered_data.loc[has_label, 'gpt_label'] = mapped_labels[has_label]

# Output location for the labelled CSV (relative to the working directory).
folder_path = "labelled/edca"
file_path = os.path.join(folder_path, "clustered_data.csv")

# Create the folder (and any missing parents); exist_ok avoids the
# check-then-create race and makes the cell safe to re-run.
os.makedirs(folder_path, exist_ok=True)

# Save the dataframe as a CSV file, without the pandas row index
clustered_data.to_csv(file_path, index=False)


In [None]:
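# Disabled scratch code: ranks and prints class scores from a model output.
# `output` is not defined at notebook scope, so this will not run as-is.
# scores = output[0][0].detach().numpy()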
# scores = softmax(scores)

# ranking = np.argsort(scores)
# ranking = ranking[::-1]
# for i in range(scores.shape[0]):
#     l = config.id2label[ranking[i]]
#     s = scores[ranking[i]]
#     print(f"{i+1}) {l} {np.round(float(s), 4)}")