In [1]:
from multiprocessing import Pool, cpu_count # for parallel processing
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

# Hugging Face model identifier used for tokenizer, config, and weights.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Pick the best available accelerator instead of hard-coding "mps", so the
# notebook also runs on machines without Apple Metal support.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device) # move to the selected device

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [2]:
# helper functions
# def process_chunk(texts):
#     """Process a chunk of texts and return augmented texts."""
#     augmented_texts = [augment_query(text, model, tokenizer, config) for text in texts]
#     return augmented_texts

# def parallel_process(df, function, n_cores=cpu_count()):
#     """Split DataFrame into chunks and process each chunk in parallel."""
#     df_split = np.array_split(df, n_cores)
#     pool = Pool(n_cores)
#     df = pd.concat(pool.map(function, [chunk["text"].tolist() for chunk in df_split]))
#     pool.close()
#     pool.join()
#     return df

def listostring(s):
    """Join a list of strings into one space-separated string.

    Args:
        s (List): A list of strings

    Returns:
        str: The elements of ``s`` joined by single spaces.
    """
    return " ".join(s)

def augment_query(query, model, tokenizer, config, split_parts=4, device="cpu"):
    """Split a text into word chunks and append each chunk's predicted label.

    The query is split on single spaces and cut into consecutive chunks of
    ``len(words) // split_parts`` words (at least one word per chunk). Each
    chunk is classified with ``model`` and followed by ``[<label>]`` in the
    returned string.

    Args:
        query (str): The text to augment.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object with a ``logits`` tensor whose
            first row holds the class scores.
        tokenizer: Callable invoked as ``tokenizer(phrase, return_tensors='pt')``;
            its result must support ``.to(device)`` and ``**`` unpacking.
        config: Object whose ``id2label`` maps a class index to a label string.
        split_parts (int): Divisor used to derive the chunk length in words.
        device: Device the encoded inputs are moved to before the forward pass.

    Returns:
        str: Each chunk followed by ``" [<label>] "``, concatenated in order
        (the result ends with a trailing space).

    Raises:
        ZeroDivisionError: If ``split_parts`` is 0.
    """
    words = query.split(" ")  # separate into words
    # Chunk length in words; clamp to 1 so short queries still make progress.
    chunk_len = max(1, len(words) // split_parts)
    phrases = [
        " ".join(words[start:start + chunk_len])
        for start in range(0, len(words), chunk_len)
    ]

    # NOTE(review): chunks are not truncated; confirm inputs never exceed the
    # model's maximum sequence length.
    tagged = []
    # Inference only: disable autograd so no graph is recorded per forward pass.
    with torch.no_grad():
        for phrase in phrases:
            # Includes both the input_ids and the attention_mask.
            encoded_input = tokenizer(phrase, return_tensors='pt').to(device)
            logits = model(**encoded_input).logits[0]
            # Softmax is monotonic, so the argmax of the logits is the top label.
            label_id = int(logits.argmax())
            tagged.append(f"{phrase} [{config.id2label[label_id]}] ")

    return "".join(tagged)

## Augmenting Train Data

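Each text is split into chunks of `len(words) // 4` words (at least one word per chunk), and each chunk is tagged with its predicted sentiment label. For texts of 12 or more words this yields 4–5 chunks; shorter texts can yield anywhere from 1 to 7.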

In [3]:
original_path = "data/train_2024.csv"
augmented_path = "augmented_data/train_2024.csv"

# Create the output directory up front so the save cannot fail on a missing
# folder after the (slow) augmentation has already run.
Path(augmented_path).parent.mkdir(parents=True, exist_ok=True)

# Load the original CSV file (quoting=3 is csv.QUOTE_NONE: quote characters
# are read as literal text rather than as field delimiters).
df = pd.read_csv(original_path, quoting=3)
print(f"Data loaded from {original_path}")

# Apply the augmentation function to the 'text' column
print("Augmenting data...")
df['text'] = df['text'].apply(lambda x: augment_query(x, model, tokenizer, config, device=device))

# Save the augmented data to a new CSV file
df.to_csv(augmented_path, index=False)

print(f"Data augmentation completed and saved to {augmented_path}")

Data loaded from data/train_2024.csv
Augmenting data...
Data augmentation completed and saved to augmented_data/train_2024.csv


In [4]:
original_path = "data/test_2024.csv"
augmented_path = "augmented_data/test_2024.csv"

# Create the output directory up front so the save cannot fail on a missing
# folder after the (slow) augmentation has already run.
Path(augmented_path).parent.mkdir(parents=True, exist_ok=True)

# Load the original CSV file (quoting=3 is csv.QUOTE_NONE: quote characters
# are read as literal text rather than as field delimiters).
df = pd.read_csv(original_path, quoting=3)
print(f"Data loaded from {original_path}")

# Apply the augmentation function to the 'text' column
print("Augmenting data...")
df['text'] = df['text'].apply(lambda x: augment_query(x, model, tokenizer, config, device=device))

# Save the augmented data to a new CSV file
df.to_csv(augmented_path, index=False)

print(f"Data augmentation completed and saved to {augmented_path}")

Data loaded from data/test_2024.csv
Augmenting data...
Data augmentation completed and saved to augmented_data/test_2024.csv


In [5]:
original_path = "data/dev_2024.csv"
augmented_path = "augmented_data/dev_2024.csv"

# Create the output directory up front so the save cannot fail on a missing
# folder after the (slow) augmentation has already run.
Path(augmented_path).parent.mkdir(parents=True, exist_ok=True)

# Load the original CSV file (quoting=3 is csv.QUOTE_NONE: quote characters
# are read as literal text rather than as field delimiters).
df = pd.read_csv(original_path, quoting=3)
print(f"Data loaded from {original_path}")

# Apply the augmentation function to the 'text' column
print("Augmenting data...")
df['text'] = df['text'].apply(lambda x: augment_query(x, model, tokenizer, config, device=device))

# Save the augmented data to a new CSV file
df.to_csv(augmented_path, index=False)

print(f"Data augmentation completed and saved to {augmented_path}")

Data loaded from data/dev_2024.csv
Augmenting data...
