In [None]:
from simpletransformers.classification import ClassificationModel
from urllib import request
import numpy as np
import pandas as pd
import logging
import random
import torch

In [None]:
# Load both CSV splits in one pass: the training set and the dev set
# (the dev set is what the rest of the notebook calls the test split).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_set.csv", "data/dev_set.csv")
)

# Baselines

## Bag-of-Words

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download required NLTK resources
# NOTE(review): presumably 'punkt' / 'punkt_tab' back `word_tokenize` and
# 'stopwords' backs `stopwords.words(...)` used by the preprocessing cell —
# confirm which of the two tokenizer packages the installed NLTK version needs.
# The value of the last call is what the notebook shows as this cell's output.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:



# Missing texts become empty strings so later string operations never see NaN
for frame in (train_df, test_df):
    frame["text"] = frame["text"].fillna("")


# Preprocessing helpers
def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once.

    The set is cached on the function object, so the stopword list is fetched a
    single time per kernel instead of once per token, and membership tests are
    constant-time set lookups rather than list scans.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(text):
    """Normalise one document for the Bag-of-Words baseline.

    Steps, in order: lowercase, replace every run of non-word characters with a
    single space, tokenize with ``word_tokenize``, drop English stopwords.

    Args:
        text: The raw document as a ``str`` (missing values must already have
            been replaced, since ``.lower()`` is called on it directly).

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _english_stopwords()
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation and special characters
    tokens = word_tokenize(text)  # Tokenize
    return ' '.join(word for word in tokens if word not in stop_words)  # Remove stopwords

# Derive the cleaned "processed_text" column for both splits
for frame in (train_df, test_df):
    frame["processed_text"] = frame["text"].apply(preprocess_text)



In [None]:
# Shuffle rows with a fixed seed so the row order (and everything derived from
# it further down) is the same after every Restart & Run All.
SHUFFLE_SEED = 42
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Features are the preprocessed texts, targets are the labels
X_train = train_df["processed_text"]
y_train = train_df["label"]

X_test = test_df["processed_text"]
y_test = test_df["label"]

# Class balance of the training labels (shown as the cell output)
y_train.value_counts()

In [None]:
# Create Bag-of-Words features
# binary=False keeps the per-document term counts instead of 0/1 presence flags.
vectorizer = CountVectorizer(binary=False)
# The vocabulary is fitted on the training texts only; the test texts are then
# encoded with that same fitted vectorizer.
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Number of distinct features learned from the training texts
print("Vocabulary Size:", len(vectorizer.get_feature_names_out()))

In [None]:
# Train Logistic Regression Model
# The classifier is created with scikit-learn's default hyperparameters.
# NOTE(review): if fitting emits a ConvergenceWarning on these count features,
# consider raising max_iter — confirm before comparing baseline numbers.
clf = LogisticRegression()
clf.fit(X_train_bow, y_train)

# Make predictions
y_pred = clf.predict(X_test_bow)

# Evaluate model: overall accuracy, per-class report, and confusion matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


## DeepSeek v3

In [None]:
from openai import OpenAI
from dotenv import load_dotenv
import os


class DeepSeekApi:
  """Small wrapper around the DeepSeek chat endpoint, accessed through the OpenAI client.

  The API key is read from the ``DEEPSEEK_API_KEY`` environment variable (after
  loading a ``.env`` file). The client object is created on first use and then
  reused for every request.
  """

  BASE_URL = "https://api.deepseek.com"
  MODEL_NAME = "deepseek-chat"

  def __init__(self):
    # Obtain API key
    load_dotenv()
    self.api_key = os.getenv("DEEPSEEK_API_KEY")
    # Built lazily in _get_client so constructing the wrapper itself never fails.
    self._client = None

  # Function to call DeepSeek API to rephrase text
  def rephrase(self, text):
    """Ask the model to rephrase ``text`` and return the raw reply."""
    prompt = f"rephrase: {text}"
    return self._call_api(prompt)

  # Function to call DeepSeek API to classify if text is PCL or not (baseline)
  def pcl_classify(self, text):
    """Ask the model whether ``text`` is PCL and return the raw reply."""
    return self._call_api(self._pcl_prompt(text))

  # Function to call DeepSeek API to classify if text is PCL or not in batches (baseline)
  def batch_pcl_classify(self, texts, batch_size):
    """Classify several texts with one request.

    Args:
      texts: Iterable of sentences to classify.
      batch_size: Number of answers expected back (normally ``len(texts)``).

    Returns:
      List of ``batch_size`` answer strings, in the order of ``texts``.

    Raises:
      ValueError: If the reply does not contain exactly ``batch_size`` answers.
    """
    prompts = [self._pcl_prompt(text) for text in texts]
    return self._batch_call_api(prompts, batch_size)

  @staticmethod
  def _pcl_prompt(text):
    """Build the PCL classification prompt shared by the single and batch calls.

    NOTE(review): the wording (including its typos) is kept exactly as it was so
    that new predictions stay comparable with earlier runs.
    """
    return f"Accoording to the paper \"Don’t Patronize Me! An Annotated Dataset with Patronizing and Condescending Language towards Vulnerable Communities\", please classify this sentence \"{text}\" on wheter it is considered a Patronizing and Condescendig Language (PCL) or not. Just reply me with either \"True\" if you think this is ccnsiderd PCL, or \"False\" if you think otherwise."

  def _get_client(self):
    """Return the shared OpenAI client, creating it on first use."""
    if getattr(self, "_client", None) is None:
      # Fall back to the environment in case the key was exported after __init__ ran.
      api_key = getattr(self, "api_key", None) or os.getenv("DEEPSEEK_API_KEY")
      self._client = OpenAI(api_key=api_key, base_url=self.BASE_URL)
    return self._client

  def _chat(self, messages):
    """Send ``messages`` as one non-streaming chat request and return the reply text."""
    response = self._get_client().chat.completions.create(
        model=self.MODEL_NAME,
        messages=messages,
        stream=False
    )
    return response.choices[0].message.content

  def _call_api(self, prompt):
    """Send a single user prompt and return the model's reply text."""
    return self._chat([{"role": "user", "content": prompt}])

  @staticmethod
  def _parse_predictions(raw):
    """Turn a raw reply into a list of answers, one per non-empty line.

    Handles the reply shapes seen in practice: plain ``True``/``False`` lines,
    numbered lists (``1. True``), trailing punctuation (``False.``) and any line
    break style. Whitespace inside a line is removed, as before. Recognised
    answers are normalised to ``"True"`` / ``"False"``; anything else is kept
    as-is so that the caller's count check still sees it.
    """
    predictions = []
    for line in (raw or "").splitlines():
      token = "".join(line.split())
      # Drop list numbering such as "1." or "2)" in front of the answer.
      unnumbered = token.lstrip("0123456789")
      if unnumbered != token and unnumbered[:1] in (".", ")", ":", "-"):
        token = unnumbered[1:]
      # Drop surrounding punctuation/markup, e.g. "False." or "**True**".
      token = token.strip(".,;:!*\"'`")
      if not token:
        continue
      if token.lower() in ("true", "false"):
        token = token.capitalize()
      predictions.append(token)
    return predictions

  def _batch_call_api(self, prompts, batch_size):
    """Send all prompts in one chat request and parse one answer per prompt.

    Every prompt becomes its own user message in a single conversation; the
    single reply is expected to contain one answer per line.

    Raises:
      ValueError: If the number of parsed answers differs from ``batch_size``.
    """
    messages = [{"role": "user", "content": prompt} for prompt in prompts]
    raw = self._chat(messages)

    logging.info(f"Original: {raw}")
    predictions = self._parse_predictions(raw)
    logging.info(f"Parsed predictions: {predictions}")

    if len(predictions) != batch_size:
      raise ValueError("Error processing predictions from DeepSeek response.")

    return predictions

In [None]:
# Configuration for the DeepSeek dev-set baseline run.
# NOTE(review): MAX_NUM_PARAPHRASES is not read by the DeepSeek code in this
# section; it is assigned again (same value) before the paraphrasing loop,
# which is the only place it is used.
MAX_NUM_PARAPHRASES = 6
# Number of sentences sent to DeepSeek per request
BATCH_SIZE = 1
# Log file passed to logging.basicConfig
LOG_FILEPATH = 'deepseek_api_logs/deepseek_dev_baseline.log'
# CSV that is rewritten after every batch with the rows classified so far
SAVE_CSV_FILEPATH = 'data/deepseek_dev_baseline.csv'

def add_predictions_to_new_df(predictions, index, old_df, new_df):
    """Append one batch of rows from ``old_df`` to ``new_df`` with their predictions.

    The rows ``index .. index + len(predictions) - 1`` of ``old_df`` are copied
    with ALL their columns and get an extra ``prediction`` column. Keeping every
    column matters: the evaluation code reads the ``label`` column back from the
    saved frame, which is lost if only the text column is copied.

    Args:
        predictions: Sequence with one prediction per row of the batch.
        index: Positional index in ``old_df`` of the first row of the batch.
        old_df: Source dataframe the batch was taken from.
        new_df: Dataframe accumulated so far (may be empty).

    Returns:
        A new dataframe: ``new_df`` followed by the batch rows, re-indexed 0..n-1.

    Raises:
        ValueError: If there are more predictions than rows left in ``old_df``.
    """
    predictions = list(predictions)
    stop = min(index + len(predictions), len(old_df))
    batch = old_df.iloc[index:stop].copy()
    if len(batch) != len(predictions):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(batch)} rows starting at index {index}."
        )
    batch["prediction"] = predictions
    batch = batch.reset_index(drop=True)
    # Nothing accumulated yet: the batch is the result (avoids concatenating an empty frame).
    if new_df is None or new_df.empty:
        return batch
    # One concat per batch instead of one per row.
    return pd.concat([new_df, batch], ignore_index=True)

def _to_binary_label(value):
    """Map one prediction/label to the integer 0 or 1.

    Accepts 0/1 numbers, booleans, and the strings "True"/"False"/"1"/"0" in any
    letter case (surrounding whitespace, quotes and a trailing period are ignored).

    Raises:
        ValueError: If the value cannot be interpreted as a binary label.
    """
    if isinstance(value, str):
        token = value.strip().strip(".\"'").lower()
        if token in ("true", "1"):
            return 1
        if token in ("false", "0"):
            return 0
        raise ValueError(f"Cannot interpret {value!r} as a binary label.")
    number = int(value)  # also covers bool / numpy scalars; NaN raises ValueError
    if number not in (0, 1):
        raise ValueError(f"Cannot interpret {value!r} as a binary label.")
    return number


def calc_f1(preds, labels):
    """Return the F1 score of the positive class (label 1).

    Predictions produced by the DeepSeek prompt are "True"/"False" answers while
    the gold labels are 0/1, so both sides are first normalised to integers.
    The report is computed over the fixed label set [0, 1] so the "1" entry is
    always present, even when the positive class is missing from the inputs.

    Args:
        preds: Iterable of predictions (0/1, booleans or "True"/"False" strings).
        labels: Iterable of gold labels in any of the same formats.

    Returns:
        F1 score of class 1 as a float (0.0 when it is undefined).

    Raises:
        ValueError: If a prediction or label is not a recognisable binary value.
    """
    preds = np.array([_to_binary_label(pred) for pred in preds])
    labels = np.array([_to_binary_label(label) for label in labels])
    report = classification_report(
        labels, preds, labels=[0, 1], output_dict=True, zero_division=0
    )
    return report["1"]["f1-score"]

In [None]:

# Get the dev dataset (tab-separated file)
dev_df = pd.read_csv("../data/dev_set.tsv", delimiter="\t").reset_index(drop=True)

# Initialise a new empty dataframe
new_df = pd.DataFrame()

# Initialise DeepSeek api
deepseek = DeepSeekApi()

# Make sure the folders for the log file and the results CSV exist; otherwise
# logging.basicConfig / to_csv fail with FileNotFoundError on a fresh checkout.
for _path in (LOG_FILEPATH, SAVE_CSV_FILEPATH):
    _folder = os.path.dirname(_path)
    if _folder:
        os.makedirs(_folder, exist_ok=True)

# Set up logging to a file
logging.basicConfig(filename=LOG_FILEPATH, level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

index = 0
while index < len(dev_df):
    try:
        logging.info(f"Processing from index {index}.")
        texts = []
        for i in range(index, min(index + BATCH_SIZE, len(dev_df))):
            # Column 4 is used as the sentence to classify.
            # NOTE(review): confirm this position against the TSV header.
            text = dev_df.iloc[i, 4]
            texts.append(text)
            logging.info(f"Sentence: {text}")

        # One answer is expected per sentence in this batch (the last batch may be shorter)
        predictions = deepseek.batch_pcl_classify(texts, len(texts))
        logging.info(f"Predictions: {predictions}")
        # Append a new entry to new DataFrame
        new_df = add_predictions_to_new_df(predictions, index, dev_df, new_df)
        # Save the new dataframe after every batch so progress survives an interruption
        new_df.to_csv(SAVE_CSV_FILEPATH, index=False, encoding="utf-8")
    except Exception as e:
        # Best effort: a failing batch is logged (with traceback) and skipped, so the
        # saved file can contain fewer rows than dev_df.
        logging.exception(f"An error occurred: {e}")
    finally:
        # Increment index by batch size, whether or not the batch succeeded
        index += BATCH_SIZE

# Read the new csv file that contains the model's predictions
preds_df = pd.read_csv(SAVE_CSV_FILEPATH).reset_index(drop=True)

# Get the predictions
model_preds = preds_df["prediction"]
# Get the original labels
labels  = preds_df["label"]
# Calculate F1 score
f1_score = calc_f1(model_preds, labels)
print(f"The F1 score for DeepSeek v3 PCL binary classifaction: {f1_score}")

# Paraphrase training data

In [None]:
from parrot import Parrot


class ParrotParaphraser:
    """Owns a Parrot model instance and exposes a single paraphrase call."""

    def __init__(self):
        """Create the Parrot model for the "prithivida/parrot_paraphraser_on_T5" tag."""
        model_tag = "prithivida/parrot_paraphraser_on_T5"
        self.model = Parrot(model_tag=model_tag)

    # Function to paraphrase a sentence using Parrot pretrained model
    def paraphrase(self, sentence):
        """Return whatever ``Parrot.augment`` produces for ``sentence``.

        The calling loop treats a falsy result as "no paraphrase available" and
        otherwise iterates over it, taking element ``[0]`` of each item as the text.
        """
        candidates = self.model.augment(input_phrase=sentence)
        return candidates

In [None]:
# Upper bound on the number of texts kept per PCL entry in the paraphrasing loop.
# The list it is compared against starts with the original text, so the original
# counts towards this limit.
MAX_NUM_PARAPHRASES = 6

def add_text_entry_to_new_df(sentence, index, old_df, new_df):
    """Append a copy of one row of ``old_df`` to ``new_df`` with its text replaced.

    Args:
        sentence: Text to store in the copied row's ``text`` column.
        index: Positional index of the row in ``old_df`` to copy.
        old_df: Dataframe the row is taken from (left unmodified).
        new_df: Dataframe the row is appended to.

    Returns:
        A new dataframe with the rows of ``new_df`` followed by the copied row,
        re-indexed from 0. Progress is printed for every call.
    """
    print("Adding text of index", index)
    row_copy = old_df.iloc[[index]].copy()
    print("Entry:", row_copy)
    row_label = row_copy.index[0]
    row_copy.at[row_label, 'text'] = sentence
    frames = [new_df, row_copy]
    return pd.concat(frames, ignore_index=True)

In [None]:
# Initialise a new empty dataframe
new_df = pd.DataFrame(columns=train_df.columns)

# Initialise Parrot paraphrasing model
parrot_paraphraser = ParrotParaphraser()

# Filter and get only the entries that are labeled as PCL
pcl_df = train_df[train_df['label'] == 1].reset_index(drop=True)

for index in pcl_df.index:
    original_text = pcl_df.at[index, "text"]
    # Texts already emitted for this entry; the original counts towards the cap
    paraphrases = [original_text]
    output = parrot_paraphraser.paraphrase(original_text)
    # Add the original entry to the new dataframe
    new_df = add_text_entry_to_new_df(original_text, index, pcl_df, new_df)
    # If sentence cannot be paraphrased, skip
    if not output:
        continue
    for paraphrase in output:
        if len(paraphrases) >= MAX_NUM_PARAPHRASES:
            break
        # Each item carries the paraphrased text at position 0. Deduplicate on that
        # text, so a paraphrase equal to the original (or to an earlier one) is
        # not added a second time.
        paraphrase_text = paraphrase[0]
        if paraphrase_text not in paraphrases:
            paraphrases.append(paraphrase_text)
            # Append a new entry to new DataFrame. Only the 'text' column is replaced;
            # every other column is copied unchanged from the original entry.
            new_df = add_text_entry_to_new_df(paraphrase_text, index, pcl_df, new_df)

# Add all original non pcl entries to the new dataframe
non_pcl_df = train_df[train_df['label'] == 0]
new_df = pd.concat([new_df, non_pcl_df], ignore_index=True)

# Save the new dataframe
# NOTE(review): train_deberta's default train_file is
# 'data/train_paraphrase_upsampled.csv', which differs from the path written
# here — confirm which file training is meant to read.
new_df.to_csv("data/train_set_cleaned_paraphrase_upsampled.csv", index=False, encoding="utf-8")

new_df

# Model training

In [None]:
def f1(tp, fp, fn):
    """Compute F1 from raw counts as ``tp / (tp + 0.5 * (fp + fn))``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        The F1 score, or ``0`` when the denominator is zero.
    """
    denominator = tp + 0.5 * (fp + fn)
    return tp / denominator if denominator != 0 else 0

def augment_text(row, deletion_prob=0.05, swap_prob=0.3, rng=None):
    """Create a noisy copy of ``row['text']`` by randomly swapping and deleting words.

    Each word position is, with probability ``swap_prob``, swapped with a
    uniformly chosen position; afterwards each word is dropped with probability
    ``deletion_prob``.

    Args:
        row: Mapping / dataframe row with a ``'text'`` entry.
        deletion_prob: Probability of dropping each word.
        swap_prob: Probability of swapping each position with a random one.
        rng: Optional ``random.Random`` instance for reproducible augmentation.
            Defaults to the global ``random`` module, as before.

    Returns:
        The augmented text. Missing text (``None`` / NaN) and empty text yield
        ``''``; a non-empty text never becomes empty, because at least one word
        is kept when every word would have been deleted.
    """
    rng = random if rng is None else rng

    text = row['text']
    # Empty strings written to CSV are read back as NaN, so guard before .split().
    if text is None or (not isinstance(text, str) and pd.isna(text)):
        return ''

    # Tokenize the text
    words = str(text).split()
    if not words:
        return ''

    last_position = len(words) - 1
    for i in range(len(words)):
        if rng.random() < swap_prob:
            swap_i = rng.randint(0, last_position)
            if swap_i != i:  # Ensure not swapping with itself
                words[i], words[swap_i] = words[swap_i], words[i]

    # Apply deletion
    kept = [word for word in words if rng.random() >= deletion_prob]
    if not kept:
        # Do not hand an empty training example to the model
        kept = [rng.choice(words)]

    # Reconstruct the augmented text
    return ' '.join(kept)

def _unfreeze_top_layers(model, num_layers_unfrozen):
    """Freeze the whole network, then re-enable the last N encoder layers and the classifier head."""
    encoder_layers = list(model.model.deberta.encoder.layer)
    num_total_layers = len(encoder_layers)
    layers_to_unfreeze = max(0, min(num_layers_unfrozen, num_total_layers))

    for param in model.model.parameters():
        param.requires_grad = False  # Freeze everything first

    for layer in encoder_layers[num_total_layers - layers_to_unfreeze:]:
        for param in layer.parameters():
            param.requires_grad = True  # Unfreeze selected layers

    # Ensure classifier head is always trainable
    for name, param in model.model.named_parameters():
        if "classifier" in name:
            param.requires_grad = True


def train_deberta(learning_rate, batch_size, num_epochs, weight_decay, dropout, num_layers_unfrozen, augment_warmup_epochs,
                  train_file='data/train_paraphrase_upsampled.csv', test_file='data/dev_set.csv',
                  save_path=None, test_results_path='dev.txt'):
    """Fine-tune ``microsoft/deberta-base`` for binary classification and write dev predictions.

    Training runs as ``num_epochs`` separate one-epoch calls to ``train_model`` so
    that the training texts can be re-augmented before each epoch once
    ``augment_warmup_epochs`` epochs have passed.
    NOTE(review): each ``train_model`` call presumably builds a fresh optimizer and
    schedule — confirm this is intended.

    Args:
        learning_rate: Learning rate passed to simpletransformers.
        batch_size: Used for both training and evaluation batches.
        num_epochs: Number of one-epoch training passes.
        weight_decay: Weight decay passed to simpletransformers.
        dropout: Stored under the ``"dropout"`` key of the model args.
            NOTE(review): confirm simpletransformers actually applies this key to
            the model; it may need to go through the model config instead.
        num_layers_unfrozen: Number of final encoder layers left trainable (the
            classifier head is always trainable).
        augment_warmup_epochs: Epochs trained on the unmodified texts before
            ``augment_text`` is applied each epoch.
        train_file: CSV with ``text`` and ``label`` columns used for training.
        test_file: CSV with ``text`` and ``label`` columns used for prediction.
        save_path: If given, directory the model, tokenizer and config are saved to.
        test_results_path: File that receives one prediction per line.

    Returns:
        None. Predictions are written to ``test_results_path``.
    """
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    # Empty texts come back from CSV as NaN; the tokenizer and augment_text need strings.
    train_df["text"] = train_df["text"].fillna("").astype(str)
    test_df["text"] = test_df["text"].fillna("").astype(str)

    # Model configuration with hyperparameters
    model_args = {
        "num_train_epochs": 1,  # one epoch per train_model call; the loop below repeats it
        "train_batch_size": batch_size,
        "eval_batch_size": batch_size,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "overwrite_output_dir": True,
        "save_best_model": False,
        "save_eval_checkpoints": False,
        "save_model_every_epoch": False,
        "use_early_stopping": False,
        "use_multiprocessing": False,
        "use_multiprocessing_for_evaluation": False,
        "reprocess_input_data": True,
        "save_steps": -1,
        "fp16": False,  # Ensure FP16 is disabled
        "dropout": dropout
    }

    # Initialize DeBERTa model; fall back to CPU when no GPU is available instead of failing
    model = ClassificationModel(
        "deberta",
        "microsoft/deberta-base",
        num_labels=2,
        args=model_args,
        use_cuda=torch.cuda.is_available(),
    )

    # Unfreeze the last `num_layers_unfrozen` layers + classifier head
    _unfreeze_top_layers(model, num_layers_unfrozen)

    cols = ["text", "label"]

    # Train for the required number of epochs
    for epoch in range(num_epochs):
        _train_df = train_df.copy()

        # After the warm-up epochs, train on a freshly augmented copy of the texts
        if epoch >= augment_warmup_epochs:
            _train_df["text"] = _train_df.apply(augment_text, axis=1)

        # Train the model
        model.train_model(_train_df[cols])

    print("Evaluating on dev set")
    # predict() takes a list of texts, not a dataframe
    preds, _ = model.predict(test_df["text"].tolist())

    # Predictions are numeric labels, so format them instead of concatenating with str
    with open(test_results_path, 'w') as f:
        f.writelines(f"{pred}\n" for pred in preds)

    if save_path:
        print("Saving final model to", save_path)
        model.model.save_pretrained(save_path)
        model.tokenizer.save_pretrained(save_path)
        model.config.save_pretrained(save_path)

In [None]:
# Train with best hyperparameters
# NOTE(review): train_file is left at its default
# ('data/train_paraphrase_upsampled.csv'), while the paraphrasing cell writes
# 'data/train_set_cleaned_paraphrase_upsampled.csv' — confirm the intended file.
# The first 5 of the 12 epochs use the unmodified texts (augment_warmup_epochs=5);
# the remaining epochs train on augmented texts.
train_deberta(
    learning_rate=2e-5,
    batch_size=32,
    num_epochs=12,
    weight_decay=0.0,
    dropout=0.0,
    num_layers_unfrozen=3,
    augment_warmup_epochs=5,
    save_path='final_model',
    test_results_path='dev.txt'
)

# Evaluate final model