# Import libraries

In [None]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
import torch


In [None]:
from torch import cuda
from torch.utils.data import Dataset

device = 'cuda' if cuda.is_available() else 'cpu'


In [89]:
# Function tp shuffle the rows of a dataframe
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:

  for i in range(cycles):

    new_df = old_df.sample(frac=1).reset_index(drop=True)

  return new_df
 

# Read data

In [97]:
df_propaganda = pd.read_csv('final.csv')
df_propaganda = shuffle_df(df_propaganda, 1)
df_propaganda = df_propaganda.sample(n=3000)

# No propaganda Facts
df_nopropaganda_facts = pd.read_csv('no_propaganda.csv')
df_nopropaganda_facts['Technique'] = 'no_propaganda'


df_nopropaganda_articles = pd.read_csv('cleaned_no_propaganda_from_articles_sentences.csv')

# Change nothing values in column Technique to no_propaganda
df_nopropaganda_articles['Technique'] = 'no_propaganda'

# Get 500 random samples from the no_propaganda_articles
df_nopropaganda_articles_500 = shuffle_df(df_nopropaganda_articles, 1)
df_nopropaganda_articles_500 = df_nopropaganda_articles_500.sample(n=500)
df_nopropaganda_articles_500

Unnamed: 0,Sentence,Technique
679,the penalty of deathIt continues to be,no_propaganda
5063,if perhaps a bit intense and have been made i...,no_propaganda
3980,highlighted Lews ethnicity as a reason for his...,no_propaganda
2802,WHO Prepares For Worst Case As Congo Ebola Out...,no_propaganda
1776,New Audio From The Night Of The Las Vegas Mass...,no_propaganda
...,...,...
1927,is being ignored by the mainstream media and w...,no_propaganda
4206,Vatican clarified the Churchs teaching on Octo...,no_propaganda
844,DSouza was sentenced in 2014 to five years of ...,no_propaganda
3660,is the distinction between sin a culpable act ...,no_propaganda


In [98]:
# Merege the two dataframes
df = pd.concat([df_propaganda, df_nopropaganda_articles_500, df_nopropaganda_facts])
df['Technique'].value_counts()

Technique
Loaded_Language    1915
no_propaganda       900
Doubt               449
Hyperbole           419
Jingoism            217
Name: count, dtype: int64

In [99]:
# For BertForSequenceClassification model as well I need these exact mapping of id2labels and labels2id in dictionary form.
labels = df['Technique'].unique().tolist()
labels = [s.strip() for s in labels ]
for key, value in enumerate(labels):
    print(value)
NUM_LABELS= len(labels)

id2label={id:label for id,label in enumerate(labels)}

label2id={label:id for id,label in enumerate(labels)}

Hyperbole
Loaded_Language
Doubt
Jingoism
no_propaganda


In [100]:
# Create a new column to represent the categories in numerical form
df["labels"]=df.Technique.map(lambda x: label2id[x.strip()])

# Prepare and Train the model

In [101]:
# Fine tuning dependancies
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [None]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english",ignore_mismatched_sizes=True ,num_labels = NUM_LABELS, id2label=id2label, label2id=label2id)
model.to(device)


In [103]:
SIZE= df.shape[0]

train_texts= list(df.Sentence[:SIZE//2])

val_texts=   list(df.Sentence[SIZE//2:(3*SIZE)//4 ])

test_texts=  list(df.Sentence[(3*SIZE)//4:])

train_labels= list(df.labels[:SIZE//2])

val_labels=   list(df.labels[SIZE//2:(3*SIZE)//4])

test_labels=  list(df.labels[(3*SIZE)//4:])

In [104]:
len(train_texts), len(val_texts), len(test_texts)


(1950, 975, 975)

In [105]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings  = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [106]:
class DataLoader(Dataset):
    """
    Custom Dataset class for handling tokenized text data and corresponding labels.
    Inherits from torch.utils.data.Dataset.
    """
    def __init__(self, encodings, labels):
        """
        Initializes the DataLoader class with encodings and labels.

        Args:
            encodings (dict): A dictionary containing tokenized input text data
                              (e.g., 'input_ids', 'token_type_ids', 'attention_mask').
            labels (list): A list of integer labels for the input text data.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Returns a dictionary containing tokenized data and the corresponding label for a given index.

        Args:
            idx (int): The index of the data item to retrieve.

        Returns:
            item (dict): A dictionary containing the tokenized data and the corresponding label.
        """
        # Retrieve tokenized data for the given index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Add the label for the given index to the item dictionary
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """
        Returns the number of data items in the dataset.

        Returns:
            (int): The number of data items in the dataset.
        """
        return len(self.labels)

In [107]:
train_dataloader = DataLoader(train_encodings, train_labels)

val_dataloader = DataLoader(val_encodings, val_labels)

test_dataset = DataLoader(test_encodings, test_labels)

In [109]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Computes accuracy, F1, precision, and recall for a given set of predictions.

    Args:
        pred (obj): An object containing label_ids and predictions attributes.
            - label_ids (array-like): A 1D array of true class labels.
            - predictions (array-like): A 2D array where each row represents
              an observation, and each column represents the probability of
              that observation belonging to a certain class.

    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): The macro F1 score, which is the harmonic mean of precision
              and recall. Macro averaging calculates the metric independently for
              each class and then takes the average.
            - Precision (float): The macro precision, which is the number of true
              positives divided by the sum of true positives and false positives.
            - Recall (float): The macro recall, which is the number of true positives
              divided by the sum of true positives and false negatives.
    """
    # Extract true labels from the input object
    labels = pred.label_ids

    # Obtain predicted class labels by finding the column index with the maximum probability
    preds = pred.predictions.argmax(-1)

    # Compute macro precision, recall, and F1 score using sklearn's precision_recall_fscore_support function
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')

    # Calculate the accuracy score using sklearn's accuracy_score function
    acc = accuracy_score(labels, preds)

    # Return the computed metrics as a dictionary
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [113]:
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir='./TTC4900Modelv4',
    do_train=True,
    do_eval=True,
    #  The number of epochs, defaults to 3.0
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Number of steps used for a linear warmup
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
   # TensorBoard log directory
    logging_dir='./multi-class-logs-4',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    load_best_model_at_end=True
)

In [114]:
trainer = Trainer(
    # the pre-trained model that will be fine-tuned 
    model=model,
     # training arguments that we defined above                        
    args=training_args,                 
    train_dataset=train_dataloader,         
    eval_dataset=val_dataloader,            
    compute_metrics= compute_metrics
)

Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [115]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,1.0181,0.622388,0.779487,0.604429,0.71323,0.634396
200,0.5272,0.497771,0.833846,0.764917,0.781853,0.765496
300,0.3374,0.51542,0.830769,0.779619,0.781576,0.784985
400,0.2011,0.628574,0.838974,0.767092,0.792803,0.757593
500,0.1102,0.720403,0.826667,0.766206,0.762941,0.777716
600,0.0707,0.774774,0.815385,0.762475,0.755151,0.773486
700,0.0428,0.821674,0.831795,0.769925,0.77804,0.767381
800,0.0298,0.858996,0.834872,0.781003,0.78369,0.782514
900,0.031,0.884287,0.835897,0.777939,0.783857,0.776024
1000,0.0254,0.927999,0.822564,0.776381,0.766221,0.792887


TrainOutput(global_step=1220, training_loss=0.19909898011838315, metrics={'train_runtime': 10490.6453, 'train_samples_per_second': 1.859, 'train_steps_per_second': 0.116, 'total_flos': 575177308290000.0, 'train_loss': 0.19909898011838315, 'epoch': 10.0})

In [117]:
def predict(text):
    """
    Predicts the class label for a given input text

    Args:
        text (str): The input text for which the class label needs to be predicted.

    Returns:
        probs (torch.Tensor): Class probabilities for the input text.
        pred_label_idx (torch.Tensor): The index of the predicted class label.
        pred_label (str): The predicted class label.
    """
    # Tokenize the input text and move tensors to the GPU if available
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Get model output (logits)
    outputs = model(**inputs)

    probs = outputs[0].softmax(1)
    """ Explanation outputs: The BERT model returns a tuple containing the output logits (and possibly other elements depending on the model configuration). In this case, the output logits are the first element in the tuple, which is why we access it using outputs[0].

    outputs[0]: This is a tensor containing the raw output logits for each class. The shape of the tensor is (batch_size, num_classes) where batch_size is the number of input samples (in this case, 1, as we are predicting for a single input text) and num_classes is the number of target classes.

    softmax(1): The softmax function is applied along dimension 1 (the class dimension) to convert the raw logits into class probabilities. Softmax normalizes the logits so that they sum to 1, making them interpretable as probabilities. """

    # Get the index of the class with the highest probability
    # argmax() finds the index of the maximum value in the tensor along a specified dimension.
    # By default, if no dimension is specified, it returns the index of the maximum value in the flattened tensor.
    pred_label_idx = probs.argmax()

    # Now map the predicted class index to the actual class label 
    # Since pred_label_idx is a tensor containing a single value (the predicted class index), 
    # the .item() method is used to extract the value as a scalar
    pred_label = model.config.id2label[pred_label_idx.item()]
    # a datafame to show the probabilities of each class
    x = pd.DataFrame(probs.detach().numpy(), columns = model.config.id2label.values())
    #print(x)
    # for key, value in enumerate(probs[0]):
    #     print(f"{model.config.id2label[key]}: {value.item()*100:.2f}%")
    x["predicted"] = model.config.id2label[pred_label_idx.item()]
    x['text'] = text
    # move these columns to the front
    x = x[["text", "predicted"] + [c for c in x.columns if c not in ["text", "predicted"]]]
    return x

# Save the model

In [116]:
model_path = "bert-prop-text-classification-model-v4"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('bert-prop-text-classification-model-v4/tokenizer_config.json',
 'bert-prop-text-classification-model-v4/special_tokens_map.json',
 'bert-prop-text-classification-model-v4/vocab.txt',
 'bert-prop-text-classification-model-v4/added_tokens.json')

# Test the model

In [119]:
# Create a data for testing
data = {
    "Sentence": [
        "Is our competitor really committed to the environment, or is it just a marketing tactic?",
        "Can we trust the results of a study funded by pharmaceutical companies?",
        "How sure are we that the government's policy will benefit us in the long run?",
        "Are the media reports about the economic recovery truly accurate?",
        "Do the so-called experts really have all the facts?",
        "Is it possible that we've been misled about the benefits of the new tax law?",
        "What if the scientific consensus on climate change is wrong?",
        "Could there be a hidden agenda behind the sudden push for this legislation?",
        "Are consumer safety standards as rigorous as they claim?",
        "Is there more to the story than what the officials are telling us?",
        "The senator embraced a radical, socialist agenda.",
        "Victims of corporate greed are struggling more than ever.",
        "The heroic firefighters battled the blaze.",
        "The unpatriotic few voted against the bill.",
        "This miracle drug can save lives!",
        "That disastrous policy has ruined our economy.",
        "The government's scheme will strip us of our rights.",
        "Their ruthless CEO cares nothing for the average worker.",
        "This genocide must be stopped at all costs.",
        "The enlightened professor shared her wisdom with the class.",
        "Our country leads the world in freedom and justice.",
        "Only we have the moral clarity to bring peace to this region.",
        "Support our troops, support our nation — the true land of the brave and free.",
        "Real patriots buy only domestically produced products.",
        "Our national athletes are the pride of our country and superior to all others.",
        "We must defend our borders from foreign influences that threaten our way of life.",
        "Our language and culture are what make this nation great.",
        "Foreign policies should always prioritize our national interests over global concerns.",
        "It is our duty to spread our values around the world.",
        "Those who question our military's actions are betraying our country.",
        "This is the worst disaster in human history.",
        "If this law passes, it will destroy our community.",
        "He's the best player the world has ever seen.",
        "Without this product, you're missing out on life.",
        "They're going to turn the city into a warzone.",
        "This is a life-changing opportunity — don't miss out!",
        "This car can outperform any other vehicle on the planet.",
        "I've told you a million times already.",
        "This event will make or break our company's future.",
        "No one works harder than she does.",
        "The library closes at 8 PM on weekdays.",
        "Water boils at 100 degrees Celsius at sea level.",
        "The cat is sleeping on the sofa.",
        "She travels to work by train every morning.",
        "We need to buy more coffee for the office.",
        "The meeting is scheduled for 10 AM tomorrow.",
        "He prefers to wear plain t-shirts.",
        "The restaurant serves breakfast until 11 AM.",
        "It rained last night, but today is sunny.",
        "The museum features an exhibit on ancient Rome."
    ],
    "Label": [
        "Doubt", "Doubt", "Doubt", "Doubt", "Doubt", "Doubt", "Doubt", "Doubt", "Doubt", "Doubt",
        "Loaded Language", "Loaded Language", "Loaded Language", "Loaded Language", "Loaded Language",
        "Loaded Language", "Loaded Language", "Loaded Language", "Loaded Language", "Loaded Language",
        "Jingoism", "Jingoism", "Jingoism", "Jingoism", "Jingoism", "Jingoism", "Jingoism", "Jingoism",
        "Jingoism", "Jingoism",
        "Hyperbole", "Hyperbole", "Hyperbole", "Hyperbole", "Hyperbole", "Hyperbole", "Hyperbole",
        "Hyperbole", "Hyperbole", "Hyperbole",
        "No Propaganda", "No Propaganda", "No Propaganda", "No Propaganda", "No Propaganda", "No Propaganda",
        "No Propaganda", "No Propaganda", "No Propaganda", "No Propaganda"
    ]
}

# Create DataFrame
df_test = pd.DataFrame(data)

# Display the DataFrame
df_test


Unnamed: 0,Sentence,Label
0,Is our competitor really committed to the envi...,Doubt
1,Can we trust the results of a study funded by ...,Doubt
2,How sure are we that the government's policy w...,Doubt
3,Are the media reports about the economic recov...,Doubt
4,Do the so-called experts really have all the f...,Doubt
5,Is it possible that we've been misled about th...,Doubt
6,What if the scientific consensus on climate ch...,Doubt
7,Could there be a hidden agenda behind the sudd...,Doubt
8,Are consumer safety standards as rigorous as t...,Doubt
9,Is there more to the story than what the offic...,Doubt


In [120]:
df_test["predicted"] = df_test["Sentence"].apply(lambda x: predict(x)["predicted"])
df_test

Unnamed: 0,Sentence,Label,predicted
0,Is our competitor really committed to the envi...,Doubt,Doubt
1,Can we trust the results of a study funded by ...,Doubt,Doubt
2,How sure are we that the government's policy w...,Doubt,Doubt
3,Are the media reports about the economic recov...,Doubt,Doubt
4,Do the so-called experts really have all the f...,Doubt,Doubt
5,Is it possible that we've been misled about th...,Doubt,Doubt
6,What if the scientific consensus on climate ch...,Doubt,Doubt
7,Could there be a hidden agenda behind the sudd...,Doubt,Doubt
8,Are consumer safety standards as rigorous as t...,Doubt,Doubt
9,Is there more to the story than what the offic...,Doubt,Doubt
