In [None]:
pip install datasets

In [None]:
pip install accelerate -U

In [1]:
import pandas as pd
import torch
from datasets import Dataset, load_metric
import random

In [2]:
data = pd.read_csv('Project Dataset.csv')

In [3]:
data

Unnamed: 0,Sentence,Technique
0,the nation that gave the world the Magna Carta...,Loaded_Language
1,a striking blow against freedom,Loaded_Language
2,a complete travesty of justice,Loaded_Language
3,firestorm of outrage,Loaded_Language
4,"aggressively stuck his tongue in my mouth""",Loaded_Language
...,...,...
498,our president,Jingoism
499,our nation's history,Jingoism
500,Americans,Jingoism
501,protect America,Jingoism


## Reading Neutral Data

In [4]:
neutral_data = pd.read_csv('neutral_sentences.csv')

In [5]:
neutral_data

Unnamed: 0,Sentence,Technique
0,Water boils at 100 degrees Celsius.,Nothing
1,The Earth revolves around the Sun.,Nothing
2,"The summit of Mount Everest is about 8,848 met...",Nothing
3,The Pacific Ocean is the largest ocean on Earth.,Nothing
4,Photosynthesis is the process by which green p...,Nothing
...,...,...
395,Some species of starfish can regenerate lost l...,Nothing
396,The world's first artificial heart transplant ...,Nothing
397,The population of the Earth's data is constant...,Nothing
398,Some species of spiders can spin webs that are...,Nothing


In [6]:
data['Technique'].value_counts()

Technique
Hyperbole          101
Doubt              101
no_propaganda      101
Loaded_Language    100
Jingoism           100
Name: count, dtype: int64

## Code to shuffle the data

In [5]:
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
  for i in range(cycles):
    new_df = old_df.sample(frac=1).reset_index(drop=True)
  return new_df

In [8]:
neutral_data['Technique'] = neutral_data['Technique'].replace('Nothing', 'no_propaganda')

In [9]:
neutral_data

Unnamed: 0,Sentence,Technique
0,Water boils at 100 degrees Celsius.,no_propaganda
1,The Earth revolves around the Sun.,no_propaganda
2,"The summit of Mount Everest is about 8,848 met...",no_propaganda
3,The Pacific Ocean is the largest ocean on Earth.,no_propaganda
4,Photosynthesis is the process by which green p...,no_propaganda
...,...,...
395,Some species of starfish can regenerate lost l...,no_propaganda
396,The world's first artificial heart transplant ...,no_propaganda
397,The population of the Earth's data is constant...,no_propaganda
398,Some species of spiders can spin webs that are...,no_propaganda


## Concatenating manually labeled data with Facts/Neutral statements

In [6]:
data = pd.concat([data, neutral_data[:100]], ignore_index = True)

data = shuffle_df(data, 1)

In [7]:
data

Unnamed: 0,Sentence,Technique
0,local and federal authorities are refusing to ...,Doubt
1,an extraordinary public service,Hyperbole
2,lied to the country,Jingoism
3,inexhaustible mercy and forgiveness,Hyperbole
4,a complete travesty of justice,Loaded_Language
...,...,...
498,American people,Jingoism
499,That the neo Catholic establishment refuses to...,no_propaganda
500,our,Jingoism
501,for normalizing a regime routinely called out ...,no_propaganda


## Divide into train_test split

In [8]:
#Split the data into training and test set

train_data = data[:int(len(data)*0.8)]
test_data = data[int(len(data)*0.8):]

In [9]:
train_data

Unnamed: 0,Sentence,Technique
0,local and federal authorities are refusing to ...,Doubt
1,an extraordinary public service,Hyperbole
2,lied to the country,Jingoism
3,inexhaustible mercy and forgiveness,Hyperbole
4,a complete travesty of justice,Loaded_Language
...,...,...
397,Whether the Trump administration follows throu...,Doubt
398,I thought my life was over,Hyperbole
399,a chronic confusion seems to mark your pontifi...,Loaded_Language
400,American blood,Jingoism


In [10]:
test_data

Unnamed: 0,Sentence,Technique
402,a natural disaster of a magnitude not seen in ...,Hyperbole
403,"watershed moment in U.S. and world history, an...",Hyperbole
404,"And if so, could we have been this wrong",Doubt
405,"At some point, the American people will be for...",Jingoism
406,the most unprecedented persecution,Hyperbole
...,...,...
498,American people,Jingoism
499,That the neo Catholic establishment refuses to...,no_propaganda
500,our,Jingoism
501,for normalizing a regime routinely called out ...,no_propaganda


In [11]:
train_ds = Dataset.from_pandas(train_data, split="train")
test_ds = Dataset.from_pandas(test_data, split="test")

In [12]:
# label_to_int = ['Name_Calling,Labeling', 'Nothing', 'Loaded_Language','Repetition','Doubt','Exaggeration,Minimisation','Flag-Waving','Causal_Oversimplification']

#Flag-Waving - Jingoism
#Exaageration and Minimization - Hyperbole
#Causal-Simplification - Simplification
#Name_Calling, Labelling - NameCalling
label_to_int = ['Loaded_Language','Doubt','Hyperbole','Jingoism', 'no_propaganda']
template = "This example is {}."

In [13]:
from transformers import BartTokenizerFast

tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-mnli')

In [None]:
def create_input_sequence(sample):
  text = sample["Sentence"]
  label = sample["Technique"][0]
  contradiction_label = random.choice([x for x in label_to_int if x != label])
  encoded_sequence = tokenizer(text * 2, [template.format(label), template.format(contradiction_label)], truncation = True, padding = 'max_length')
  encoded_sequence["labels"] = [2, 0]
  encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
  return encoded_sequence


train_dataset = train_ds.map(create_input_sequence, batched = True, batch_size = 1, remove_columns = ["Sentence", "Technique"])
test_dataset = test_ds.map(create_input_sequence, batched = True, batch_size = 1, remove_columns = ["Sentence", "Technique"])

In [15]:
from transformers import BartForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import numpy as np

In [16]:
def compute_metrics(p: EvalPrediction):
  metric_acc = load_metric("accuracy")
  metric_f1 = load_metric("f1")
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  preds = np.argmax(preds, axis = 1)
  result = {}
  result["accuracy"] = metric_acc.compute(predictions = preds, references = p.label_ids)["accuracy"]
  result["f1"] = metric_f1.compute(predictions = preds, references = p.label_ids, average = 'macro')["f1"]
  return result

In [17]:
model_directory = "Result"

In [18]:
training_args = TrainingArguments(
  output_dir = model_directory,      # Output directory
  num_train_epochs = 5,             # Total number of training epochs
  per_device_train_batch_size = 16,  # Batch size per device during training
  per_device_eval_batch_size = 64,   # Batch size for evaluation
  warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
  weight_decay = 0.01,               # Strength of weight decay
  logging_strategy='steps',
  logging_steps=100,
  evaluation_strategy="steps",
  eval_steps=100,
  save_strategy="steps", 
)


model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels = len(label_to_int), ignore_mismatched_sizes = True)

trainer = Trainer(
  model = model,                     # The instantiated model to be trained
  args = training_args,              # Training arguments, defined above
  compute_metrics = compute_metrics, # A function to compute the metrics
  train_dataset = train_dataset,     # Training dataset
  eval_dataset = test_dataset,       # Evaluation dataset
  tokenizer = tokenizer              # The tokenizer that was used
)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([5, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

Step,Training Loss,Validation Loss


In [24]:
trainer.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.5946733951568604,
 'eval_accuracy': 0.8512396694214877,
 'eval_f1': 0.8509852216748768,
 'eval_runtime': 208.0203,
 'eval_samples_per_second': 1.163,
 'eval_steps_per_second': 0.019,
 'epoch': 4.0}

In [27]:
from transformers import pipeline

# torch.cuda.empty_cache()
classifier = pipeline("zero-shot-classification", model = model, tokenizer = tokenizer)

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [None]:
# sequences = "it seems that perhaps even his legendary"
Sentence = ["Is our competitor really committed to the environment, or is it just a marketing tactic?"]

count=0
for sequences in Sentence:
    label_to_int=['Loaded_Language','Doubt','Hyperbole','Jingoism', 'no_propaganda']

    print(classifier(sequences, label_to_int))

Jingoism
Jingoism
Jingoism
Loaded_Language
Hyperbole
Loaded_Language
Jingoism
Loaded_Language
Jingoism
Loaded_Language
10 Completed
Doubt
Loaded_Language
Doubt
Loaded_Language
Doubt
Doubt
Doubt
Doubt
Doubt
Doubt
10 Completed
Doubt
Loaded_Language
Doubt
Doubt
Doubt
Doubt
Doubt
Loaded_Language
Doubt
Loaded_Language
10 Completed
Doubt
Loaded_Language
Doubt
Loaded_Language
Doubt
Doubt
Doubt
Loaded_Language
Doubt
Doubt
10 Completed
Doubt
Doubt
Doubt
Doubt
Doubt
Doubt


In [25]:
model_path="model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Non-default generation parameters: {'forced_eos_token_id': 2}


('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.json',
 'model/merges.txt',
 'model/added_tokens.json',
 'model/tokenizer.json')

In [5]:
model = BartForSequenceClassification.from_pretrained('model')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1', '2': 'LABEL_2', '3': 'LABEL_3', '4': 'LABEL_4'}. The number of labels wil be overwritten to 5.
