# SLC - Sentence Level Classification

In [15]:
# Optional install step, left disabled. `optuna` is not imported by any cell
# visible in this notebook.
#!pip install optuna

In [16]:
# Install step for the `datasets` package (provides load_dataset / load_metric
# used below), left disabled.
#!pip install datasets

In [1]:
import os

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

In [None]:
# Checkpoint option 1: "distilbert-base-uncased" with a 2-label classification head.
# NOTE(review): `chkp`, `model` and `tokenizer` are reassigned by the two cells
# that follow, so whichever of these cells runs last decides what is fine-tuned.
chkp = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(chkp, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(chkp)

In [2]:
# Checkpoint option 2: "roberta_propaganda_spans" with a 2-label classification head.
# NOTE(review): presumably a local directory (or Hub id) of a RoBERTa model
# trained on propaganda spans -- confirm where this checkpoint lives.
chkp = "roberta_propaganda_spans"
model = AutoModelForSequenceClassification.from_pretrained(chkp, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(chkp)

In [2]:
# Checkpoint option 3: model stored at a path relative to this notebook, loaded
# with a 2-label classification head.
# Loading prints a warning that some checkpoint weights (the Ngram layers and
# the LM head) are not used by the sequence-classification model.
chkp = "../TAPT-n/models/PTC_TAPT_n_RoBERTa"
model = AutoModelForSequenceClassification.from_pretrained(chkp, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(chkp)

Some weights of the model checkpoint at ../TAPT-n/models/PTC_TAPT_n_RoBERTa were not used when initializing RobertaForSequenceClassification: ['roberta.encoder.Ngram_layer.0.output.dense.bias', 'roberta.Ngram_embeddings.token_type_embeddings.weight', 'lm_head.layer_norm.weight', 'roberta.encoder.Ngram_layer.0.attention.output.LayerNorm.weight', 'roberta.Ngram_embeddings.word_embeddings.weight', 'roberta.encoder.Ngram_layer.0.output.LayerNorm.bias', 'roberta.encoder.Ngram_layer.0.attention.self.query.weight', 'roberta.encoder.Ngram_layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.Ngram_layer.0.attention.self.value.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'roberta.encoder.Ngram_layer.0.output.dense.weight', 'roberta.encoder.Ngram_layer.0.attention.output.dense.bias', 'lm_head.bias', 'roberta.encoder.Ngram_layer.0.attention.self.value.bias', 'roberta.encoder.Ngram_layer.0.attention.self.key.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.Ngra

In [3]:
# SECURITY: a Hugging Face access token used to be hardcoded on this line.
# It must be treated as leaked and revoked; credentials belong in the
# environment, never in a notebook.
# The token is read from the HF_TOKEN environment variable when it is set;
# otherwise `True` asks `datasets` to use the token cached by
# `huggingface-cli login`.
dataset = load_dataset(
    'Kyleiwaniec/SemEval_2020_Task_11',
    use_auth_token=os.environ.get('HF_TOKEN') or True,
)

Using custom data configuration Kyleiwaniec--SemEval_2020_Task_11-1cdfe258e64d73f3
Reusing dataset parquet (/home/khamilton/.cache/huggingface/datasets/Kyleiwaniec___parquet/Kyleiwaniec--SemEval_2020_Task_11-1cdfe258e64d73f3/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Uses the notebook-level ``tokenizer`` and returns its output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [5]:
# Tokenize every split in batches, then drop the raw columns that are not
# wanted downstream.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
).remove_columns(
    ['article_id', 'text', 'technique_classification', 'offsets']
)




  0%|          | 0/17 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [6]:
# Fixed-seed shuffle of each split, keeping the first 1000 rows of each.
small_train_dataset, small_eval_dataset = (
    tokenized_dataset[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ("train", "validation")
)

In [7]:
# Display one raw (untokenized) training record to show the available fields.
dataset['train'][2]

{'article_id': 'article111111111',
 'text': '"The next transmission could be more pronounced or stronger," WHO Director-General Tedros Adhanom Ghebreyesus told reporters in Geneva, insisting that "the issue is serious."',
 'technique_classification': [0],
 'offsets': [[1, 59]],
 'labels': 1}

In [8]:
# Batch collator built from the current tokenizer; handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
from datasets import load_metric
import numpy as np

# `load_metric(path, config_name)` loads exactly ONE metric: a second positional
# string is interpreted as the configuration name of the first metric, not as
# another metric. Each metric therefore has to be loaded on its own.
# "precision" and "recall" can be added to this tuple if they are wanted too.
METRIC_NAMES = ("f1", "matthews_correlation")
metrics = {name: load_metric(name) for name in METRIC_NAMES}

def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: pair ``(logits, labels)`` as passed in by the Trainer.

    Returns:
        dict merging the result of every metric in ``metrics`` computed on the
        arg-max predictions (one entry per metric).
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    scores = {}
    for metric in metrics.values():
        scores.update(metric.compute(predictions=predictions, references=labels))
    return scores

In [None]:
# Fine-tuning configuration. Everything is written to "<chkp>_SLC/", the same
# directory the evaluation section later loads the classifier from.
training_args = TrainingArguments(
    output_dir=chkp + "_SLC/",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=48,
    per_device_eval_batch_size=48,
    evaluation_strategy="epoch",
    no_cuda=True,  # do not use CUDA for this run
)

# Trainer wired to the small train/eval subsets and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune, then persist the final model into output_dir.
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 63


Epoch,Training Loss,Validation Loss


# Evaluation

In [4]:
from transformers import pipeline

# Point `chkp` at the fine-tuned sentence-level classifier: the training cell
# saves it to the base checkpoint path plus "_SLC/".
# The suffix is appended only once, so re-running this cell no longer turns the
# path into ".../_SLC/_SLC/".
SLC_SUFFIX = "_SLC/"
if not chkp.endswith(SLC_SUFFIX):
    chkp = chkp + SLC_SUFFIX

# Load the tokenizer saved with the fine-tuned model and build the classifier.
tokenizer = AutoTokenizer.from_pretrained(chkp)
classifier = pipeline("text-classification", model=chkp, tokenizer=tokenizer)

In [5]:
# Display the first raw test record (same fields as the training records).
dataset['test'][0]

{'article_id': 'article813452859',
 'text': 'EU Profits From Trading With UK While London Loses Money – Political Campaigner',
 'technique_classification': [],
 'offsets': [],
 'labels': 0}

In [6]:
# Classify each test sentence individually and pair the predicted class with
# the gold label, giving a list of [predicted, gold] entries.
# The predicted class id is the last character of the pipeline's label string.
predictions = [
    [int(classifier(example['text'])[0]['label'][-1]), example['labels']]
    for example in dataset['test']
]

In [7]:
# Number of [predicted, gold] pairs collected from the test split.
len(predictions)

3215

In [8]:
# Tally exact matches and the four confusion-matrix cells over the
# [predicted, gold] pairs; class 1 is the positive class.
acc = sum(1 for predicted, gold in predictions if predicted == gold)
TP = sum(1 for predicted, gold in predictions if (predicted, gold) == (1, 1))
TN = sum(1 for predicted, gold in predictions if (predicted, gold) == (0, 0))
FP = sum(1 for predicted, gold in predictions if (predicted, gold) == (1, 0))
FN = sum(1 for predicted, gold in predictions if (predicted, gold) == (0, 1))

print(TP, TN, FP, FN)
# Accuracy = share of pairs whose prediction equals the gold label.
print(acc / len(predictions))

540 1918 192 565
0.7645412130637637


In [9]:
import math
# Tiny constant added to denominators in the metric cells below so that a zero
# count produces 0 rather than a ZeroDivisionError.
EPS = 1e-17

In [10]:
# Precision, recall, F1 and Matthews correlation coefficient computed from the
# confusion-matrix counts TP / TN / FP / FN.
# EPS in each denominator turns a would-be division by zero into a score of 0.
precision = TP / (TP + FP + EPS)
recall = TP / (TP + FN + EPS)
# The F1 denominator used to be the only unguarded one: with TP == 0 both
# precision and recall are 0 and the division raised ZeroDivisionError.
F1 = (2 * precision * recall) / (precision + recall + EPS)
MCC = (TP * TN - FP * FN) / (math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) + EPS)

In [11]:
# Report the scores in the order: precision, recall, F1, MCC.
print(precision,recall,F1,MCC)

0.7377049180327869 0.48868778280542985 0.5879150789330431 0.45042853055771964


In [154]:
# All-propaganda baseline: score the test set as if every prediction were 1.
# This matches the baseline from the paper.
# NOTE: this cell deliberately reuses (and overwrites) TP/TN/FP/FN and the
# metric variables that were computed for the classifier above.
#
# With a constant positive prediction only the gold label matters:
# gold 1 -> true positive, gold 0 -> false positive; TN and FN are 0 by
# construction, so the classifier's own prediction is ignored.
TP = sum(1 for _predicted, gold in predictions if gold == 1)
FP = sum(1 for _predicted, gold in predictions if gold == 0)
TN, FN = 0, 0

print(TP, TN, FP, FN)
precision = TP / (TP + FP + EPS)
recall = TP / (TP + FN + EPS)
# EPS also guards the F1 denominator so an all-negative label set gives 0
# instead of raising ZeroDivisionError.
F1 = (2 * precision * recall) / (precision + recall + EPS)
MCC = (TP * TN - FP * FN) / (math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) + EPS)
print(precision, recall, F1, MCC)

1105 0 2110 0
0.343701399688958 1.0 0.5115740740740741 0.0
