<a href="https://colab.research.google.com/github/someshsingh22/News-Propaganda-Detection/blob/master/BERT_LR_TC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
!git clone https://github.com/someshsingh22/News-Propaganda-Detection
!pip install transformers
%cd News-Propaganda-Detection

In [2]:
from transformers import *
import time
import os
import numpy as np
import pandas as pd
import re
import itertools
from tqdm import tqdm
from tqdm import  tqdm_notebook
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder as LE
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import glob
import os.path
import sys
import codecs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import random
torch.manual_seed(0)
random.seed(0)

Using TensorFlow backend.


In [0]:
#@title
def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """
    Read every file matching <file_pattern> inside the directory <folder_name>.

    Returns a dictionary that maps an article id (a string) to the complete,
    unmodified text of that article. The id is derived from the file name by
    dropping everything from the first "." onwards and then the first seven
    characters (presumably the "article" prefix of names such as
    "article123456.txt" -- confirm against the dataset layout).
    """
    articles = {}
    for filename in sorted(glob.glob(os.path.join(folder_name, file_pattern))):
        article_id = os.path.basename(filename).split(".")[0][7:]
        # Keep codecs.open rather than the builtin open(): it does not
        # translate line endings, and the text is later sliced with character
        # offsets, so the content must stay exactly as stored on disk.
        with codecs.open(filename, "r", encoding="utf8") as f:
            articles[article_id] = f.read()
    return articles


def read_predictions_from_file(filename):
    """
    Reader for the gold file and the template output file.

    Every non-blank line must hold four tab-separated fields:
    article id, label (or ? in the case of a template file), begin of the
    fragment, end of the fragment.

    Returns four parallel lists of strings, in this order:
    (articles_id, span_starts, span_ends, gold_labels).
    Note that the labels come LAST in the returned tuple although they are
    the second column of the file.

    Raises ValueError if a non-blank line does not have exactly four fields.
    """
    articles_id, span_starts, span_ends, gold_labels = ([], [], [], [])
    with open(filename, "r") as f:
        for row in f:
            row = row.rstrip()
            if not row:
                # Blank lines (e.g. a trailing empty line) carry no annotation.
                continue
            article_id, gold_label, span_start, span_end = row.split("\t")
            articles_id.append(article_id)
            gold_labels.append(gold_label)
            span_starts.append(span_start)
            span_ends.append(span_end)
    return articles_id, span_starts, span_ends, gold_labels

def report(true, pred):
    """
    Draw a heatmap of the confusion matrix of <true> vs <pred>, with every
    row divided by its own sum, and print the classification report.
    """
    counts = confusion_matrix(true, pred).astype('float')
    row_totals = counts.sum(axis=1, keepdims=True)
    normalised = counts / row_totals
    plt.figure(figsize=(10, 8))
    sns.heatmap(normalised, annot=True)
    print(classification_report(true, pred))

In [0]:
# Default settings of the character n-gram TF-IDF vectorizer:
# minimum document frequency, n-gram range (low, high) and vocabulary cap.
tfidf_c_config = {"min": 5, "ng_l": 1, "ng_h": 6, "max_features": 1500}

# Default settings of the word n-gram TF-IDF vectorizer (same keys as above).
tfidf_w_config = {"min": 3, "ng_l": 1, "ng_h": 3, "max_features": 2000}


class FeatureExtraction:
    """
    Build sparse feature matrices for a train and a dev split.

    Each row is the concatenation of character n-gram TF-IDF features,
    word n-gram TF-IDF features and one extra column holding the span value
    of the example.

    Both <train_data> and <dev_data> must expose the attributes
    `sentences` (list of str), `spans` (list of numbers), `size` (number of
    examples) and `lower` (bool, forwarded to the vectorizers).
    """

    def __init__(
        self,
        train_data,
        dev_data,
        tfidf_c_config=tfidf_c_config,
        tfidf_w_config=tfidf_w_config,
    ):
        self.train_data, self.dev_data = train_data, dev_data
        # NOTE(review): scikit-learn documents stop_words as applying only to
        # analyzer="word", so it is presumably a no-op for this vectorizer.
        self.tfidf_c = TfidfVectorizer(
            sublinear_tf=True,
            min_df=tfidf_c_config["min"],
            ngram_range=(tfidf_c_config["ng_l"], tfidf_c_config["ng_h"]),
            stop_words="english",
            analyzer="char",
            max_features=tfidf_c_config["max_features"],
            lowercase=train_data.lower,
        )
        self.tfidf_w = TfidfVectorizer(
            sublinear_tf=True,
            min_df=tfidf_w_config["min"],
            ngram_range=(tfidf_w_config["ng_l"], tfidf_w_config["ng_h"]),
            stop_words="english",
            analyzer="word",
            max_features=tfidf_w_config["max_features"],
            lowercase=dev_data.lower,
        )

    def get_features(self):
        """
        Fit both vectorizers and store the stacked features.

        The vectorizers are fitted on the train and dev sentences together;
        the resulting matrices are then split back by `train_data.size`.
        Results are stored in `self.train_features` and `self.dev_features`
        (scipy sparse matrices); nothing is returned.
        """
        # Imported here because the file never imports hstack at the top.
        from scipy.sparse import csr_matrix, hstack

        train_size = self.train_data.size
        sentences = self.train_data.sentences + self.dev_data.sentences
        # One column with the span value of every example, made sparse
        # explicitly so that all stacked blocks have the same kind.
        spans = csr_matrix(
            np.asarray(self.train_data.spans + self.dev_data.spans).reshape(-1, 1)
        )
        sentences_c = self.tfidf_c.fit_transform(sentences)
        sentences_w = self.tfidf_w.fit_transform(sentences)

        self.train_features = hstack(
            [sentences_c[:train_size], sentences_w[:train_size], spans[:train_size]]
        )
        self.dev_features = hstack(
            [sentences_c[train_size:], sentences_w[train_size:], spans[train_size:]]
        )


In [0]:
class TransformerModel:
    """
    Training / inference wrapper around a sequence-classification transformer.

    The wrapped model is called as
    `model(input_ids, token_type_ids=None, attention_mask=..., labels=...)`
    and must return a sequence whose first element is the loss when labels
    are given and the logits otherwise.

    Datasets passed to the methods must expose `dataloader`, an iterable of
    `(input_ids, attention_mask, labels)` tensor batches; `predict` also
    needs `le`, an object with `inverse_transform`.
    """

    def __init__(self, device=None, transformer=None, seed=1234):
        """
        device:      torch device (or its name); defaults to 'cuda' when
                     available, else 'cpu'.
        transformer: model to wrap; defaults to 'bert-base-uncased' with a
                     14-label classification head.
        seed:        seed applied to numpy, random and torch.
        """
        self.device = device
        self.train_loss_set = []  # loss of every training batch, in order
        self.predictions = []
        # Filled in by updater(); initialised here so that every attribute
        # train() reads exists whatever arguments updater() received.
        self.optimizer = None
        self.scheduler = False
        self.clip = 1.0

        if device is None:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        if transformer is None:
            self.transformer = BertForSequenceClassification.from_pretrained(
                'bert-base-uncased', num_labels=14
            ).to(self.device)
        else:
            self.transformer = transformer.to(self.device)

        self.__seed = seed
        self.seed()

    def seed(self):
        """Seed numpy, random and torch (plus the CUDA/cuDNN settings on GPU)."""
        np.random.seed(self.__seed)
        random.seed(self.__seed)
        torch.manual_seed(self.__seed)
        # Compare the device type so that 'cuda:0' or a torch.device object
        # is recognised as well as the plain string 'cuda'.
        if torch.device(self.device).type == 'cuda':
            torch.cuda.manual_seed(self.__seed)
            torch.cuda.manual_seed_all(self.__seed)
            torch.backends.cudnn.enabled = False
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

    def freeze(self, condition=None):
        """
        Set `requires_grad` of every parameter to `condition(name)`.

        By default only parameters whose name contains 'classifier',
        'pooler', '11' or '10' stay trainable (presumably the head and the
        last two encoder layers of bert-base -- confirm for other models).
        """
        if condition is None:
            def condition(name):
                return any(key in name for key in ('classifier', 'pooler', '11', '10'))
        for name, param in self.transformer.named_parameters():
            param.requires_grad = condition(name)

    def updater(self, optimizer=None, lr=1e-4, scheduler=None,
                num_training_steps=1000, num_warmup_steps=100, max_grad_norm=1.0):
        """
        Configure the optimizer, the learning-rate scheduler and clipping.

        optimizer: optimizer to use; defaults to AdamW(lr=lr, correct_bias=False)
                   over all parameters of the model.
        scheduler: None  -> linear schedule with warmup built from
                            <num_warmup_steps> / <num_training_steps>;
                   False -> no scheduler (and no gradient clipping);
                   other -> used as given.
        max_grad_norm: gradient-norm clipping threshold, applied by train()
                   whenever a scheduler is active.
        """
        self.optimizer = optimizer
        if self.optimizer is None:
            self.optimizer = AdamW(self.transformer.parameters(), lr=lr, correct_bias=False)

        # Always defined, so train() works with a user-supplied scheduler too.
        self.clip = max_grad_norm
        if scheduler is None:
            self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=num_warmup_steps,
                num_training_steps=num_training_steps,
            )
        elif scheduler is False:
            self.scheduler = False
        else:
            self.scheduler = scheduler

    def train(self, train_dataset, valid_dataset, epochs=1, verbosity=4):
        """
        Train for <epochs> epochs and print train / validation accuracy
        after each of them.

        `verbosity` is accepted for backward compatibility and is not used.
        Raises RuntimeError when updater() has not been called first.
        """
        if self.optimizer is None:
            raise RuntimeError("call updater() before train() to create the optimizer")
        use_scheduler = self.scheduler is not None and self.scheduler is not False

        for epoch in tqdm_notebook(range(epochs)):
            # evaluate() leaves the model in eval mode at the end of every
            # epoch, so training mode (dropout on) is restored here.
            self.transformer.train()
            for batch in train_dataset.dataloader:
                b_input_ids, b_input_mask, b_labels = (t.to(self.device) for t in batch)
                outputs = self.transformer(
                    b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels,
                )
                loss = outputs[0]
                self.train_loss_set.append(loss.item())
                loss.backward()
                if use_scheduler:
                    torch.nn.utils.clip_grad_norm_(self.transformer.parameters(), self.clip)
                self.optimizer.step()
                if use_scheduler:
                    self.scheduler.step()
                self.optimizer.zero_grad()

            train_epoch_accuracy = self.evaluate(train_dataset, mode='train')
            valid_epoch_accuracy = self.evaluate(valid_dataset, mode='valid')
            print ('\033[1m'+'Epoch [{}/{}], Train_micro_avg: {:.4f}, Valid_micro_avg: {:.4f}'.format(epoch+1, epochs,train_epoch_accuracy, valid_epoch_accuracy)+'\033[0m')

    def evaluate(self, dataset, mode='train'):
        """
        Run the model over <dataset> without gradients.

        mode 'train' / 'valid': return the accuracy in percent.
        mode 'test': store the decoded predicted labels in
        `self.predictions` (via `dataset.le.inverse_transform`) and return None.
        """
        self.transformer.eval()
        if mode == 'test':
            # Start from scratch: after a previous call self.predictions holds
            # decoded labels and can no longer be extended with class ids.
            self.predictions = []

        correct, total = 0, 0
        with torch.no_grad():
            for batch in dataset.dataloader:
                b_input_ids, b_input_mask, b_labels = (t.to(self.device) for t in batch)
                outputs = self.transformer(
                    b_input_ids, token_type_ids=None, attention_mask=b_input_mask
                )
                prediction = torch.argmax(outputs[0], dim=1)
                total += b_labels.size(0)
                correct += (prediction == b_labels).sum().item()
                if mode == 'test':
                    self.predictions.extend(prediction.cpu().tolist())

        if mode == 'train' or mode == 'valid':
            return (100 * correct / total)
        self.predictions = dataset.le.inverse_transform(self.predictions)
        return None

    def predict(self, test_dataset):
        """Return the decoded predicted label of every example of <test_dataset>."""
        self.evaluate(test_dataset, mode='test')
        return self.predictions

    def logits(self, dataset):
        """
        Return the raw model outputs (no softmax applied) of every example of
        <dataset>, as a list of 1-D numpy arrays in dataloader order.
        """
        self.transformer.eval()
        logits = []
        with torch.no_grad():
            for batch in dataset.dataloader:
                b_input_ids, b_input_mask, b_labels = (t.to(self.device) for t in batch)
                outputs = self.transformer(
                    b_input_ids, token_type_ids=None, attention_mask=b_input_mask
                )
                logits.extend(list(np.asarray(outputs[0].cpu())))
        return logits

In [0]:
class Dataset:
    """
    Annotated text fragments of one data split.

    Loads all articles of <articles_folder> and the annotations of
    <labels_file>, then exposes:
      gold_labels -- label of every annotation (strings)
      spans       -- end offset minus start offset of every annotation
      sentences   -- annotated fragment, cut out of its article
      size        -- number of annotations
    """

    def __init__(self, articles_folder, labels_file):
        self.articles_folder = articles_folder
        self.labels_file = labels_file
        self.articles = read_articles_from_file_list(articles_folder)
        self.read()

    def read(self):
        """Parse the labels file and slice every annotated fragment out of its article."""
        article_ids, starts, ends, self.gold_labels = read_predictions_from_file(self.labels_file)
        # Offsets arrive as strings; convert each pair once and reuse it.
        offsets = [(int(begin), int(stop)) for begin, stop in zip(starts, ends)]
        self.spans = [stop - begin for begin, stop in offsets]
        print("Read %d annotations from %d articles" % (len(starts), len(set(article_ids))))
        self.sentences = [
            self.articles[article_id][begin:stop]
            for article_id, (begin, stop) in zip(article_ids, offsets)
        ]
        self.size = len(self.sentences)



class SLDataset(Dataset):
    """
    Dataset variant whose `clean` step reduces the text to letters and a few
    punctuation marks; `lower` selects whether the text is lower-cased first.
    """

    def __init__(self,  articles_folder=None, labels_file=None, lower=True):
        super().__init__(articles_folder, labels_file)
        self.lower = lower

    def clean(self):
        """Normalise every sentence in place (replaces `self.sentences`)."""
        lowercase = self.lower
        # Characters outside this set are deleted.
        retain = (
            '[^abcdefghijklmnopqrstuvwxyz!#?". ]'
            if lowercase
            else '[^abcdefghijklmnopqrstuvwxyzQWERTYUIOPASDFGHJKLZXCVBNM!#?". ]'
        )
        # Regex substitutions, applied in this order after URL removal.
        substitutions = (
            ('[“"”]', ' " '),
            ('[()–-]', ' '),
            (retain, ''),
            ('[.]', ' . '),
        )
        # Marks that get surrounded by spaces so they become separate tokens.
        padded_marks = (('?', ' ? '), ('#', ' # '), ('!', ' ! '))

        def scrub(raw):
            cleaned = raw.lower() if lowercase else raw
            cleaned = re.sub(r'^https?:\/\/.*[\r\n]*', '', cleaned, flags=re.MULTILINE)
            for pattern, replacement in substitutions:
                cleaned = re.sub(pattern, replacement, cleaned)
            for mark, padded in padded_marks:
                cleaned = cleaned.replace(mark, padded)
            # Collapse every run of whitespace into a single space.
            return ' '.join(cleaned.split())

        print("Cleaning Sentences")
        self.sentences = list(map(scrub, self.sentences))

class TransformerDataset(Dataset):
    """
    Dataset variant that prepares the fragments for a BERT-style model.

    Typical use: construct, then call `tokenize()`, `encode()` and `load()`
    in that order; afterwards `dataloader` yields
    `(input_ids, attention_mask, labels)` batches.
    """

    # Lazily created tokenizer shared by every instance that does not get an
    # explicit one, so the pretrained vocabulary is loaded at most once and
    # only when it is really needed.
    _default_tokenizer = None

    def __init__(self, articles_folder=None, labels_file=None, label_encoder=None):
        """
        label_encoder: optional, already fitted encoder (an object with
            `transform` / `inverse_transform`). Pass the encoder of another
            split to guarantee both splits share the same label-to-id
            mapping. When omitted, a new encoder is fitted on this split's
            own labels.
        """
        super().__init__(articles_folder, labels_file)
        self.clean()
        self.sentences = ["[CLS] " + sentence + " [SEP]" for sentence in self.sentences]
        if label_encoder is None:
            self.le = LE()
            self.labels = self.le.fit_transform(self.gold_labels)
        else:
            self.le = label_encoder
            self.labels = self.le.transform(self.gold_labels)

    def clean(self):
        """
        Remove lines that start with a URL, put spaces around double quotes
        and collapse whitespace. No other character is filtered.
        """
        def text_clean(text):
            text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
            text = re.sub('[“"”]', ' " ', text)
            return ' '.join(text.split())

        print("Cleaning Sentences")
        self.sentences = [text_clean(sentence) for sentence in self.sentences]

    def tokenize(self, tokenizer=None, verbosity=True):
        """
        Split every sentence into tokens (stored in `self.tokenized_texts`).

        tokenizer: tokenizer to use; defaults to the pretrained
            'bert-base-uncased' BertTokenizer.
        verbosity: when true, print the tokens of the first sentence.
        """
        if tokenizer is None:
            cls = type(self)
            if cls._default_tokenizer is None:
                cls._default_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenizer = cls._default_tokenizer
        self.tokenizer = tokenizer
        print("Tokenizing")
        self.tokenized_texts = [self.tokenizer.tokenize(sent) for sent in self.sentences]
        if verbosity and self.tokenized_texts:
            print("Tokenized \n", self.tokenized_texts[0])

    def encode(self, MAX_LEN=90):
        """
        Convert the tokens to ids, pad / truncate every sequence at the end
        to <MAX_LEN> and build the tensors `inputs`, `masks` and `labels`.
        """
        input_ids = [
            self.tokenizer.convert_tokens_to_ids(tokens)
            for tokens in tqdm_notebook(self.tokenized_texts)
        ]
        input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
        self.inputs = torch.tensor(input_ids)
        # Mask is 1.0 for every non-zero id and 0.0 for the zero-valued padding.
        self.masks = (self.inputs > 0).float()
        # as_tensor keeps this method safe to call again once labels is a tensor.
        self.labels = torch.as_tensor(self.labels)

    def load(self, batch_size=32):
        """Wrap the tensors in a TensorDataset and create `self.dataloader`."""
        self.data = TensorDataset(self.inputs, self.masks, self.labels)
        # Batches keep the original example order (shuffle=False), so outputs
        # collected from this loader line up with the order of gold_labels.
        self.dataloader = DataLoader(self.data, shuffle=False, batch_size=batch_size)

In [0]:
# Locations of the article folders and the tab-separated label files.
# Check that the path to the datasets folder is correct; if not, adjust
# these variables accordingly.
train_folder = "datasets/train-articles"
dev_folder = "datasets/dev-articles"
train_labels_file = "datasets/train-task2-TC.labels"
dev_labels_file = "datasets/dev-task-TC.labels"

In [0]:
# Dev split for the transformer: read and clean the fragments, then
# tokenize them, turn them into padded id tensors and build the DataLoader.
dev=TransformerDataset(dev_folder, dev_labels_file)
dev.tokenize()
dev.encode()
dev.load()

In [0]:
# Train split for the transformer, prepared with the same four steps.
train=TransformerDataset(train_folder, train_labels_file)
train.tokenize()
train.encode()
train.load()

In [0]:
# With no arguments the wrapper loads 'bert-base-uncased' with 14 labels.
model_=TransformerModel()
# Default freeze(): only parameters whose name contains 'classifier',
# 'pooler', '11' or '10' remain trainable.
model_.freeze()
# Default optimizer and scheduler, with learning rate 1e-4.
model_.updater(lr=1e-4)
# Train for 4 epochs; accuracy on train and dev is printed after each epoch.
model_.train(train,dev,epochs=4)

In [0]:
# Save only the weights (state_dict) of the fine-tuned model.
PATH="./BERT.pth"
torch.save(model_.transformer.state_dict(), PATH)

In [0]:
# Despite the "_probs" names these are the raw model outputs returned by
# TransformerModel.logits (no softmax is applied), one array per example.
bert_dev_probs=model_.logits(dev)
bert_train_probs=model_.logits(train)

In [0]:
# Same splits again, this time cleaned for the TF-IDF based model.
ML_train=SLDataset(train_folder, train_labels_file)
ML_dev=SLDataset(dev_folder, dev_labels_file)
ML_train.clean()
ML_dev.clean()
# Fills features.train_features and features.dev_features.
features=FeatureExtraction(ML_train,ML_dev)
features.get_features()

In [0]:
# Logistic regression on the TF-IDF + span features; its class
# probabilities on both splits become inputs of the stacked model.
model_lr=LogisticRegression(penalty='l2', class_weight='balanced', solver="liblinear")
model_lr.fit(features.train_features, ML_train.gold_labels)
lr_train=model_lr.predict_proba(features.train_features)
lr_dev=model_lr.predict_proba(features.dev_features)

In [0]:
from sklearn import preprocessing
# axis=1 standardises every example (row) on its own: the BERT outputs of
# one example are centred and scaled across the classes.
_bert_train_probs=preprocessing.scale(bert_train_probs,axis=1)
_bert_dev_probs=preprocessing.scale(bert_dev_probs,axis=1)

In [0]:
# Meta-classifier of the stacked ensemble (same settings as model_lr).
meta=LogisticRegression(penalty='l2', class_weight='balanced', solver="liblinear")

In [0]:
# Stack, per example, the logistic-regression probabilities and the
# standardised BERT outputs side by side; fit on train, predict on dev.
# This relies on both pipelines keeping the examples in the same order.
meta.fit(np.concatenate([lr_train, _bert_train_probs],axis=1), ML_train.gold_labels)
ans=meta.predict(np.concatenate([lr_dev, _bert_dev_probs],axis=1))

In [0]:
# Fraction of dev examples whose stacked prediction equals the gold label.
print((ans==ML_dev.gold_labels).sum()/ML_dev.size)