In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter
import re
import warnings
warnings.filterwarnings("ignore")
from tqdm.auto import tqdm
import os
import sys
import torch
import csv
import numpy as np
import random
import time
import datetime
import pprint
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import BertTokenizer
pd.set_option('max_colwidth',150)
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 4
EPOCHS = 20
LEARNING_RATE = 1e-05

In [2]:
from torch.nn import CrossEntropyLoss, MSELoss,BCEWithLogitsLoss

class ContextualBertForSequenceClassification(torch.nn.Module):
  
  def __init__(self, num_labels, ContextModel, SpanModel):
    super(ContextualBertForSequenceClassification, self).__init__()
    self.ContextModel = ContextModel
    self.SpanModel = SpanModel
    self.num_labels = num_labels

    # self.classifier = torch.nn.Linear(768*2, num_labels)
    # self.classifier1 = torch.nn.Linear(768, num_labels)
    self.classifier2 = torch.nn.Linear(768+128, num_labels)
    self.reduce_classifier = torch.nn.Linear(768, 128)
    self.dropout = torch.nn.Dropout(0.1)

  def forward(
      self,
      span_input_ids,
      span_attention_mask,
      context_input_ids,
      context_attention_mask,
      labels=None
  ):
    context_outputs = self.ContextModel(
        input_ids=context_input_ids,
        attention_mask=context_attention_mask
    )
    context_outputs = context_outputs[1] # pooler output
    span_outputs = self.SpanModel(
        input_ids=span_input_ids,
        attention_mask=span_attention_mask
    )
    span_outputs = span_outputs[1]

    context_outputs = self.reduce_classifier(context_outputs)
    pooled_output = torch.cat((span_outputs, context_outputs), axis=1)

    pooled_output = self.dropout(pooled_output)

    logits = self.classifier2(pooled_output)
    outputs = (logits,)
    if labels is not None:
      if self.num_labels == 1:
        loss_fct = MSELoss()
        loss = loss_fct(logits.view(-1), labels.view(-1))
      else:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
      outputs = (loss,) + outputs

    return outputs

In [3]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
#from transformers import RobertaModel
from transformers import BertModel
#from transformers import RobertaForSequenceClassification
import time,sys
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model_name = 'bert-base-cased'
context_model = BertModel.from_pretrained(model_name)
span_model = BertModel.from_pretrained(model_name)
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

model_binary = ContextualBertForSequenceClassification(1, context_model, span_model)
model_binary.cuda()
model_binary.load_state_dict(torch.load('binary.pth'))
model_binary.eval()

ContextualBertForSequenceClassification(
  (ContextModel): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), 

In [25]:
from sklearn.metrics import classification_report
def get_model_predictions(model, dataloader,mode = 'multi'):
  model.eval()
  predictions , true_labels = [], []
  nb_eval_steps = 0
  for batch in dataloader:
    b_input_ids = batch['ids'].to(device)
    b_labels = batch['targets'].to(device)
    b_input_mask = batch['mask'].to(device)
    b_c_input_ids = batch['c_ids'].to(device)
    b_c_input_mask = batch['c_mask'].to(device)
    
    with torch.no_grad():        
      logits = model(b_input_ids, 
                     b_input_mask,
                     b_c_input_ids, 
                     b_c_input_mask)
    logits = logits[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    pred_label = np.argmax(logits, axis=1) if mode =='multi' else [1 if a >0.5 else 0 for a in logits]
    predictions.extend(pred_label)
    true_labels.extend(label_ids)
  return predictions, true_labels


def show_results(df,model,loader):
    pred,true = get_model_predictions(model, loader,'binary')
    
    accuracy = sum([a==b for a,b in zip(pred,true)])/len(pred)
    print('Accuracy on validation set:',accuracy)
    predvstrue = {'full_sentence':df.full_sent,'pred':pred,'true':true}
    display(pd.DataFrame(predvstrue).sample(15,random_state=126))
    #tar_names = list(mapping.keys())[:6]
    print(classification_report(pred,true
                                
                                #,target_names = tar_names
                                
                               ))

In [5]:
class Triage_binary(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.full_sent[index])
        title = " ".join(title.split())
        context = str(self.data.context[index])
        context = " ".join(context.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        tokenized_context = self.tokenizer.encode_plus(context,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            pad_to_max_length=True,
                                            return_attention_mask=True,truncation = True)

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        c_ids = tokenized_context['input_ids']
        c_mask = tokenized_context['attention_mask']
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.binary[index], dtype=torch.long),
            'c_ids':torch.tensor(c_ids, dtype=torch.long),
            'c_mask': torch.tensor(c_mask,dtype= torch.long)
        } 
    
    def __len__(self):
        return self.len

In [6]:
def demo(sent):
    label = 0
    datf = pd.DataFrame(np.array([[sent,label,sent]])
                   ,columns = ['full_sent','binary','context'])
    
    datf.binary = datf.binary.astype('int')
    present = Triage_binary(datf,tokenizer,MAX_LEN)
    pre_loader = DataLoader(present,**test_params)
    pred,true = get_model_predictions(model_binary, pre_loader,'binary')
   
    if pred[0] == 1:
        
        return 'Propaganda.'
    else:
       return 'Non-propaganda.'

In [7]:
model_binary

ContextualBertForSequenceClassification(
  (ContextModel): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), 

In [8]:
num = sum([param.nelement() for param in model_binary.parameters()])
print('Total parameters of model',num)

Total parameters of model 216719873


In [38]:
demo('The weather today is very nice.')

'Non-propaganda.'

In [39]:
demo('We want you! Join the military.')

'Propaganda.'

In [35]:
Trump_tweets = ["An 'extremely credible' source has called my office and told me that @BarackObama's birth certificate is a fraud.",
                "Sadly, because president Obama has done such a poor job as a president, you won't see another black president for generations!",
                'RIGGED ELECTION!',
                "On behalf of the entire Trump Family, I want to wish everyone a healthy and happy thanksgiving. ",
               'The concept of global warming was created by and for the Chinese in order to make U.S. manufacturing non-competitive.',
               'I have never seen a thin person drinking Diet Coke.'
                
               ]

In [40]:
for tweets in Trump_tweets:
    print('Sentence:',tweets,'\n','Classification:',demo(tweets),'\n')

Sentence: An 'extremely credible' source has called my office and told me that @BarackObama's birth certificate is a fraud. 
 Classification: Propaganda. 

Sentence: Sadly, because president Obama has done such a poor job as a president, you won't see another black president for generations! 
 Classification: Propaganda. 

Sentence: RIGGED ELECTION! 
 Classification: Propaganda. 

Sentence: On behalf of the entire Trump Family, I want to wish everyone a healthy and happy thanksgiving.  
 Classification: Non-propaganda. 

Sentence: The concept of global warming was created by and for the Chinese in order to make U.S. manufacturing non-competitive. 
 Classification: Non-propaganda. 

Sentence: I have never seen a thin person drinking Diet Coke. 
 Classification: Non-propaganda. 



In [13]:
def demo_scrape():
    scrape = pd.read_excel('External Data colection.xlsx',keep_default_na = False).reset_index()
    scrape.Label = scrape.Label.apply(lambda x: 1 if x == 'x' else 0)
    scrape.columns = ['index','full_sent','binary']
    scrape['context'] = scrape.full_sent
    present = Triage_binary(scrape,tokenizer,MAX_LEN)
    s_loader = DataLoader(present,**test_params)
    show_results(scrape,model_binary,s_loader)

In [41]:
demo_scrape()

Accuracy on validation set: 0.7184466019417476


Unnamed: 0,full_sentence,pred,true
76,"And that is devastating, no matter who ends up in the White House.”",0,0
177,"Trump, he said, lied throughout the debate, including a brazenly false claim that the nation was “rounding the turn” on the coronavirus pandemic d...",0,0
8,"In Vietnam this week, rainfall described as “extraordinarily out of the normal”—so heavy that “it far exceeded the government’s midrange predictio...",0,1
38,"On Monday, the Global Times published a piece suggesting that the American political system would collapse shortly after the election.",0,0
157,That’s why they are forking over cash hand over fist to reelect the most openly racist president in modern history.”,0,0
79,"Left-leaning daily Liberation noted that while the election could still go either way, “Trumpsim is here, alive and well”, and even if he took the...",0,0
200,"Despite being pressed by many advocates who were alarmed at these delays, USCIS refused to transition to virtual platforms to continue operations,...",0,0
60,Forget voter ID voter IQ needs sorting.,0,0
115,"Conversely, of the voters (42 percent) who say rebuilding the economy is more important now, even to the detriment of efforts to contain the Chine...",0,0
181,"“So, he’s effectively throwing them under the bus there, too,” Oliver said.",0,0


              precision    recall  f1-score   support

           0       0.66      0.86      0.74        98
           1       0.82      0.59      0.69       108

    accuracy                           0.72       206
   macro avg       0.74      0.72      0.72       206
weighted avg       0.74      0.72      0.71       206

