In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from collections import Counter
import re
import warnings
warnings.filterwarnings("ignore")
from tqdm.auto import tqdm
import os
import sys
import torch
import csv
import numpy as np
import random
import time
import datetime
import pprint
pd.set_option('max_colwidth',150)

In [2]:
from torch.nn import CrossEntropyLoss, MSELoss,BCEWithLogitsLoss

class ContextualBertForSequenceClassification(torch.nn.Module):
  
  def __init__(self, num_labels, ContextModel, SpanModel):
    super(ContextualBertForSequenceClassification, self).__init__()
    self.ContextModel = ContextModel
    self.SpanModel = SpanModel
    self.num_labels = num_labels

    # self.classifier = torch.nn.Linear(768*2, num_labels)
    # self.classifier1 = torch.nn.Linear(768, num_labels)
    self.classifier2 = torch.nn.Linear(768+128, num_labels)
    self.reduce_classifier = torch.nn.Linear(768, 128)
    self.dropout = torch.nn.Dropout(0.1)

  def forward(
      self,
      span_input_ids,
      span_attention_mask,
      context_input_ids,
      context_attention_mask,
      labels=None
  ):
    context_outputs = self.ContextModel(
        input_ids=context_input_ids,
        attention_mask=context_attention_mask
    )
    context_outputs = context_outputs[1] # pooler output
    span_outputs = self.SpanModel(
        input_ids=span_input_ids,
        attention_mask=span_attention_mask
    )
    span_outputs = span_outputs[1]

    context_outputs = self.reduce_classifier(context_outputs)
    pooled_output = torch.cat((span_outputs, context_outputs), axis=1)

    pooled_output = self.dropout(pooled_output)

    logits = self.classifier2(pooled_output)
    outputs = (logits,)
    if labels is not None:
      if self.num_labels == 1:
        loss_fct = MSELoss()
        loss = loss_fct(logits.view(-1), labels.view(-1))
      else:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
      outputs = (loss,) + outputs

    return outputs

In [3]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
#from transformers import RobertaModel
from transformers import BertModel
#from transformers import RobertaForSequenceClassification
import time,sys

model_name = 'bert-base-cased'
context_model = BertModel.from_pretrained(model_name)
span_model = BertModel.from_pretrained(model_name)
model = ContextualBertForSequenceClassification(19, context_model, span_model)
model.cuda()
model.load_state_dict(torch.load('19class.pth'))
model.eval()
"""model_binary = ContextualBertForSequenceClassification(1, context_model, span_model)
model_binary.cuda()
model_binary.load_state_dict(torch.load('binary.pth'))
model_binary.eval()"""

"model_binary = ContextualBertForSequenceClassification(1, context_model, span_model)\nmodel_binary.cuda()\nmodel_binary.load_state_dict(torch.load('binary.pth'))\nmodel_binary.eval()"

In [4]:
from pathlib import Path
from preprocess_c import *
train_path  = 'data/data_propoganda/data/protechn_corpus_eval/train'
test_path = 'data/data_propoganda/data/protechn_corpus_eval/test'
dev_path = 'data/data_propoganda/data/protechn_corpus_eval/dev'

def make_dset(path):
    path_ = Path(path)
    a = make_dataset(path_)
    df_1 = pd.DataFrame(columns=['id','full_sent','start_sent','end_sent','start_prop','end_prop','prop','??','???'])
    for dm in a:
        df_t = pd.DataFrame(dm,columns =['id','full_sent','start_sent','end_sent','start_prop','end_prop','prop','??','???'] )
        df_1 = df_1.append(df_t,ignore_index= True)
    return df_1.iloc[:,:-2]

df_train = make_dset(train_path)
df_test = make_dset(test_path)

In [5]:
binary_tagging = [0 if i == 'O' else 1 for i in df_train.prop.values ]
df_train['binary'] = binary_tagging

In [6]:
mapping = {'O':0,'Loaded_Language':1,'Name_Calling,Labeling':2,'Repetition':3,
           'Exaggeration,Minimisation':4,'Doubt':5,'Appeal_to_fear-prejudice':6,'Flag-Waving':7,'Causal_Oversimplification':8,
           'Slogans':9,'Appeal_to_Authority':10,'Black-and-White_Fallacy':11,'Thought-terminating_Cliches':12,'Whataboutism':13,
           'Reductio_ad_hitlerum':14,'Red_Herring':15,'Bandwagon':16,'Obfuscation,Intentional_Vagueness,Confusion':17,'Straw_Men':18}
#df_train = df_train[df_train.binary !=0]
rev_mapping = {v:k for k,v in mapping.items()}


df_train['prop_1'] = df_train.prop.apply(lambda x: mapping[x])
df_test['prop_1'] = df_test.prop.apply(lambda x: mapping[x])
df_train

Unnamed: 0,id,full_sent,start_sent,end_sent,start_prop,end_prop,prop,binary,prop_1
0,111111112,Pamela Geller and Robert Spencer co-founded anti-Muslim group Stop Islamization of America.,129,220,191,220,Slogans,1,9
1,111111112,"He added: ""We condemn all those whose behaviours and views run counter to our shared values and will not stand for extremism in any form.""",465,603,476,556,Black-and-White_Fallacy,1,11
2,111111112,"Ms Geller, of the Atlas Shrugs blog, and Mr Spencer, of Jihad Watch, are also co-founders of the American Freedom Defense Initiative, best known f...",622,838,785,798,Slogans,1,9
3,111111112,"On both of their blogs the pair called their bans from entering the UK ""a striking blow against freedom"" and said the ""the nation that gave the wo...",839,1014,911,942,Loaded_Language,1,1
4,111111112,"On both of their blogs the pair called their bans from entering the UK ""a striking blow against freedom"" and said the ""the nation that gave the wo...",839,1014,958,1014,Loaded_Language,1,1
...,...,...,...,...,...,...,...,...,...
15745,999001621,This is a Moon of Alabama fundraiser week.,12271,12313,0,0,O,0,0
15746,999001621,No one pays me to write these blog posts.,12314,12355,0,0,O,0,0
15747,999001621,"If you appreciated this one, or any of the 7,000+ others, please consider a donation.",12356,12441,0,0,O,0,0
15748,999001621,"Posted by b on November 29, 2018 at 10:23 AM | Permalink",12442,12498,0,0,O,0,0


In [7]:
import glob
import os
train_direct = glob.glob('data/data_propoganda/data/protechn_corpus_eval/train/*.txt')
test_direct = glob.glob('data/data_propoganda/data/protechn_corpus_eval/test/*.txt')

def read_articles(dire,mode = 'train'):
  articles = []
  
  for filename in dire:
      myfile = open(filename,encoding='utf8')
      article = myfile.read()
      articles.append(article)
      myfile.close()
  article_ids = []
  
  for filename in dire:
    if mode =='train':
      article_ids.append(filename[60:-4])
    else:
      article_ids.append(filename[59:-4])
  
  return articles, article_ids
articles,art_ids = read_articles(train_direct)

articles_t,art_ids_t = read_articles(test_direct,'test')
id2art ={i:a for a,i in zip(articles,art_ids)}
id2art_t = {i:a for a,i in zip(articles_t,art_ids_t)}

In [8]:
def get_context(article, span, mode='sentence',set = 'train'):
  article = id2art[article] if set =='train' else id2art_t[article]
  def get_num_words(sentence):
    return len(sentence.split(' '))
  if mode == "title":
    return article.split('\n')[0]
  if mode == "sentence":
    WORD_LEN_LIMIT = 120
    li = span[0]
    ri = span[1]
    span_text = article[li: ri]
    num_words = get_num_words(span_text)
    if num_words >= WORD_LEN_LIMIT:
      return span_text
    remaining_len = WORD_LEN_LIMIT - num_words
    lhs_words = remaining_len // 2
    rhs_words = remaining_len - lhs_words
    li -= 1
    lcount = 0
    while li >= 0 and article[li-1] != '\n' and lcount < lhs_words:
      if article[li] == ' ':
        lcount += 1
      li -= 1
    ri += 1
    rcount = 0
    while ri < len(article) and article[ri] != '\n' and rcount < rhs_words:
      if article[ri] == ' ':
        rcount += 1
      ri += 1
    return article[li+1: ri - 1] 

  return ""
spans_1 = [(i,k,j) for i,k,j in zip(df_train.id,df_train.start_sent,df_train.end_sent)]
spans_2 = [(i,k,j) for i,k,j in zip(df_test.id,df_test.start_sent,df_test.end_sent)]
df_train['context'] = [get_context(i,(s,e)) for i,s,e in spans_1]
df_test['context'] = [get_context(i,(s,e),set='test') for i,s,e in spans_2]

In [9]:
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import BertTokenizer

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [10]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 4
EPOCHS = 20
LEARNING_RATE = 1e-05

In [11]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.full_sent[index])
        title = " ".join(title.split())
        context = str(self.data.context[index])
        context = " ".join(context.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        tokenized_context = self.tokenizer.encode_plus(context,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            pad_to_max_length=True,
                                            return_attention_mask=True,truncation = True)

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        c_ids = tokenized_context['input_ids']
        c_mask = tokenized_context['attention_mask']
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.prop_1[index], dtype=torch.long),
            'c_ids':torch.tensor(c_ids, dtype=torch.long),
            'c_mask': torch.tensor(c_mask,dtype= torch.long)
        } 
    
    def __len__(self):
        return self.len

In [12]:
class Triage_binary(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.full_sent[index])
        title = " ".join(title.split())
        context = str(self.data.context[index])
        context = " ".join(context.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        tokenized_context = self.tokenizer.encode_plus(context,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            pad_to_max_length=True,
                                            return_attention_mask=True,truncation = True)

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        c_ids = tokenized_context['input_ids']
        c_mask = tokenized_context['attention_mask']
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.binary[index], dtype=torch.long),
            'c_ids':torch.tensor(c_ids, dtype=torch.long),
            'c_mask': torch.tensor(c_mask,dtype= torch.long)
        } 
    
    def __len__(self):
        return self.len

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

In [14]:
def make_loader(seed=126):
    test_set = df_train.sample(32,random_state = seed).reset_index()
    print("TEST Dataset: {}".format(test_set.shape))
    
    testing_set = Triage(test_set, tokenizer, MAX_LEN)
    #testing_set_binary = Triage_binary(test_set,tokenizer,MAX_LEN)
    



    testing_loader = DataLoader(testing_set, **test_params)
    #testing_loader_bin  =DataLoader(testing_set_binary, **test_params)
    return test_set,testing_loader

In [15]:
test_set,testing_loader = make_loader()

TEST Dataset: (32, 11)


In [16]:

def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

def get_model_predictions(model, dataloader,mode = 'multi'):
  model.eval()
  predictions , true_labels = [], []
  nb_eval_steps = 0
  for batch in dataloader:
    b_input_ids = batch['ids'].to(device)
    b_labels = batch['targets'].to(device)
    b_input_mask = batch['mask'].to(device)
    b_c_input_ids = batch['c_ids'].to(device)
    b_c_input_mask = batch['c_mask'].to(device)
    
    with torch.no_grad():        
      logits = model(b_input_ids, 
                     b_input_mask,
                     b_c_input_ids, 
                     b_c_input_mask)
    logits = logits[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    pred_label = np.argmax(logits, axis=1) if mode =='multi' else [1 if a >0.5 else 0 for a in logits]
    predictions.extend(pred_label)
    true_labels.extend(label_ids)
  return predictions, true_labels



In [17]:
from sklearn.metrics import classification_report

def show_results(df,model,loader):
    pred,true = get_model_predictions(model, loader)
    
    accuracy = sum([a==b for a,b in zip(pred,true)])/len(pred)
    print('Accuracy on validation set:',accuracy)
    predvstrue = {'full_sentence':df.full_sent,'pred':pred,'true':true}
    predvstrue = pd.DataFrame(predvstrue)
    display(predvstrue)
    predvstrue['pred_lab'] = predvstrue.pred.apply(lambda x: rev_mapping[x])
    predvstrue['true_lab'] = predvstrue.true.apply(lambda x: rev_mapping[x])
    
    
    print(classification_report(predvstrue.true_lab,predvstrue.pred_lab
                                
                               
                                
                               ))
    

In [30]:
show_results(test_set,model,testing_loader)

Accuracy on validation set: 0.90625


Unnamed: 0,full_sentence,pred,true
0,"No, that’s Satan.",0,0
1,How in the world can we trust them even as they express bias while investigating malfeasance?,3,2
2,"And again in 1066, rioting Muslims, enraged by the humiliation of a Jew who had been appointed to rule over Muslims, murdered four thousand Jews i...",5,5
3,"The unnamed US citizen assigned to the consulate in Guangzhou had reported a variety of ""physical symptoms"" dating from late 2017 to April this ye...",0,0
4,"The immigrant employment index, set to 100.0 in January 2009, fell to 124.7 from 129.6 in April.",1,1
5,Habib-Powell had attended the Iftar dinner with members of Muslim Brotherhood front groups.,0,0
6,Who should replace Nikki Haley as our ambassador to the U.N.?,0,0
7,More than 100 Russian individuals and companies have been sanctioned for a variety of reasons.,0,0
8,"This doltish and dimwitted document wreaks of psycho babble, insults the intelligence of young people, and will destroy the future of the Church.",0,0
9,"His findings, he wrote, do “not substantiate the notion that the vote was motivated by anti-Semitism,” which he defined as hostility toward or dis...",0,0


                           precision    recall  f1-score   support

                    Doubt       1.00      1.00      1.00         3
Exaggeration,Minimisation       0.00      0.00      0.00         1
          Loaded_Language       1.00      0.75      0.86         4
    Name_Calling,Labeling       0.00      0.00      0.00         1
                        O       1.00      1.00      1.00        22
               Repetition       0.50      1.00      0.67         1

                 accuracy                           0.91        32
                macro avg       0.58      0.62      0.59        32
             weighted avg       0.92      0.91      0.91        32



In [None]:
test_set,testing_loader = make_loader(seed=1236)
show_results(test_set,model,testing_loader)

In [22]:
def demo(sent,label=-1):
    
    datf = pd.DataFrame(np.array([[sent,label,sent]])
                   ,columns = ['full_sent','prop_1','context'])
    
    datf.prop_1 = datf.prop_1.astype('int')
    present = Triage(datf,tokenizer,MAX_LEN)
    pre_loader = DataLoader(present,**test_params)
    pred,true = get_model_predictions(model, pre_loader,'multi')
    
    classify = rev_mapping[pred[0]]
    
    if label ==-1:
        return str(classify)

    

In [27]:
Trump_tweets = ["An 'extremely credible' source has called my office and told me that @BarackObama's birth certificate is a fraud.",
                "Sadly, because president Obama has done such a poor job as a president, you won't see another black president for generations!",
                'RIGGED ELECTION!',
                "On behalf of the entire Trump Family, I want to wish everyone a healthy and happy thanksgiving. ",
               'The concept of global warming was created by and for the Chinese in order to make U.S. manufacturing non-competitive.',
               'I have never seen a thin person drinking Diet Coke.'
                
               ]

In [31]:
for tweets in Trump_tweets:
    print('Sentence:',tweets,'\n','Classification:',demo(tweets),'\n')

Sentence: An 'extremely credible' source has called my office and told me that @BarackObama's birth certificate is a fraud. 
 Classification: Exaggeration,Minimisation 

Sentence: Sadly, because president Obama has done such a poor job as a president, you won't see another black president for generations! 
 Classification: Exaggeration,Minimisation 

Sentence: RIGGED ELECTION! 
 Classification: Loaded_Language 

Sentence: On behalf of the entire Trump Family, I want to wish everyone a healthy and happy thanksgiving.  
 Classification: O 

Sentence: The concept of global warming was created by and for the Chinese in order to make U.S. manufacturing non-competitive. 
 Classification: O 

Sentence: I have never seen a thin person drinking Diet Coke. 
 Classification: O 



In [None]:
random_sent = [('Until forced to act by a worldwide storm of outrage','Loaded_Language'), 
               ('Can the same be said for the Obama Administration?','Doubt'),
               ('"BUILD THE WALL!” Trump tweeted','Slogans'),
               ('Heal the situation of extremely grave immoral behavior','Exaggeration, minimization'),
               ('Dismissing the protesters as “lefties” and hugging Barros publicly','Name_Calling, Labeling')
              ]

In [None]:
for s,l in random_sent:
    print('Sentence:',s,'\n','Classification:',demo(s),'\n','True label:',l,'\n')