***Description***
<div> This notebook contains baseline experiments for this project. First, I use VADER (sentiment) to predict the  subjectivity in the data. However, since subjective materials are not always sentimental, the accuracy were very low. Then, I trained end-to-end SVM and BERT models on the NYTAC finance data (1996 & 2005) to make prediction on NYTAC data from 1986.

In [None]:
#!pip install transformers

In [2]:
import numpy as np
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#from sklearn.utils.class_weight import compute_class_weight
import transformers
from transformers import BertTokenizerFast, BertForSequenceClassification
import glob, os
from tqdm import trange
from torch.nn import functional as F

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/users/rldall/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Import data

### NYTAC

In [45]:
def select_files(path, startwith):
    list_of_files = []
    files = os.listdir(path)
    for file in files:
        if file.startswith(startwith):
            list_of_files.append(str(path)+str(file))
            
    return list_of_files

In [6]:
train_df = pd.read_csv('/data/ProcessedNYT/train_finance.txt', usecols=[0,1], sep='\t', header=None)

In [7]:
train_labels = train_df[0].values
train_X = train_df[1].values

In [48]:
list_of_files = select_files('/data/ProcessedNYT/','test')
list_of_dfs = [pd.read_csv(file, sep='\t', usecols=[0,1], header=None) for file in list_of_files]

In [49]:
val_ratio = 0.2
seed = 32
batch_size = 8

### Webis-editorial-16

Note: All articles in the Webis are editorials, but this is for the purpose of testing on different publishers - to see how our ML models generalize outside the NYT corpus, which they were trained on (Finance, Years 1996 & 2005).

In [39]:
path = '/data/ArgFeatModel/corpus-webis-editorials-16/annotated-txt/split-by-portal-final'
publist = os.listdir(path)

In [40]:
def extract_df(filepath):
    main_df = pd.DataFrame(columns=['unit'])

#    for filename in glob.glob(os.path.join(path, '*.txt')): ###
    with open(os.path.join(os.getcwd(), filepath), 'r') as f: 
        lines = f.readlines()
            #lines.remove('-1\tpar-sep\t\n') ###
        this_lines_df = pd.DataFrame(lines, columns=['unit'])
        main_df = pd.concat([main_df,this_lines_df]) ### ###
        
    main_df = main_df['unit'].str.split('\t',expand=True)
    main_df = main_df[[2]].replace('\n','', regex=True)

    return ' '.join(main_df[2])

In [41]:
# read all publishers into df and keep in list

pub_df_list = []
for pub in publist:
    pub_text = []
    for file in glob.glob(os.path.join(path+'/'+pub, '*.txt')):
        text = ''
        text = extract_df(file)
        pub_text.append(text)
    pub_df = pd.DataFrame({0:1,1:pub_text})
    pub_df_list.append(pub_df)

# End-to-end Sentiment Detection Experiment

## VADER

In [3]:
sid = SentimentIntensityAnalyzer()

In [10]:
def format_output(output_dict):
    polarity = 0
    if(output_dict['compound']>= 0.05):
        polarity = 1
    elif(output_dict['compound']<= -0.05):
        polarity = 1
    return polarity

def predict_sentiment(text):
    output_dict =  sid.polarity_scores(text)
    return format_output(output_dict)

In [11]:
X_sent = []

for x in train_X:
    X_sent.append(predict_sentiment(x))
    
print(classification_report(train_labels, X_sent))

              precision    recall  f1-score   support

           0       0.81      0.02      0.04      2276
           1       0.18      0.98      0.31       507

    accuracy                           0.20      2783
   macro avg       0.50      0.50      0.17      2783
weighted avg       0.70      0.20      0.09      2783



In [22]:
# evaluating the model, using data from different topics/year
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    
    test_labels = df[0].values
    test_X = df[1].values
    test_X_sent = []

    for x in test_X:
        test_X_sent.append(predict_sentiment(x))
    
    print(classification_report(test_labels, test_X_sent))

Evaluating: /data/ProcessedNYT/test_military.txt
              precision    recall  f1-score   support

           0       0.80      0.01      0.01       544
           1       0.10      0.98      0.18        61

    accuracy                           0.11       605
   macro avg       0.45      0.50      0.10       605
weighted avg       0.73      0.11      0.03       605

Evaluating: /data/ProcessedNYT/test_law.txt
              precision    recall  f1-score   support

           0       0.75      0.01      0.02       480
           1       0.21      0.98      0.35       130

    accuracy                           0.22       610
   macro avg       0.48      0.50      0.19       610
weighted avg       0.64      0.22      0.09       610

Evaluating: /data/ProcessedNYT/test_finance.txt
              precision    recall  f1-score   support

           0       0.83      0.01      0.02       446
           1       0.12      0.98      0.22        64

    accuracy                           0.

# End-to-end ML Experiment

## SVM Classifier Experiment

In [59]:
# Indices of the train and validation splits stratified by labels
X_train, X_val, y_train, y_val = train_test_split(
    train_X, train_labels,
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state=seed)

#print(len(X_train),len(X_test),len(y_train),len(y_test))

In [60]:
# encoding labels
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_val = Encoder.fit_transform(y_val)

# transform text
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(train_X)
Train_X_Tfidf = Tfidf_vect.transform(X_train)
Val_X_Tfidf = Tfidf_vect.transform(X_val)

In [61]:
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, y_train)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Val_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score: ",accuracy_score(predictions_SVM, y_test))
print(classification_report(predictions_SVM, y_test))

SVM Accuracy Score:  0.926391382405745
              precision    recall  f1-score   support

           0       0.98      0.94      0.96       477
           1       0.69      0.88      0.77        80

    accuracy                           0.93       557
   macro avg       0.84      0.91      0.86       557
weighted avg       0.94      0.93      0.93       557



In [62]:
# cross-genres
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    
    test_tfidf = Tfidf_vect.transform(df[1])
    test_labels = Encoder.fit_transform(df[0])
    
    preds = SVM.predict(test_tfidf)
    print(classification_report(preds, test_labels))

Evaluating: /data/ProcessedNYT/test_military.txt
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       557
           1       0.67      0.85      0.75        48

    accuracy                           0.96       605
   macro avg       0.83      0.91      0.86       605
weighted avg       0.96      0.96      0.96       605

Evaluating: /data/ProcessedNYT/test_law.txt
              precision    recall  f1-score   support

           0       0.95      0.90      0.93       508
           1       0.62      0.78      0.69       102

    accuracy                           0.88       610
   macro avg       0.78      0.84      0.81       610
weighted avg       0.90      0.88      0.89       610

Evaluating: /data/ProcessedNYT/test_finance.txt
              precision    recall  f1-score   support

           0       0.98      0.94      0.95       465
           1       0.53      0.76      0.62        45

    accuracy                           0.

In [64]:
# cross-publishers
for idx,df in enumerate(pub_df_list):
    
    print('Evaluating:',publist[idx])
    
    test_tfidf = Tfidf_vect.transform(df[1])
    test_labels = Encoder.fit_transform(df[0])
    
    preds = SVM.predict(test_tfidf)
    print("SVM Accuracy Score:",accuracy_score(preds, test_labels))

Evaluating: guardian
SVM Accuracy Score: 0.64
Evaluating: foxnews
SVM Accuracy Score: 0.59
Evaluating: aljazeera
SVM Accuracy Score: 0.74


## BERT

In [23]:
# specify GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
max_seq_len = 256

# import BERT-base pretrained model
###bert = AutoModel.from_pretrained('bert-base-cased')

# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 2e-5,
                              #eps = 1e-08
                              )

model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [28]:
def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 256,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

In [29]:
token_id = []
attention_masks = []

for sample in train_X:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) 
    attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
#labels = torch.tensor(labels)
labels = torch.tensor(train_labels.astype(np.int64))
labels = F.one_hot(labels, num_classes=2)
labels = labels.float()

In [30]:
# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state=seed)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

val_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 3

for _ in trange(epochs, desc = 'Epoch'):

    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # Set model to evaluation mode
    model.eval()

    logits_list = []
    labels_list = []

    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # save for report
        logits_list.extend(logits)
        labels_list.extend(label_ids)

Epoch: 100%|██████████| 3/3 [03:01<00:00, 60.35s/it]


In [33]:
# get classification report
preds_list = list(np.argmax(logits_list,axis=1))
labels_list = list(np.argmax(labels_list,axis=1))
  
print(classification_report(labels_list, preds_list))

              precision    recall  f1-score   support

           0       0.98      0.91      0.94       456
           1       0.70      0.91      0.79       101

    accuracy                           0.91       557
   macro avg       0.84      0.91      0.87       557
weighted avg       0.93      0.91      0.92       557



In [34]:
#torch.save(model.state_dict(), 'saved_weights_end2end.pt')

#### Cross-genre testsets

In [35]:
def get_testdataloader(df,tokenzier):
    
    test_labels = df[0].values
    test_text = df[1].values
    
    test_ids = []
    test_attention_mask = []
    

    for sample in test_text:
        encoding_dict = preprocessing(sample, tokenizer)
        test_ids.append(encoding_dict['input_ids']) 
        test_attention_mask.append(encoding_dict['attention_mask'])
        
    test_ids = torch.cat(test_ids, dim = 0)
    test_attention_mask = torch.cat(test_attention_mask, dim = 0)
    test_labels = torch.tensor(test_labels.astype(np.int64))
    test_labels = F.one_hot(test_labels, num_classes=2)
    test_labels = test_labels.float()

    test_set = TensorDataset(test_ids, 
                          test_attention_mask, 
                          test_labels)
    
    return DataLoader(
            test_set,
            sampler = RandomSampler(test_set),
            batch_size = batch_size
        )

In [36]:
# Test Data

def test_model(test_dataloader, model):

    model.eval()

    test_accuracy = []
    logits_list = []
    labels_list = []

    for batch in test_dataloader:
    
        if step % 50 == 0 and not step == 0:
        # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(test_dataloader)))

    
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            test_output = model(b_input_ids, 
                                token_type_ids = None, 
                                attention_mask = b_input_mask)
        logits = test_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
    
        logits_list.extend(logits)
        labels_list.extend(label_ids)


    # get classification report
    preds_list = list(np.argmax(logits_list,axis=1))
    labels_list = list(np.argmax(labels_list,axis=1))
    
    print(classification_report(labels_list, preds_list))
    
    #return preds_list, labels_list

In [30]:
# test instance
test_0 = get_testdataloader(file_df[0],tokenizer)
test_model(test_0,model)

In [37]:
# test across all available NYTAC genres

for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    test_dataloader = get_testdataloader(df,tokenizer)
    test_model(test_dataloader,model)
    
    torch.cuda.empty_cache()
    
    print('\n')

Evaluating: /data/ProcessedNYT/test_military.txt




              precision    recall  f1-score   support

           0       1.00      0.93      0.96       544
           1       0.62      0.97      0.76        61

    accuracy                           0.94       605
   macro avg       0.81      0.95      0.86       605
weighted avg       0.96      0.94      0.94       605



Evaluating: /data/ProcessedNYT/test_law.txt




              precision    recall  f1-score   support

           0       0.98      0.88      0.92       480
           1       0.67      0.92      0.78       130

    accuracy                           0.89       610
   macro avg       0.82      0.90      0.85       610
weighted avg       0.91      0.89      0.89       610



Evaluating: /data/ProcessedNYT/test_finance.txt




              precision    recall  f1-score   support

           0       1.00      0.93      0.96       446
           1       0.68      0.97      0.80        64

    accuracy                           0.94       510
   macro avg       0.84      0.95      0.88       510
weighted avg       0.96      0.94      0.94       510



Evaluating: /data/ProcessedNYT/test_education.txt




              precision    recall  f1-score   support

           0       0.99      0.87      0.93       242
           1       0.62      0.94      0.75        54

    accuracy                           0.89       296
   macro avg       0.80      0.91      0.84       296
weighted avg       0.92      0.89      0.89       296



Evaluating: /data/ProcessedNYT/test_politics.txt




              precision    recall  f1-score   support

           0       0.99      0.92      0.96      1166
           1       0.69      0.94      0.79       208

    accuracy                           0.93      1374
   macro avg       0.84      0.93      0.87      1374
weighted avg       0.94      0.93      0.93      1374



Evaluating: /data/ProcessedNYT/test_medicine.txt




              precision    recall  f1-score   support

           0       1.00      0.89      0.94       244
           1       0.69      0.98      0.81        59

    accuracy                           0.91       303
   macro avg       0.84      0.94      0.88       303
weighted avg       0.94      0.91      0.92       303





In [43]:
# test across all available publishers in Webis-editorial-16

for idx,df in enumerate(pub_df_list):
    
    print('Evaluating:',publist[idx])
    test_dataloader = get_testdataloader(df,tokenizer)
    test_model(test_dataloader,model)
    
    torch.cuda.empty_cache()
    
    print('\n')

Evaluating: guardian


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.79      0.88       100

    accuracy                           0.79       100
   macro avg       0.50      0.40      0.44       100
weighted avg       1.00      0.79      0.88       100



Evaluating: foxnews


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.75      0.86       100

    accuracy                           0.75       100
   macro avg       0.50      0.38      0.43       100
weighted avg       1.00      0.75      0.86       100



Evaluating: aljazeera
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.62      0.77       100

    accuracy                           0.62       100
   macro avg       0.50      0.31      0.38       100
weighted avg       1.00      0.62      0.77       100





  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
