***Description***

<div> In this notebook, I test one of the models (`af6_sent_pos`) on the Webis-editorial-16 corpus. The corpus itself only 1 (aka editorial) label; however, this is for the purpose of testing the model on different dataset, other than NYTAC. The model makes predictions based on 3 inputs: fine-grained 6 argumentation features, sentence-level sentiment, and sentence-level POS tags counts. Various helper functions are installed and the model is imported. The result is printed on the last cell.

In [1]:
# All packages
import nltk
import numpy as np
import glob, os
import pandas as pd
import torch

from collections import Counter
from glob import glob
from nltk import word_tokenize, StanfordTagger
from nltk.data import load
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn import preprocessing
from transformers import BertTokenizerFast, BertForSequenceClassification

# NLTK
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('tagsets')

# import VADER
sid = SentimentIntensityAnalyzer()

# import POS tag_dict and label encoder
tagdict = load('help/tagsets/upenn_tagset.pickle')
le = preprocessing.LabelEncoder()
le.fit(list(tagdict.keys()))

# keras
import keras
from keras import Input, Model
from keras import backend as K
from keras.constraints import maxnorm
from keras.models import Sequential
from keras.layers import Bidirectional, Concatenate, Embedding, Dense, Dropout, InputLayer, Reshape, SimpleRNN, BatchNormalization, TimeDistributed, Lambda, Activation, MaxPooling1D
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from keras.utils import np_utils

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/users/rldall/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/users/rldall/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/users/rldall/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /home/users/rldall/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


# Test on Webis-editorial-16 corpus

Note: All articles in the Webis are editorials, but this is for the purpose of testing on different publishers - to see how our ML models generalize outside the NYT corpus, which they were trained on (Finance, Years 1996 & 2005).

#### Import data

In [2]:
path = '/data/ArgFeatModel/corpus-webis-editorials-16/annotated-txt/split-by-portal-final'
publist = os.listdir(path)

In [3]:
def extract_df(filepath):
    main_df = pd.DataFrame(columns=['unit'])

#    for filename in glob.glob(os.path.join(path, '*.txt')): ###
    with open(os.path.join(os.getcwd(), filepath), 'r') as f: 
        lines = f.readlines()
            #lines.remove('-1\tpar-sep\t\n') ###
        this_lines_df = pd.DataFrame(lines, columns=['unit'])
        main_df = pd.concat([main_df,this_lines_df]) ### ###
        
    main_df = main_df['unit'].str.split('\t',expand=True)
    main_df = main_df[[2]].replace('\n','', regex=True)

    return ' '.join(main_df[2])

In [4]:
# read all publishers into df and keep in list

pub_df_list = []
for pub in publist:
    pub_text = []
    for file in glob(os.path.join(path+'/'+pub, '*.txt')):
        text = ''
        text = extract_df(file)
        pub_text.append(text)
    pub_df = pd.DataFrame({0:1,1:pub_text})
    pub_df_list.append(pub_df)

In [5]:
# Helper function to tokenzie text
def tokenize_input(df):
    # import data
    text_series = df[1]
    text_token = []
    # tokenize sentences
    for t in text_series:
        sent_token = sent_tokenize(t)
        text_token.append(sent_token)
    # new column
    df[2] = text_token
    return df

# Helper function for padding
def padding_X(X):    
    return sequence.pad_sequences(X, maxlen=100)

# Helper functions to extract VADER sentiment
def format_sent(compound_score):
    polarity = 0
    if(compound_score>= 0.05):
        polarity = 1
    elif(compound_score<= -0.05):
        polarity = -1
    return polarity

def get_scores(text):
    scores = sid.polarity_scores(text)
    return np.array([scores.get(s) for s in scores])

def get_sentiment(df,series_col,df_idx):
    series = df[series_col]
    error_list = []
    compound_list = []
    sum_list = []
    
#    for article in series:
    for idx in range(len(series)):
        article = series.iloc[idx]       
        try:
            scores = [get_scores(text) for text in article]
            compound_list.append([s[-1] for s in scores])        
            sum_list.append([format_sent(s[-1]) for s in scores])
            
        except:
            print('Error line:',idx)
            error_list.append(idx)

    # new column
    df['sent_compound'] = compound_list
    df['sent_sum'] = sum_list
    
    df = df.drop(error_list)
#    df.to_csv(list_of_files[df_idx],sep='\t',header=False,index=False)
#    print('Saved\t', list_of_files[df_idx].split('/')[-1])
    
    return df

# Helper functions to extract POS tags
def predict_pos(text):
    text_tok = nltk.word_tokenize(text)
    return [word_class for word, word_class in nltk.pos_tag(text_tok)]

def get_pos(df,series_col,df_idx):
    series = df[series_col]
    error_list = []
    pos_list = []
    for idx in range(len(series)):
        article = series.iloc[idx]
        try:
            article_pos = [predict_pos(sent) for sent in article]
            pos_list.append(article_pos)
        except:
            print('Error line:',idx)
            error_list.append(idx)
    # new column
    df['pos'] = pos_list
    df = df.drop(error_list)
#    df.to_csv(list_of_files[df_idx],sep='\t',header=False,index=False)
#    print('Saved\t', list_of_files[df_idx].split('/')[-1])
    return df

# Helper function for POS tagger
# POS count
def counter_pos(article):
    a =[]  
    for idx,sent_pos in enumerate(article):
        count_pos = Counter(sent_pos)
        a.append(dict(count_pos))
    return a
        
def pos_count_article(counter_result, pos_index):
    article_pos_count_array = np.zeros(shape=(MAXLEN,len(le.classes_)))
    for art_i,sent_pos_count in enumerate(counter_result):
        if art_i >= MAXLEN:        
            pass
        else:
            for pos_item in sent_pos_count:
                try:
                    item_idx = pos_index.index(pos_item)
                    article_pos_count_array[art_i,item_idx] = sent_pos_count.get(pos_item)
                except:
                    pass
    return article_pos_count_array

# Helper function for padding
def padding_X(X):    
    return sequence.pad_sequences(X, maxlen=100)

# Helper functions to extract argfeat prediction
def load_model(name):
    # name in form of numlabel_epochs
    for f in os.listdir('/data/ArgFeatModel/ModelWeights/'):
        if f.startswith('saved_weights_'+name):
            model_path = ('/data/ArgFeatModel/ModelWeights/'+f)
    loaded_model =  BertForSequenceClassification.from_pretrained('bert-base-cased',num_labels = int(name[0]))
    loaded_model.load_state_dict(torch.load(model_path))
    loaded_model.eval()
    loaded_model.to(device)
    return loaded_model

# sent preprocessing
def get_sent_argfeat(sent,tokenizer,model):
    # token IDs and attention mask for inference on the new sentence
    test_ids = []
    test_attention_mask = []
    # apply the tokenizer
    encoding = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = 256,
                        padding = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )
    # extract IDs and attention mask
    test_ids.append(encoding['input_ids'])
    test_attention_mask.append(encoding['attention_mask'])
    test_ids = torch.cat(test_ids, dim = 0)
    test_attention_mask = torch.cat(test_attention_mask, dim = 0)
    with torch.no_grad():
        output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))
    # get prediction
    pred = np.argmax(output.logits.cpu().numpy()).flatten().item()
    return pred

def get_argfeat(df,series_col,model, max_sent, df_idx):
    print ('Extracting from', list_of_files[df_idx])
    series = df[series_col]
    error_list = []
    count_long = 0
    pred_list = []
    error_list = []
    for idx in range(len(series)):
        article = series.iloc[idx]
        if len(article) > max_sent:
            try:
                pred_text = [get_sent_argfeat(sent,tokenizer,model)+1 for sent in article[:max_sent]]
                count_long += 1
            except:
                print('Error line:',idx)
                error_list.append(idx)
        else:
            try:
                pred_text = [get_sent_argfeat(sent,tokenizer,model)+1 for sent in article]# + [0] * (N-len(sent_token))
            except:
                print('Error line:',idx)
                error_list.append(idx)
        pred_list.append(pred_text)
    print('long articles:',count_long,'from',len(df))
    print('percent of long articles:', count_long/(count_long+len(df)))
    flat_list = [item for sublist in pred_list for item in sublist]
    num_label = max(flat_list)
    # new column
    df[str('argfeat'+str(num_label))] = pred_list
    df = df.drop(error_list)
#    df.to_csv(list_of_files[df_idx],sep='\t',header=False,index=False)
#    print('Saved\t', list_of_files[df_idx].split('/')[-1])
    return df

# Helper function to transform test data
def process_test_df(df,af3=False,af6=False,sent=False,pos=False):
    out = []

    # argfeat   
    if af3:
        x_argfeat3 = df.iloc[:, 6]
        X_argfeat3 = padding_X(x_argfeat3)
        out.append(X_argfeat3)
    
    if af6:
        x_argfeat6 = df.iloc[:, 7]
        X_argfeat6 = padding_X(x_argfeat6)
        out.append(X_argfeat6)

    # sent_sum
    if sent:
        x_sent = df.iloc[:, 4]
        X_sent = padding_X(x_sent)
        out.append(X_sent)

    # pos count
    if pos:        
        x_pos = df.iloc[:, 5]
        x_pos_list = [] 
        for x in x_pos: 
            art_pos = pos_count_article(counter_pos(x_pos[0]),list(le.classes_)).reshape(-1,1)
            x_pos_list.append(art_pos) 
        X_pos = np.stack(x_pos_list) 
        X_pos = X_pos.reshape(X_pos.shape[0],X_pos.shape[1]) 
        out.append(X_pos)
        
    return out

In [6]:
MAXLEN= 100
PAD_VALUE = 80
MAX_SENT_PAD = 50
MAX_SENTS = MAXLEN
MAX_POS_PAD = 2000
list_of_files = publist

# transform files into dataframes
list_of_dataframes = [tokenize_input(file) for file in pub_df_list]

# get sentiment columns
print('loading sent...')
list_of_sent_dfs = [get_sentiment(df,2,df_idx) for df_idx,df in enumerate(list_of_dataframes)]

# get pos columns
print('loading pos...')
list_of_pos_dfs = [get_pos(df,2,df_idx) for df_idx,df in enumerate(list_of_dataframes)]

# specify GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# import arg-feat model
print('loading model...')
model3 = load_model("3_5")
model6 = load_model("6_3")

# get argfeat columns
print('loading argfeat3...')
list_of_argfeat3_dfs = [get_argfeat(df,2, model3, MAX_SENTS, df_idx) for df_idx,df in enumerate(list_of_dataframes)]
print('loading argfeat6...')
list_of_argfeat6_dfs = [get_argfeat(df,2, model6, MAX_SENTS, df_idx) for df_idx,df in enumerate(list_of_dataframes)]

loading sent...
loading pos...
loading model...


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

loading argfeat3...
Extracting from guardian




long articles: 1 from 100
percent of long articles: 0.009900990099009901
Extracting from foxnews




long articles: 1 from 100
percent of long articles: 0.009900990099009901
Extracting from aljazeera




long articles: 0 from 100
percent of long articles: 0.0
loading argfeat6...
Extracting from guardian




long articles: 1 from 100
percent of long articles: 0.009900990099009901
Extracting from foxnews




long articles: 1 from 100
percent of long articles: 0.009900990099009901
Extracting from aljazeera




long articles: 0 from 100
percent of long articles: 0.0


### Load model

In [8]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [9]:
loaded_model3 = keras.models.load_model("ModelWeights/af3_sent_pos.h5", custom_objects={'f1_m':f1_m}, compile=False)
loaded_model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m])
loaded_model6 = keras.models.load_model("ModelWeights/af6_sent_pos.h5", custom_objects={'f1_m':f1_m}, compile=False)
loaded_model6.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m])

2023-04-13 15:04:14.147307: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-13 15:04:14.153333: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8819 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:84:00.0, compute capability: 6.1


In [10]:
# cross-publishers
batch_size=32

for idx,df in enumerate(pub_df_list):
    
    print('Evaluating:', publist[idx])
    y_test = np.array([[0,1]]*100)
    X_af3_test, X_sent_test, X_pos_test = process_test_df(df, af3=True, sent=True, pos=True)
    
    score, acc, f1 = loaded_model3.evaluate([X_sent_test, X_pos_test, X_af3_test], y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: guardian
Test score: 0.7770951986312866
Test accuracy: 0.6800000071525574
Test f1 score: 0.7499999403953552
Evaluating: foxnews
Test score: 1.2477526664733887
Test accuracy: 0.5299999713897705
Test f1 score: 0.5156784057617188
Evaluating: aljazeera
Test score: 0.8426142930984497
Test accuracy: 0.6200000047683716
Test f1 score: 0.6015151143074036


In [11]:
# cross-publishers
batch_size=32

for idx,df in enumerate(pub_df_list):
    
    print('Evaluating:', publist[idx])
    y_test = np.array([[0,1]]*100)
    X_af6_test, X_sent_test, X_pos_test = process_test_df(df, af6=True, sent=True, pos=True)
    
    score, acc, f1 = loaded_model6.evaluate([X_sent_test, X_pos_test, X_af6_test], y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: guardian
Test score: 0.7748724222183228
Test accuracy: 0.7099999785423279
Test f1 score: 0.7684771418571472
Evaluating: foxnews
Test score: 1.103096842765808
Test accuracy: 0.6499999761581421
Test f1 score: 0.5588811635971069
Evaluating: aljazeera
Test score: 0.9227482080459595
Test accuracy: 0.6200000047683716
Test f1 score: 0.599759578704834
