***Description***
<div> This notebook displays the training of different single-feature model for the main task (subjectivity classification).
<div> In the first part, I listed all the libraries, customized functions, helper functions, etc.
<div>Then, I imported data, namely, the training data ('train_finance' - or NYTAC data on the topic of finance in the years 1996 and 2005), and testing data ('test' - or NYTAC data on 6 different topics (including 'finance') in the first three months of the year 1986). This would shade lights on whether each feature could help the model generalize cross-genres and over time.
<div> The features trained on are: 3 argumentation feautures (ArgFeat3, originally designed by Alhindi et al. 2020), 6 argumentation features (ArgFeat6), ternary sentence-level sentiment (Sent-sum), compound sentence-level sentiment (Sent-com), sentence-level POS counts (POS-count), sentence-level POS sequences as padded array (POS-pad), and article-level POS sequence (POS-seq).

In [1]:
# All packages
from ast import literal_eval
from collections import Counter
import glob, os
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# keras
import keras
from keras import Input, Model
from keras import backend as K
from keras.constraints import maxnorm
from keras.models import Sequential
from keras.layers import Bidirectional, Concatenate, Embedding, Dense, Dropout, InputLayer, Reshape, SimpleRNN, GRU, BatchNormalization, TimeDistributed, Lambda, Activation, MaxPooling1D
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from keras.utils import np_utils
# NLTK
import nltk
from nltk.data import load
from nltk import word_tokenize
from nltk import StanfordTagger
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
tagdict = load('help/tagsets/upenn_tagset.pickle')
# torch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import BertTokenizerFast, BertForSequenceClassification
from tqdm import trange

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/users/rldall/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /home/users/rldall/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [2]:
# set val_set
val_ratio = 0.2
seed = 32
maxlen = 100
batch_size = 32

In [3]:
# initialize POS label encoder
le = preprocessing.LabelEncoder()
le.fit(list(tagdict.keys()))

In [4]:
# Customized keras metrics

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [5]:
# Helper functions for POS tagging task
PAD_VALUE = 99
MAX_SENT_PAD = 50
MAX_SENTS = 100
MAX_POS_PAD = 2000

# POS padding
def pad_sent(sent):
    sent_le = le.transform(sent)
    if len(sent_le) > MAX_SENT_PAD:
        sent_le = sent_le[:MAX_SENT_PAD]
    sent_pos_padded = np.pad(np.array(sent_le), (0, MAX_SENT_PAD-len(sent_le)) , 'constant', constant_values=(PAD_VALUE))
    return sent_pos_padded

def pad_article(article):
    art_pos_pad = np.empty(shape=(MAX_SENTS, MAX_SENT_PAD))
    art_pos_pad.fill(PAD_VALUE)
    for i,sent in enumerate(article):
        if i < MAX_SENTS:
            try:
                art_pos_pad[i] = pad_sent(sent,MAX_SENT_PAD)   
            except:
                pass
    return art_pos_pad

# POS count
def counter_pos(article):
    a =[]  
    for idx,sent_pos in enumerate(article):
        count_pos = Counter(sent_pos)
        a.append(dict(count_pos))
    return a
        
def pos_count_article(counter_result, pos_index):
    article_pos_count_array = np.zeros(shape=(maxlen,len(le.classes_)))
    for art_i,sent_pos_count in enumerate(counter_result):
        if art_i >= maxlen:        
            pass
        else:
            for pos_item in sent_pos_count:
                try:
                    item_idx = pos_index.index(pos_item)
                    article_pos_count_array[art_i,item_idx] = sent_pos_count.get(pos_item)
                except:
                    pass
    return article_pos_count_array

# Import data

In [6]:
def select_files(path, startwith):
    list_of_files = []
    files = os.listdir(path)
    for file in files:
        if file.startswith(startwith):
            list_of_files.append(str(path)+str(file))
            
    return list_of_files

In [7]:
# use train_finance as the train data
list_of_train_files = select_files('/data/ProcessedNYT/','train')
train_df = pd.read_csv(list_of_train_files[2], sep='\t', header=None)
print('Training data:', list_of_train_files[2])

# use 1986 data as test data
list_of_files = select_files('/data/ProcessedNYT/','test')
list_of_dfs = [pd.read_csv(file, sep='\t', header=None) for file in list_of_files]

Training data: /data/ProcessedNYT/train_finance.txt


In [7]:
# use all_finance as the train data
list_of_files = select_files('/data/ProcessedNYT/','all')
train_df = pd.read_csv(list_of_files[2], sep='\t', header=None)
print('Training data:', list_of_files[2])

# use all_* data as test data
list_of_dfs = [pd.read_csv(file, sep='\t', header=None) for file in list_of_files]

Training data: /data/ProcessedNYT/all_finance.txt


# First Model: Reproduction: ArgFeat3

In [8]:
# load and define data
labels = train_df[0].values
labels = pd.get_dummies(labels).to_numpy()
X_argfeat3 = train_df[6].apply(literal_eval)
max_features = 3

print('Loading data...')
X_train, X_test, y_train, y_test = train_test_split(X_argfeat3, labels, test_size=val_ratio, random_state=seed)

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)#, dtype="int32", truncating="pre")
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

Loading data...
2226 train sequences
557 test sequences
Pad sequences (samples x time)
x_train shape: (2226, 100)
x_test shape: (557, 100)


In [9]:
# build model
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(SimpleRNN(128, dropout=0.2)) #, recurrent_dropout=0.5
model.add(Dense(2, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy',f1_m])

model.summary()

Build model...
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         384       
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               32896     
                                                                 
 dense (Dense)               (None, 2)                 258       
                                                                 
Total params: 33,538
Trainable params: 33,538
Non-trainable params: 0
_________________________________________________________________


2023-04-13 10:54:05.249429: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-13 10:54:06.118913: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10413 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:84:00.0, compute capability: 6.1


In [10]:
# train, validate, save
print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(X_test, y_test))

model.save("ModelWeights/af3_reproduction.h5")

score, acc, f1 = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
print('Test f1 score:', f1)

Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.24883030354976654
Test accuracy: 0.8922800421714783
Test f1 score: 0.8915017247200012


In [11]:
# evaluating the model, using data from different topics/year
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    
    labels = df[0].values
    labels = pd.get_dummies(labels).to_numpy()
    X_argfeat3 = df[6].apply(literal_eval)
    X_test = sequence.pad_sequences(X_argfeat3, maxlen=maxlen)
    
    score, acc, f1 = model.evaluate(X_test, labels, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/test_military.txt
Test score: 0.1898823380470276
Test accuracy: 0.9355372190475464
Test f1 score: 0.9301140308380127
Evaluating: /data/ProcessedNYT/test_law.txt
Test score: 0.2778244912624359
Test accuracy: 0.8836065530776978
Test f1 score: 0.8885301351547241
Evaluating: /data/ProcessedNYT/test_finance.txt
Test score: 0.20292405784130096
Test accuracy: 0.9372549057006836
Test f1 score: 0.9314201474189758
Evaluating: /data/ProcessedNYT/test_education.txt
Test score: 0.1970166712999344
Test accuracy: 0.9425675868988037
Test f1 score: 0.9453365206718445
Evaluating: /data/ProcessedNYT/test_politics.txt
Test score: 0.24012404680252075
Test accuracy: 0.9090247750282288
Test f1 score: 0.9075931310653687
Evaluating: /data/ProcessedNYT/test_medicine.txt
Test score: 0.2881244719028473
Test accuracy: 0.867986798286438
Test f1 score: 0.8718429803848267


# Extended Model: ArgFeat6

In [13]:
# load and define data
labels = train_df[0].values
labels = pd.get_dummies(labels).to_numpy()
X_argfeat6 = train_df[7].apply(literal_eval)
max_features = 6

print('Loading data...')
X_train, X_test, y_train, y_test = train_test_split(X_argfeat6, labels, test_size=val_ratio, random_state=seed) ######

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)#, dtype="int32", truncating="pre")
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

Loading data...
2226 train sequences
557 test sequences
Pad sequences (samples x time)
x_train shape: (2226, 100)
x_test shape: (557, 100)


In [14]:
# build model
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(SimpleRNN(128, dropout=0.2)) #, recurrent_dropout=0.5
model.add(Dense(2, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy',f1_m])

model.summary()

Build model...
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 128)         768       
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 2)                 258       
                                                                 
Total params: 33,922
Trainable params: 33,922
Non-trainable params: 0
_________________________________________________________________


In [15]:
# train, validate, save
print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(X_test, y_test))

model.save("ModelWeights/af6.h5")

score, acc, f1 = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
print('Test f1 score:', f1)

Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.2055189460515976
Test accuracy: 0.9102333784103394
Test f1 score: 0.9147651195526123


In [17]:
# evaluating the model, using data from different topics/year
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    
    labels = df[0].values
    labels = pd.get_dummies(labels).to_numpy()
    X_argfeat6 = df[6].apply(literal_eval) 
    X_test = sequence.pad_sequences(X_argfeat6, maxlen=maxlen) 
    
    score, acc, f1 = model.evaluate(X_test, labels, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/test_military.txt
Test score: 0.4557538628578186
Test accuracy: 0.8991735577583313
Test f1 score: 0.899330735206604
Evaluating: /data/ProcessedNYT/test_law.txt
Test score: 0.9221323728561401
Test accuracy: 0.7868852615356445
Test f1 score: 0.7968749403953552
Evaluating: /data/ProcessedNYT/test_finance.txt
Test score: 0.5490012764930725
Test accuracy: 0.8745098114013672
Test f1 score: 0.8742187023162842
Evaluating: /data/ProcessedNYT/test_education.txt
Test score: 0.7831103205680847
Test accuracy: 0.8175675868988037
Test f1 score: 0.8124998807907104
Evaluating: /data/ProcessedNYT/test_politics.txt
Test score: 0.6692087054252625
Test accuracy: 0.8486171960830688
Test f1 score: 0.8485463857650757
Evaluating: /data/ProcessedNYT/test_medicine.txt
Test score: 0.7594481110572815
Test accuracy: 0.8052805066108704
Test f1 score: 0.8049999475479126


# Extended Model: Sent_sum

In [25]:
# load and define data
X_sent_sum = train_df[4].apply(literal_eval)
labels = train_df[0].values
labels = pd.get_dummies(labels).to_numpy()
max_features = 3

print('Loading data...')
X_train, X_test, y_train, y_test = train_test_split(X_sent_sum, labels, test_size=val_ratio, random_state=seed)

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)#, dtype="int32", truncating="pre")
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

Loading data...
2520 train sequences
631 test sequences
Pad sequences (samples x time)
x_train shape: (2520, 100)
x_test shape: (631, 100)


In [26]:
# build model
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(GRU(128, dropout=0.2)) #, recurrent_dropout=0.5
model.add(Dense(2, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy',f1_m])

model.summary()

Build model...
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         384       
                                                                 
 gru (GRU)                   (None, 128)               99072     
                                                                 
 dense_4 (Dense)             (None, 2)                 258       
                                                                 
Total params: 99,714
Trainable params: 99,714
Non-trainable params: 0
_________________________________________________________________


In [27]:
# train, validate, save
print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(X_test, y_test))

model.save("ModelWeights/sent-sum_GRU.h5")

score, acc, f1 = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
print('Test f1 score:', f1)

Train...
Epoch 1/5


2023-04-09 14:50:33.064704: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8200


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.421027272939682
Test accuracy: 0.8209191560745239
Test f1 score: 0.8193548917770386


In [28]:
# evaluating the model, using data from different topics/year
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    
    labels = df[0].values
    labels = pd.get_dummies(labels).to_numpy()
    X_sent_sum = df[4].apply(literal_eval)    
    X_test = sequence.pad_sequences(X_sent_sum, maxlen=maxlen)
    
    score, acc, f1 = model.evaluate(X_test, labels, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/all_medicine.txt
Test score: 0.45168182253837585
Test accuracy: 0.7637178301811218
Test f1 score: 0.7636842727661133
Evaluating: /data/ProcessedNYT/all_education.txt
Test score: 0.44678205251693726
Test accuracy: 0.7751572132110596
Test f1 score: 0.774088442325592
Evaluating: /data/ProcessedNYT/all_finance.txt
Test score: 0.3947012424468994
Test accuracy: 0.8317994475364685
Test f1 score: 0.8306416869163513
Evaluating: /data/ProcessedNYT/all_law.txt
Test score: 0.5004138946533203
Test accuracy: 0.7566889524459839
Test f1 score: 0.7573893070220947
Evaluating: /data/ProcessedNYT/all_military.txt
Test score: 0.4715084135532379
Test accuracy: 0.7913801074028015
Test f1 score: 0.7913931608200073
Evaluating: /data/ProcessedNYT/all_politics.txt
Test score: 0.4878956973552704
Test accuracy: 0.7841346859931946
Test f1 score: 0.7837513089179993


# Extended Model: Sent_compound

In [29]:
# load and define data
X_sent_com = train_df[3].apply(literal_eval)
labels = train_df[0].values
labels = pd.get_dummies(labels).to_numpy()

print('Loading data...')
X_train, X_test, y_train, y_test = train_test_split(X_sent_com, labels, test_size=val_ratio, random_state=seed) 

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, dtype = np.float32)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, dtype = np.float32)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

Loading data...
2520 train sequences
631 test sequences
Pad sequences (samples x time)
x_train shape: (2520, 100)
x_test shape: (631, 100)


In [30]:
# build model
print('Build model...')
model = Sequential()
model.add(Dense(128, activation='relu', input_shape = (100, ))) #input shape as 100
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', f1_m])

model.summary()

Build model...
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 128)               12928     
                                                                 
 dense_6 (Dense)             (None, 2)                 258       
                                                                 
Total params: 13,186
Trainable params: 13,186
Non-trainable params: 0
_________________________________________________________________


In [31]:
# train, validate, save
print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(X_test, y_test))

model.save("ModelWeights/sent-com.h5")

score, acc, f1 = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
print('Test f1 score:', f1)

Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.44856879115104675
Test accuracy: 0.8082408905029297
Test f1 score: 0.8084918260574341


In [32]:
# evaluating the model, using data from different topics/year
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    
    labels = df[0].values
    labels = pd.get_dummies(labels).to_numpy()
    X_sent_com = df[3].apply(literal_eval)
    X_sent_com = [np.array(x) for x in X_sent_com]
    
    X_test = sequence.pad_sequences(X_sent_com, maxlen=maxlen, dtype = np.float32)
    X_test = X_test.reshape(len(X_test),maxlen,1)    
    
    score, acc, f1 = model.evaluate(X_test, labels, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/all_medicine.txt
Test score: 0.512866735458374
Test accuracy: 0.7368420958518982
Test f1 score: 0.7372080087661743
Evaluating: /data/ProcessedNYT/all_education.txt
Test score: 0.5157416462898254
Test accuracy: 0.7295597195625305
Test f1 score: 0.7293749451637268
Evaluating: /data/ProcessedNYT/all_finance.txt
Test score: 0.4053676426410675
Test accuracy: 0.8235480785369873
Test f1 score: 0.8234216570854187
Evaluating: /data/ProcessedNYT/all_law.txt
Test score: 0.5563758611679077
Test accuracy: 0.7352285385131836
Test f1 score: 0.7372786998748779
Evaluating: /data/ProcessedNYT/all_military.txt
Test score: 0.49473652243614197
Test accuracy: 0.7982577085494995
Test f1 score: 0.8007245659828186
Evaluating: /data/ProcessedNYT/all_politics.txt
Test score: 0.5210676193237305
Test accuracy: 0.7772863507270813
Test f1 score: 0.778266966342926


# Extended Model: POS counts

In [41]:
# load and define data
labels = train_df[0].values
labels = pd.get_dummies(labels).to_numpy()
x_pos = train_df[5].apply(literal_eval)
max_features = len(le.classes_)

# extra-pre-processing step
X_pos = []
for x in x_pos:
    art_pos = pos_count_article(counter_pos(x_pos[0]),list(le.classes_)).reshape(-1,1)
    X_pos.append(art_pos)
X_pos = np.stack(X_pos)

print('Loading data...')
X_train, X_test, y_train, y_test = train_test_split(X_pos, labels, test_size=val_ratio, random_state=seed) 

print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

Loading data...
x_train shape: (2520, 4500, 1)
x_test shape: (631, 4500, 1)


In [42]:
# build model
print('Build model...')
model = Sequential()
model.add(Dense(128, activation='relu', input_shape = (4500, )))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', f1_m])

model.summary()

Build model...
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 128)               576128    
                                                                 
 dense_11 (Dense)            (None, 2)                 258       
                                                                 
Total params: 576,386
Trainable params: 576,386
Non-trainable params: 0
_________________________________________________________________


In [43]:
# train, validate, save
print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(X_test,y_test))

model.save("ModelWeights/pos-count.h5")

score, acc, f1 = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
print('Test f1 score:', f1)

Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.4937295913696289
Test accuracy: 0.8082408905029297
Test f1 score: 0.8084918260574341


In [44]:
# evaluating the model, using data from different topics/year
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    labels = df[0].values
    labels = pd.get_dummies(labels).to_numpy()
    X_pos = df[5].apply(literal_eval)

    X_pos_test = []
    for x in X_pos:
        art_pos = pos_count_article(counter_pos(X_pos[0]),list(le.classes_)).reshape(-1,1)
        X_pos_test.append(art_pos)
    X_pos_test = np.stack(X_pos_test)
    
    score, acc, f1 = model.evaluate(X_pos_test, labels, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/all_medicine.txt
Test score: 0.5821709632873535
Test accuracy: 0.7368420958518982
Test f1 score: 0.7372080087661743
Evaluating: /data/ProcessedNYT/all_education.txt
Test score: 0.5914919376373291
Test accuracy: 0.7295597195625305
Test f1 score: 0.7293749451637268
Evaluating: /data/ProcessedNYT/all_finance.txt
Test score: 0.46880000829696655
Test accuracy: 0.8235480785369873
Test f1 score: 0.8234216570854187
Evaluating: /data/ProcessedNYT/all_law.txt
Test score: 0.6323300004005432
Test accuracy: 0.7352285385131836
Test f1 score: 0.7372786998748779
Evaluating: /data/ProcessedNYT/all_military.txt
Test score: 0.5041810274124146
Test accuracy: 0.7982577085494995
Test f1 score: 0.8007245659828186
Evaluating: /data/ProcessedNYT/all_politics.txt
Test score: 0.5588778257369995
Test accuracy: 0.7772863507270813
Test f1 score: 0.778266966342926


# Extension: POS sequence-padded

In [37]:
# load and define data
labels = train_df[0].values
labels = pd.get_dummies(labels).to_numpy()
x_pos = train_df[5].apply(literal_eval)
max_features = len(le.classes_)

# extra-pre-processing step
X_pos = []
for x in x_pos:
    art_pos = pad_article(x).reshape(-1,1)
    X_pos.append(art_pos)
X_pos = np.stack(X_pos)

print('Loading data...')
X_train, X_test, y_train, y_test = train_test_split(X_pos, labels, test_size=val_ratio, random_state=seed) 

print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

Loading data...
x_train shape: (2520, 5000, 1)
x_test shape: (631, 5000, 1)


In [38]:
# build model
print('Build model...')
model = Sequential()
#model.add(Dense(128, activation='relu', input_shape = (5000, )))
model.add(Embedding(45, 128, input_length=5000))
#model.add(Embedding(max_features, 128))
model.add(SimpleRNN(128, dropout=0.2)) #, recurrent_dropout=0.5
model.add(Dense(2, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', f1_m])

model.summary()

Build model...
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 5000, 128)         5760      
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               32896     
                                                                 
 dense_9 (Dense)             (None, 2)                 258       
                                                                 
Total params: 38,914
Trainable params: 38,914
Non-trainable params: 0
_________________________________________________________________


In [39]:
# train, validate, save
print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(X_test,y_test))

model.save("ModelWeights/pos-padded.h5")

score, acc, f1 = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
print('Test f1 score:', f1)

Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.4889063537120819
Test accuracy: 0.8082408905029297
Test f1 score: 0.8084918260574341


In [40]:
# evaluating the model, using data from different topics/year
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    
    labels = df[0].values
    labels = pd.get_dummies(labels).to_numpy()
    X_pos = df[5].apply(literal_eval)

    X_pos_test = []

    for x in X_pos:
        art_pos = pad_article(x).reshape(-1,1)
        X_pos_test.append(art_pos)
    X_pos_test = np.stack(X_pos_test)   
    
    score, acc, f1 = model.evaluate(X_pos_test, labels, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/all_medicine.txt
Test score: 0.5898562073707581
Test accuracy: 0.7368420958518982
Test f1 score: 0.7372080087661743
Evaluating: /data/ProcessedNYT/all_education.txt
Test score: 0.6001524925231934
Test accuracy: 0.7295597195625305
Test f1 score: 0.7293749451637268
Evaluating: /data/ProcessedNYT/all_finance.txt
Test score: 0.46726369857788086
Test accuracy: 0.8235480785369873
Test f1 score: 0.8234216570854187
Evaluating: /data/ProcessedNYT/all_law.txt
Test score: 0.5921376347541809
Test accuracy: 0.7352285385131836
Test f1 score: 0.7372786998748779
Evaluating: /data/ProcessedNYT/all_military.txt
Test score: 0.5030216574668884
Test accuracy: 0.7982577085494995
Test f1 score: 0.8007245659828186
Evaluating: /data/ProcessedNYT/all_politics.txt
Test score: 0.532672643661499
Test accuracy: 0.7772863507270813
Test f1 score: 0.778266966342926


# Extension: POS sequence

In [8]:
# load and define data
labels = train_df[0]
x_pos = train_df[5].apply(literal_eval)
max_features = len(le.classes_)

# extra pre-processing step
N = 2000
error = []
x_pos_list = []
for i,x in enumerate(x_pos):
    flatten = [item for sublist in x for item in sublist]
    try:
        flatten = le.transform(flatten)
        if len(flatten) < N:
            x = np.concatenate([flatten,np.zeros(N - len(flatten))])
        else:
            x = flatten[:N]
        x = np.array(x).reshape(-1,1)
        x_pos_list.append(x) 
    except:
        error.append(i)
labels = labels.drop(labels.index[error])
X_pos = np.stack(x_pos_list) 
labels = pd.get_dummies(labels).to_numpy()

print('Loading data...')
X_train, X_test, y_train, y_test = train_test_split(X_pos, labels, test_size=val_ratio, random_state=seed) 

print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

Loading data...
x_train shape: (2508, 2000, 1)
x_test shape: (628, 2000, 1)


In [9]:
# build model
print('Build model...')
model = Sequential()
#model.add(Dense(128, activation='relu', input_shape = (2000, )))
model.add(Embedding(max_features, 128))
model.add(SimpleRNN(128, dropout=0.2)) #, recurrent_dropout=0.5
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', f1_m])

model.summary()

Build model...
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         5760      
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               32896     
                                                                 
 dense (Dense)               (None, 2)                 258       
                                                                 
Total params: 38,914
Trainable params: 38,914
Non-trainable params: 0
_________________________________________________________________


2023-04-09 15:44:52.361579: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-09 15:44:53.139896: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10413 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:84:00.0, compute capability: 6.1


In [10]:
# train, validate, save
print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(X_test,y_test))

model.save("ModelWeights/pos-seq.h5")

score, acc, f1 = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
print('Test f1 score:', f1)

Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.4864695370197296
Test accuracy: 0.8121019005775452
Test f1 score: 0.8128124475479126


In [11]:
# evaluating the model, using data from different topics/year
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])

    labels = df[0]
    x_pos = df[5].apply(literal_eval)    
    x_pos_list = [] 
    error = []

    for i,x in enumerate(x_pos):
        flatten = [item for sublist in x for item in sublist]
        try:
            flatten = le.transform(flatten)
            if len(flatten) < N:
                x = np.concatenate([flatten,np.zeros(N - len(flatten))])
            else:
                x = flatten[:N]
            x = np.array(x).reshape(-1,1)
            x_pos_list.append(x) 
        except:
            error.append(i)

    labels = labels.drop(labels.index[error])
    X_pos_test = np.stack(x_pos_list)       
    labels = pd.get_dummies(labels).to_numpy()

    score, acc, f1 = model.evaluate(X_pos_test, labels, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/all_medicine.txt
Test score: 0.5808092951774597
Test accuracy: 0.7361032962799072
Test f1 score: 0.7368462681770325
Evaluating: /data/ProcessedNYT/all_education.txt
Test score: 0.5890154838562012
Test accuracy: 0.7289915680885315
Test f1 score: 0.7286457419395447
Evaluating: /data/ProcessedNYT/all_finance.txt
Test score: 0.47263115644454956
Test accuracy: 0.8227040767669678
Test f1 score: 0.822704017162323
Evaluating: /data/ProcessedNYT/all_law.txt
Test score: 0.5840604305267334
Test accuracy: 0.7336697578430176
Test f1 score: 0.7343005537986755
Evaluating: /data/ProcessedNYT/all_military.txt
Test score: 0.5059718489646912
Test accuracy: 0.796570897102356
Test f1 score: 0.79825359582901
Evaluating: /data/ProcessedNYT/all_politics.txt
Test score: 0.5315024256706238
Test accuracy: 0.7762969136238098
Test f1 score: 0.7772545218467712
