***Description***

<div> This notebook displays the training of different combined-feature model for the main task (subjectivity classification).
<div> In the first part, I listed all the libraries, customized functions, helper functions, etc.
<div>Then, I imported data, namely, the training data ('train_finance' - or NYTAC data on the topic of finance in the years 1996 and 2005), and testing data ('test' - or NYTAC data on 6 different topics (including 'finance') in the first three months of the year 1986). This would shade lights on whether each feature could help the model generalize cross-genres and over time.
<div> The features trained on are: 3 argumentation feautures (ArgFeat3, originally designed by Alhindi et al. 2020), 6 argumentation features (ArgFeat6), ternary sentence-level sentiment (SentSum), sentence-level POS counts (POSCount). The combination of the features are: ArgFeat3+SentSum, ArgFeat3+POSCount, ArgFeat6+SentSum, ArgFeat6+POSCount, SentSum+ POSCount, ArgFeat3+SentSum+POSCount, and ArgFeat6+SentSum+POSCount.

https://pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/

In [2]:
# All packages
from ast import literal_eval
from collections import Counter
import glob, os
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# keras
import keras
from keras import Input, Model
from keras import backend as K
from keras.constraints import maxnorm
from keras.models import Sequential
from keras.layers import Bidirectional, Concatenate, Embedding, Dense, Dropout, InputLayer, Reshape, SimpleRNN, BatchNormalization, TimeDistributed, Lambda, Activation, MaxPooling1D
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from keras.utils import np_utils
# NLTK
import nltk
from nltk.data import load
from nltk import word_tokenize
from nltk import StanfordTagger
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
tagdict = load('help/tagsets/upenn_tagset.pickle')
# torch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import BertTokenizerFast, BertForSequenceClassification
from tqdm import trange

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/users/rldall/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /home/users/rldall/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [3]:
# set val_set
val_ratio = 0.2
seed = 32
maxlen = 100
batch_size = 32

In [4]:
# initialize POS label encoder
le = preprocessing.LabelEncoder()
le.fit(list(tagdict.keys()))

In [5]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [6]:
# Helper functions for POS tagging task
# POS count
def counter_pos(article):
    a =[]  
    for idx,sent_pos in enumerate(article):
        count_pos = Counter(sent_pos)
        a.append(dict(count_pos))
    return a
        
def pos_count_article(counter_result, pos_index):
    article_pos_count_array = np.zeros(shape=(maxlen,len(le.classes_)))
    for art_i,sent_pos_count in enumerate(counter_result):
        if art_i >= maxlen:        
            pass
        else:
            for pos_item in sent_pos_count:
                try:
                    item_idx = pos_index.index(pos_item)
                    article_pos_count_array[art_i,item_idx] = sent_pos_count.get(pos_item)
                except:
                    pass
    return article_pos_count_array

# Helper function for padding
def padding_X(X):    
    return sequence.pad_sequences(X, maxlen=100)

In [7]:
# Helper function to transform test data
def process_test_df(df,af3=False,af6=False,sent=False,pos=False):
    
    out = []

    # labels
    labels = df[label_col].values
    labels = pd.get_dummies(labels).to_numpy()
    out.append(labels)

    # argfeat
    if af3:
        x_argfeat3 = df[argfeat3_col].apply(literal_eval)
        X_argfeat3 = padding_X(x_argfeat3)
        out.append(X_argfeat3)
    
    if af6:
        x_argfeat6 = df[argfeat6_col].apply(literal_eval)
        X_argfeat6 = padding_X(x_argfeat6)
        out.append(X_argfeat6)

    # sent_sum
    if sent:
        x_sent = df[sentsum_col].apply(literal_eval)
        X_sent = padding_X(x_sent)
        out.append(X_sent)

    # pos count
    if pos:        
        x_pos = df[pos_col].apply(literal_eval)
        x_pos_list = [] 
        for x in x_pos: 
            art_pos = pos_count_article(counter_pos(x_pos[0]),list(le.classes_)).reshape(-1,1)
            x_pos_list.append(art_pos) 
        X_pos = np.stack(x_pos_list) 
        X_pos = X_pos.reshape(X_pos.shape[0],X_pos.shape[1]) 
        out.append(X_pos)
        
    return out

# Import data

In [7]:
def select_files(path, startwith):
    list_of_files = []
    files = os.listdir(path)
    for file in files:
        if file.startswith(startwith):
            list_of_files.append(str(path)+str(file))
    return list_of_files

In [8]:
# use train_finance as the train data
list_of_train_files = select_files('/data/ProcessedNYT/','train')
train_df = pd.read_csv(list_of_train_files[2], sep='\t', header=None)
print ('Training data:', list_of_train_files[2])

# use 1986 data as test data
list_of_files = select_files('/data/ProcessedNYT/','test')
list_of_dfs = [pd.read_csv(file, sep='\t', header=None) for file in list_of_files]

Training data: /data/ProcessedNYT/train_finance.txt


In [8]:
# use all_finance as the train data
list_of_files = select_files('/data/ProcessedNYT/','all')
train_df = pd.read_csv(list_of_files[2], sep='\t', header=None)
print ('Training data:', list_of_files[2])

# use 1986 data as test data
list_of_dfs = [pd.read_csv(file, sep='\t', header=None) for file in list_of_files]

Training data: /data/ProcessedNYT/all_finance.txt


# Transform training data

In [9]:
# column definition
label_col = 0
argfeat3_col = 6
argfeat6_col = 7
sentsum_col = 4
pos_col = 5

# labels
labels = train_df[label_col].values
labels = pd.get_dummies(labels).to_numpy()

# argfeat
x_argfeat3 = train_df[argfeat3_col].apply(literal_eval)
X_argfeat3 = padding_X(x_argfeat3)
x_argfeat6 = train_df[argfeat6_col].apply(literal_eval)
X_argfeat6 = padding_X(x_argfeat6)

# sent_sum
x_sent = train_df[sentsum_col].apply(literal_eval)
X_sent = padding_X(x_sent)

# raw_pos
x_pos = train_df[pos_col].apply(literal_eval)
x_pos_list = []
for x in x_pos: 
    art_pos = pos_count_article(counter_pos(x_pos[0]),list(le.classes_)).reshape(-1,1)
    x_pos_list.append(art_pos) 
X_pos = np.stack(x_pos_list) 
X_pos = X_pos.reshape(X_pos.shape[0],X_pos.shape[1]) 

# split data
y_train, y_val = train_test_split(labels, test_size=val_ratio, random_state=seed)
X_af3_train, X_af3_val = train_test_split(X_argfeat3, test_size=val_ratio, random_state=seed)
X_af6_train, X_af6_val = train_test_split(X_argfeat6, test_size=val_ratio, random_state=seed)
X_sent_train, X_sent_val = train_test_split(X_sent, test_size=val_ratio, random_state=seed)
X_pos_train, X_pos_val = train_test_split(X_pos, test_size=val_ratio, random_state=seed)

# Model Combination

## Combined model: argfeat3 + sent_sum

In [10]:
input_sent = Input(shape=(X_sent.shape[1],))
model_sent = Embedding(3, 128)(input_sent)
model_sent = SimpleRNN(128, dropout=0.2)(model_sent)

input_af3 = Input(shape=(X_argfeat3.shape[1],))
model_af3 = Embedding(3, 128)(input_af3)
model_af3 = SimpleRNN(128, dropout=0.2)(model_af3)

merged = Concatenate()([model_sent, model_af3])
dense_pred = (Dense(2, activation='sigmoid'))(merged)

model = Model(inputs=[input_sent, input_af3], outputs=dense_pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m])
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 100, 128)     384         ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 100, 128)     384         ['input_2[0][0]']                
                                                                                              

2023-04-09 19:51:02.571818: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-09 19:51:03.331216: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10413 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:84:00.0, compute capability: 6.1


In [11]:
# training
model.fit([X_sent_train,X_af3_train],np.array(y_train),
          batch_size=batch_size,
          epochs=5,
          validation_data=([X_sent_val,X_af3_val],np.array(y_val)))

model.save("af3_sent.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    y_test, X_af3_test, X_sent_test = process_test_df(df, af3=True, sent=True)
    
    score, acc, f1 = model.evaluate([X_sent_test,X_af3_test], y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/test_military.txt
Test score: 0.17975829541683197
Test accuracy: 0.9454545378684998
Test f1 score: 0.9466598629951477
Evaluating: /data/ProcessedNYT/test_law.txt
Test score: 0.2846507728099823
Test accuracy: 0.8737704753875732
Test f1 score: 0.8824936151504517
Evaluating: /data/ProcessedNYT/test_finance.txt
Test score: 0.1883891522884369
Test accuracy: 0.9235293865203857
Test f1 score: 0.9247795343399048
Evaluating: /data/ProcessedNYT/test_education.txt
Test score: 0.23138388991355896
Test accuracy: 0.9087837934494019
Test f1 score: 0.9047595858573914
Evaluating: /data/ProcessedNYT/test_politics.txt
Test score: 0.21768297255039215
Test accuracy: 0.9148471355438232
Test f1 score: 0.9145348072052002
Evaluating: /data/ProcessedNYT/test_medicine.txt
Test score: 0.3025854825973511
Test accuracy: 0.8646864891052246
Test f1 score: 0.866161048412323


## Combined model: argfeat6 + sent_sum

In [15]:
input_sent = Input(shape=(X_sent.shape[1],))
model_sent = Embedding(3, 128)(input_sent)
model_sent = SimpleRNN(128, dropout=0.2)(model_sent)

input_af6 = Input(shape=(X_argfeat6.shape[1],))
model_af6 = Embedding(6, 128)(input_af6)
model_af6 = SimpleRNN(128, dropout=0.2)(model_af6)

merged = Concatenate()([model_sent, model_af6])
dense_pred = (Dense(2, activation='sigmoid'))(merged)

model = Model(inputs=[input_sent, input_af6], outputs=dense_pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m])
print(model.summary())

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 100, 128)     384         ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 100, 128)     768         ['input_4[0][0]']                
                                                                                            

In [16]:
# training
model.fit([X_sent_train,X_af6_train],np.array(y_train),
          batch_size=batch_size,
          epochs=5,
          validation_data=([X_sent_val,X_af6_val],np.array(y_val)))

model.save("af6_sent.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    y_test, X_af6_test, X_sent_test = process_test_df(df, af6=True, sent=True)
    
    score, acc, f1 = model.evaluate([X_sent_test,X_af6_test], y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/test_military.txt
Test score: 0.15713836252689362
Test accuracy: 0.9504132270812988
Test f1 score: 0.9480684995651245
Evaluating: /data/ProcessedNYT/test_law.txt
Test score: 0.2590218186378479
Test accuracy: 0.8836065530776978
Test f1 score: 0.8857123255729675
Evaluating: /data/ProcessedNYT/test_finance.txt
Test score: 0.17684735357761383
Test accuracy: 0.9372549057006836
Test f1 score: 0.938399076461792
Evaluating: /data/ProcessedNYT/test_education.txt
Test score: 0.17644309997558594
Test accuracy: 0.9391891956329346
Test f1 score: 0.9437500238418579
Evaluating: /data/ProcessedNYT/test_politics.txt
Test score: 0.1996869295835495
Test accuracy: 0.9206695556640625
Test f1 score: 0.918850302696228
Evaluating: /data/ProcessedNYT/test_medicine.txt
Test score: 0.24100108444690704
Test accuracy: 0.9009901285171509
Test f1 score: 0.9044642448425293


## Combined model: argfeat3 + pos_count

In [18]:
input_pos = Input(shape=(X_pos.shape[1],))
model_pos = Embedding(len(le.classes_), 128)(input_pos)
model_pos = SimpleRNN(128, dropout=0.2)(model_pos)

input_af3 = Input(shape=(X_argfeat3.shape[1],))
model_af3 = Embedding(3, 128)(input_af3)
model_af3 = SimpleRNN(128, dropout=0.2)(model_af3)

merged = Concatenate()([model_pos, model_af3])
dense_pred = (Dense(2, activation='sigmoid'))(merged)

model = Model(inputs=[input_pos, input_af3], outputs=dense_pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m])
print(model.summary())

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 4500)]       0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 4500, 128)    5760        ['input_5[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, 100, 128)     384         ['input_6[0][0]']                
                                                                                            

In [19]:
# training
model.fit([X_pos_train,X_af3_train],np.array(y_train),
          batch_size=batch_size,
          epochs=5,
          validation_data=([X_pos_val,X_af3_val],np.array(y_val)))

model.save("af3_pos.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    y_test, X_af3_test, X_pos_test = process_test_df(df, af3=True, pos=True)
    
    score, acc, f1 = model.evaluate([X_pos_test,X_af3_test], y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/test_military.txt
Test score: 0.16792236268520355
Test accuracy: 0.9537190198898315
Test f1 score: 0.9531376361846924
Evaluating: /data/ProcessedNYT/test_law.txt
Test score: 0.2733813524246216
Test accuracy: 0.8868852257728577
Test f1 score: 0.8925479650497437
Evaluating: /data/ProcessedNYT/test_finance.txt
Test score: 0.18869280815124512
Test accuracy: 0.9254902005195618
Test f1 score: 0.9277416467666626
Evaluating: /data/ProcessedNYT/test_education.txt
Test score: 0.2323516607284546
Test accuracy: 0.8952702879905701
Test f1 score: 0.8899038434028625
Evaluating: /data/ProcessedNYT/test_politics.txt
Test score: 0.20867164433002472
Test accuracy: 0.9264919757843018
Test f1 score: 0.9248571395874023
Evaluating: /data/ProcessedNYT/test_medicine.txt
Test score: 0.30896636843681335
Test accuracy: 0.8580858111381531
Test f1 score: 0.8657852411270142


## Combined model: argfeat6 + pos_count

In [21]:
input_pos = Input(shape=(X_pos.shape[1],))
model_pos = Embedding(len(le.classes_), 128)(input_pos)
model_pos = SimpleRNN(128, dropout=0.2)(model_pos)

input_af6 = Input(shape=(X_argfeat6.shape[1],))
model_af6 = Embedding(6, 128)(input_af6)
model_af6 = SimpleRNN(128, dropout=0.2)(model_af6)

merged = Concatenate()([model_pos, model_af6])
dense_pred = (Dense(2, activation='sigmoid'))(merged)

model = Model(inputs=[input_pos, input_af6], outputs=dense_pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m])
print(model.summary())

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 4500)]       0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 embedding_6 (Embedding)        (None, 4500, 128)    5760        ['input_7[0][0]']                
                                                                                                  
 embedding_7 (Embedding)        (None, 100, 128)     768         ['input_8[0][0]']                
                                                                                            

In [22]:
# training
model.fit([X_pos_train,X_af6_train],np.array(y_train),
          batch_size=batch_size,
          epochs=5,
          validation_data=([X_pos_val,X_af6_val],np.array(y_val)))

model.save("af6_pos.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    y_test, X_af6_test, X_pos_test = process_test_df(df, af6=True, pos=True)
    
    score, acc, f1 = model.evaluate([X_pos_test,X_af6_test], y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/test_military.txt
Test score: 0.15697097778320312
Test accuracy: 0.9487603306770325
Test f1 score: 0.9475938081741333
Evaluating: /data/ProcessedNYT/test_law.txt
Test score: 0.3073975741863251
Test accuracy: 0.8524590134620667
Test f1 score: 0.8598685264587402
Evaluating: /data/ProcessedNYT/test_finance.txt
Test score: 0.22637878358364105
Test accuracy: 0.8921568393707275
Test f1 score: 0.8948733806610107
Evaluating: /data/ProcessedNYT/test_education.txt
Test score: 0.270818293094635
Test accuracy: 0.8783783912658691
Test f1 score: 0.8763877153396606
Evaluating: /data/ProcessedNYT/test_politics.txt
Test score: 0.21589882671833038
Test accuracy: 0.9090247750282288
Test f1 score: 0.9099516868591309
Evaluating: /data/ProcessedNYT/test_medicine.txt
Test score: 0.32931947708129883
Test accuracy: 0.8580858111381531
Test f1 score: 0.8696168065071106


## Combined model: sent_sum + pos_count

In [24]:
input_sent = Input(shape=(X_sent.shape[1],))
model_sent = Embedding(3, 128)(input_sent)
model_sent = SimpleRNN(128, dropout=0.2)(model_sent)

input_pos = Input(shape=(X_pos.shape[1],))
model_pos = Embedding(len(le.classes_), 128)(input_pos)
model_pos = SimpleRNN(128, dropout=0.2)(model_pos)

merged = Concatenate()([model_sent, model_pos])
dense_pred = (Dense(2, activation='sigmoid'))(merged)

model = Model(inputs=[input_sent, input_pos], outputs=dense_pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m])
print(model.summary())

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 input_10 (InputLayer)          [(None, 4500)]       0           []                               
                                                                                                  
 embedding_8 (Embedding)        (None, 100, 128)     384         ['input_9[0][0]']                
                                                                                                  
 embedding_9 (Embedding)        (None, 4500, 128)    5760        ['input_10[0][0]']               
                                                                                            

In [25]:
# training
model.fit([X_sent_train, X_pos_train],np.array(y_train),
          batch_size=batch_size,
          epochs=5,
          validation_data=([X_sent_val, X_pos_val],np.array(y_val)))

model.save("sent_pos.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    y_test, X_sent_test, X_pos_test = process_test_df(df, sent = True, pos=True)
    
    score, acc, f1 = model.evaluate([X_sent_test, X_pos_test], y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/test_military.txt
Test score: 0.33232590556144714
Test accuracy: 0.8991735577583313
Test f1 score: 0.899330735206604
Evaluating: /data/ProcessedNYT/test_law.txt
Test score: 0.5373057723045349
Test accuracy: 0.7868852615356445
Test f1 score: 0.7968749403953552
Evaluating: /data/ProcessedNYT/test_finance.txt
Test score: 0.3735803961753845
Test accuracy: 0.8745098114013672
Test f1 score: 0.8742187023162842
Evaluating: /data/ProcessedNYT/test_education.txt
Test score: 0.478145956993103
Test accuracy: 0.8175675868988037
Test f1 score: 0.8124998807907104
Evaluating: /data/ProcessedNYT/test_politics.txt
Test score: 0.4200834333896637
Test accuracy: 0.8486171960830688
Test f1 score: 0.8485463857650757
Evaluating: /data/ProcessedNYT/test_medicine.txt
Test score: 0.5006592869758606
Test accuracy: 0.8052805066108704
Test f1 score: 0.8049999475479126


## Combined model: argfeat3 + sent_sum + pos_count

In [27]:
input_sent = Input(shape=(X_sent.shape[1],))
model_sent = Embedding(3, 128)(input_sent)
model_sent = SimpleRNN(128, dropout=0.2)(model_sent)

input_pos = Input(shape=(X_pos.shape[1],))
model_pos = Embedding(len(le.classes_), 128)(input_pos)
model_pos = SimpleRNN(128, dropout=0.2)(model_pos)

input_af3 = Input(shape=(X_argfeat3.shape[1],))
model_af3 = Embedding(3, 128)(input_af3)
model_af3 = SimpleRNN(128, dropout=0.2)(model_af3)

merged = Concatenate()([model_sent, model_pos, model_af3])
dense_pred = (Dense(2, activation='sigmoid'))(merged)

model = Model(inputs=[input_sent, input_pos, input_af3], outputs=dense_pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m])
print(model.summary())

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, 100)]        0           []                               
                                                                                                  
 input_12 (InputLayer)          [(None, 4500)]       0           []                               
                                                                                                  
 input_13 (InputLayer)          [(None, 100)]        0           []                               
                                                                                                  
 embedding_10 (Embedding)       (None, 100, 128)     384         ['input_11[0][0]']               
                                                                                            

In [28]:
# training
model.fit([X_sent_train, X_pos_train,X_af3_train],np.array(y_train),
          batch_size=batch_size,
          epochs=5,
          validation_data=([X_sent_val, X_pos_val,X_af3_val],np.array(y_val)))

model.save("af3_sent_pos.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    y_test, X_af3_test, X_sent_test, X_pos_test = process_test_df(df, af3=True, sent = True, pos=True)
    
    score, acc, f1 = model.evaluate([X_sent_test, X_pos_test, X_af3_test], y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/test_military.txt
Test score: 0.14191938936710358
Test accuracy: 0.9619834423065186
Test f1 score: 0.9639169573783875
Evaluating: /data/ProcessedNYT/test_law.txt
Test score: 0.253448486328125
Test accuracy: 0.8999999761581421
Test f1 score: 0.9032188653945923
Evaluating: /data/ProcessedNYT/test_finance.txt
Test score: 0.1652163565158844
Test accuracy: 0.9411764740943909
Test f1 score: 0.9411458373069763
Evaluating: /data/ProcessedNYT/test_education.txt
Test score: 0.22611041367053986
Test accuracy: 0.9020270109176636
Test f1 score: 0.9010887145996094
Evaluating: /data/ProcessedNYT/test_politics.txt
Test score: 0.1842302531003952
Test accuracy: 0.932314395904541
Test f1 score: 0.9320682883262634
Evaluating: /data/ProcessedNYT/test_medicine.txt
Test score: 0.3008972108364105
Test accuracy: 0.8745874762535095
Test f1 score: 0.878188967704773


## Combined model: argfeat6 + sent_sum + pos_count

In [10]:
input_sent = Input(shape=(X_sent.shape[1],))
model_sent = Embedding(3, 128)(input_sent)
model_sent = SimpleRNN(128, dropout=0.2)(model_sent)

input_pos = Input(shape=(X_pos.shape[1],))
model_pos = Embedding(len(le.classes_), 128)(input_pos)
model_pos = SimpleRNN(128, dropout=0.2)(model_pos)

input_af6 = Input(shape=(X_argfeat3.shape[1],))
model_af6 = Embedding(6, 128)(input_af6)
model_af6 = SimpleRNN(128, dropout=0.2)(model_af6)

merged = Concatenate()([model_sent, model_pos, model_af6])
dense_pred = (Dense(2, activation='sigmoid'))(merged)

model = Model(inputs=[input_sent, input_pos, input_af6], outputs=dense_pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m])
print(model.summary())

2023-04-09 23:20:40.075583: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-09 23:20:40.879439: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10413 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:84:00.0, compute capability: 6.1


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 4500)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 100, 128)     384         ['input_1[0][0]']                
                                                                                              

In [11]:
# training
model.fit([X_sent_train, X_pos_train,X_af6_train],np.array(y_train),
          batch_size=batch_size,
          epochs=5,
          validation_data=([X_sent_val, X_pos_val,X_af6_val],np.array(y_val)))

model.save("af6_sent_pos.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
for idx,df in enumerate(list_of_dfs):
    
    print('Evaluating:',list_of_files[idx])
    y_test, X_af6_test, X_sent_test, X_pos_test = process_test_df(df, af6=True, sent = True, pos=True)
    
    score, acc, f1 = model.evaluate([X_sent_test, X_pos_test, X_af6_test], y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('Test f1 score:', f1)

Evaluating: /data/ProcessedNYT/test_military.txt
Test score: 0.1596423089504242
Test accuracy: 0.9520661234855652
Test f1 score: 0.9521324634552002
Evaluating: /data/ProcessedNYT/test_law.txt
Test score: 0.2833259105682373
Test accuracy: 0.8672131299972534
Test f1 score: 0.8748556971549988
Evaluating: /data/ProcessedNYT/test_finance.txt
Test score: 0.18789760768413544
Test accuracy: 0.9196078181266785
Test f1 score: 0.9228276610374451
Evaluating: /data/ProcessedNYT/test_education.txt
Test score: 0.2193388044834137
Test accuracy: 0.9020270109176636
Test f1 score: 0.9097077250480652
Evaluating: /data/ProcessedNYT/test_politics.txt
Test score: 0.19857381284236908
Test accuracy: 0.9184861779212952
Test f1 score: 0.9198564887046814
Evaluating: /data/ProcessedNYT/test_medicine.txt
Test score: 0.28361776471138
Test accuracy: 0.8778877854347229
Test f1 score: 0.8826121091842651
