### Init

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support,balanced_accuracy_score
from gensim.models import Word2Vec,FastText
from nltk.tokenize import MWETokenizer,word_tokenize
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, AdamW
from tqdm.auto import tqdm
import copy
import gc
import os
import time
import warnings

path = r"C:\YZC\NUS\Semester 2\CS4248 Natural Language Processing\Project\scicite"
os.chdir(path)
if torch.cuda.is_available():
    device = 'cuda'
else: device = 'cpu'

#### Load data

In [2]:
df = pd.read_json('train.jsonl',lines=True)
df_test = pd.read_json('test.jsonl',lines=True)
df.head()

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,label_confidence,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label2_confidence
0,explicit,175.0,Introduction,168.0,"However, how frataxin interacts with the Fe-S ...",background,1.0,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,True,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,11,,
1,explicit,36.0,Novel Quantitative Trait Loci for Seminal Root...,16.0,"In the study by Hickey et al. (2012), spikes w...",background,1.0,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,True,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,2,,
2,explicit,228.0,Introduction,225.0,"The drug also reduces catecholamine secretion,...",background,1.0,9cdf605beb1aa1078f235c4332b3024daa8b31dc,4e6a17fb8d7a3cada601d942e22eb5da6d01adbd,False,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,0,,
3,explicit,110.0,Discussion,46.0,By clustering with lowly aggressive close kin ...,background,1.0,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,False,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,3,,
4,explicit,239.0,Discussion,234.0,Ophthalmic symptoms are rare manifestations of...,background,1.0,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,False,88b86556857f4374842d2af2e359576806239175>a5bb0...,88b86556857f4374842d2af2e359576806239175>a5bb0...,2,,


In [3]:
'''
Questions:
excerpt_index?
citeStart, citeEnd?
label2 (supportiveness)?
isKeyCitation?
'''
no_print = True

### Preprocessing

In [4]:
def null_test():
    null_cols = []
    for col in df.columns:
        if df[col].isnull().any(): null_cols.append(col)
    print(null_cols)

#### Processing functions

In [21]:
tk = MWETokenizer([('<','bos','>'),('<','eos','>')],separator='')
# tk.add_mwe([('<','bos','>'),('<','eos','>')])

def convert_label(value):
    if value == 'background':
        label = 0
    elif value == 'method':
        label = 1
    elif value == 'result':
        label = 2
    return label

def NA_impute(df):
    df['label2'] = df['label2'].fillna('cant_determine')
    try:
        print(len(df).df.columns)
        df = df.drop(columns='label2_confidence',axis=1)
        df_type = 'train'
    except:
        df_type = 'test'
    df['label_confidence'] = df['label_confidence'].fillna(df['label_confidence'].mean())
    df['citeStart'] = df['citeStart'].fillna(df['citeStart'].mean().astype(np.int64))
    df['citeEnd'] = df['citeEnd'].fillna(df['citeEnd'].mean().astype(np.int64))
    df['source'] = df['source'].fillna('unknown')
    df['sectionName'] = df['sectionName'].fillna('unknown')
    return df, df_type

def add_sectionName(df):
    df['string_lower_sn'] = df.apply(lambda x:x.sectionName.lower()+' '+x.string_lower,axis=1)
    df['tokens_lower_sn'] = df['string_lower_sn'].apply(lambda x: tk.tokenize(word_tokenize(x)))
    return df

def process_df(df):
    df, df_type = NA_impute(df)
    for col in ['citeStart','citeEnd']:
        df[col] = df[col].astype('int64')
    feature_cols = ['source', 'citeEnd', 'sectionName', 'citeStart', 'label_confidence', 'citingPaperId', 'citedPaperId', 'isKeyCitation', 'excerpt_index', 'label2', 'label2_confidence']
    if df_type == 'test':
        feature_cols.remove('label2_confidence')
    df['edited_string'] = ''
    for col in feature_cols:
        df['edited_string'] += col + ': ' + df[col].astype(str) + '[SEP]'
    df['edited_string'] += df['string']
    df['tagged_string'] = '<BOS>' + df['string'] + '<EOS>'
    df['label_num'] = df['label'].apply(lambda x: convert_label(x))
    df['string_lower'] = df['tagged_string'].apply(lambda x: x.lower())
    df['tokens_lower'] = df['string_lower'].apply(lambda x: tk.tokenize(word_tokenize(x)))  
    return df

In [6]:
df_train = process_df(df)
df_test = process_df(df_test)
# Remove outliers, i.e. lengthy sentences
df_train = df_train.loc[df_train['tokens_lower'].str.len() <= 100]
df_train = df_train.reset_index(drop=True)

In [7]:
'''
Hypotheses:
Word embeddings
Combine word embeddings with features
Attention mechanism
Include the sentences immediately before and after
Activation functions

Open questions
- Where can we get more contextual information? How to incorporate?
- variables:
    - lower case? stopwords? lemmatization? any words to include?
'''
no_print = True

#### Preparing Embedding Arrays

In [8]:
'''
input array (batch_first = True): N (batch size) * L (sequence length) * H_in (input size)
'''
def generate_X(df,token_col,model,seq_len,vec_size):
    '''
    token_col: col for tokens in dataframe
    model: word vectorization model
    '''
    X = []
    for i in tqdm(range(len(df))):
        i_arr = [model.wv[token] for token in df.at[i,token_col]]
        if len(i_arr) < seq_len:
            while len(i_arr) < seq_len:
                i_arr.append(np.zeros(vec_size))
        elif len(i_arr) > seq_len:
            i_arr = i_arr[:seq_len]
        X.append(i_arr)
    return np.array(X)

class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].to_numpy()

def generate_static_features(df):
    num_cols = ['citeStart','citeEnd','excerpt_index','isKeyCitation']
    cat_cols = ['label2']
    num_pipeline = Pipeline([
        ('selector',DataFrameSelector(num_cols)),
        ('scaler',StandardScaler())
    ])
    cat_pipeline = Pipeline([
        ('selector',DataFrameSelector(cat_cols)),
        ('ohe',OneHotEncoder())
    ])
    combined_pipeline = FeatureUnion([
        ('num',num_pipeline),
        ('cat',cat_pipeline)
    ])
    sf = combined_pipeline.fit_transform(df)
    return sf

def convert_y(y): # for training and validation labels. For y_val, keep one multi-target and one single-target version
    y_nn = np.zeros([y.shape[0],3])
    for i in range(len(y)):
        if y[i] == 0:
            y_nn[i,0] = 1
        elif y[i] == 1:
            y_nn[i,1] = 1
        elif y[i] == 2: 
            y_nn[i,2] = 1
    return y_nn

def to_tensor(arr,dtype='Float'):
    if dtype == "Float":
        arr = torch.from_numpy(arr).type(torch.FloatTensor)
    elif dtype == "Long":
        arr = torch.from_numpy(arr).type(torch.LongTensor)
    return arr

def check_dist_y(y,n=None):
    '''
    y: torch array
    n: last index of sample
    '''
    if n == None:
        print(torch.sum(y_train,dim=0))
    else:
        print(torch.sum(y_train[:n],dim=0))

In [25]:
vec_size = 200 # hp
seq_length = 100 # max word length based on tokenization, including padding tokens. Outliers removed (~ 1.5% of dataset)
corpus = df_train['tokens_lower'].tolist()
ft = FastText(corpus,vector_size=vec_size,epochs=10)
sf_core = generate_static_features(df_train)
sf_test = generate_static_features(df_test)
y_core = df_train['label_num'].to_numpy()
y_test = df_test['label_num'].to_numpy()

#####  For Subsequent Runs (with Tensors Saved), Ignore this Cell:
*i.e. Only run once*

In [25]:
X_core = generate_X(df_train,'tokens_lower',
                   ft,seq_length,vec_size)
X_test = generate_X(df_test,'tokens_lower',
                   ft,seq_length,vec_size)
X_train, X_val, sf_train, sf_val, y_train, y_val = train_test_split(X_core,sf_core,y_core,test_size=0.2,random_state= 1)
y_train, y_val_nn = convert_y(y_train),convert_y(y_val)
X_train, X_val, X_test, sf_train, sf_val, sf_test = to_tensor(X_train), to_tensor(X_val), to_tensor(X_test), to_tensor(sf_train.toarray()), to_tensor(sf_val.toarray()), to_tensor(sf_test.toarray()),
y_train, y_val_nn = to_tensor(y_train), to_tensor(y_val_nn)
torch.save([X_train,X_val,X_test,sf_train,sf_val,sf_test,y_train,y_val_nn],'data_arrays_v0.pt') # embeddings only
np.savez('y_arr_v0.npz',y_val=y_val,y_test=y_test)

  0%|          | 0/8117 [00:00<?, ?it/s]

  0%|          | 0/1861 [00:00<?, ?it/s]

##### For Subsequent Runs (Tensors Saved), Resume from this Cell:

In [12]:
# Load data
X_train,X_val,X_test,sf_train,sf_val,sf_test,y_train,y_val_nn = torch.load('data_arrays_v0.pt')
data = np.load('y_arr_v0.npz')
y_val = data['y_val']
y_test = data['y_test']

# Utility/Essential parameters
vec_size = 200 # hp
seq_length = 100 # max word length based on tokenization, including padding tokens
models_generated = ['rnn_0','rnn_a','rnn_a_sf','rnn_a_sf1']

In [14]:
def check_dist(arr):
    unique, counts = np.unique(arr,return_counts=True)
    return list(zip(unique,counts))
    
def resample_arr(X,y,rng): # Not used
    y_0 = y[:,0] == 1 # class 0
    y_non0 =  y[:,0] == 0 # class 1 or 2
    idx_0 = y_0.nonzero().numpy() # list of lists (numpy)
    idx_non0 = y_non0.nonzero() # list of lists (torch)
    idx_sample_0 = rng.choice(idx_0,len(idx_0)//3) # as size of class 0 ~ 4x that of other classes
    idx_sample_0 = torch.from_numpy(idx_sample_0).type(torch.LongTensor)
    idx_sample = torch.cat((idx_sample_0.squeeze(1),idx_non0.squeeze(1))) 
    del y_0,y_non0,idx_0,idx_non0,idx_sample_0
    return X[idx_sample], y[idx_sample]

def train_and_save(model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs):
    '''
    sf: [bool] whether static features are included or not
    '''
    X_val, sf_val = kwargs.get('X_val', None),kwargs.get('sf_val', None)
    best_loss = float('inf')
    best_epoch = 0
    for epoch in tqdm(range(n_epochs)):
        model.train()
        rng = np.random.default_rng()

        # Data resampling to offset class imbalance
#         X_sample, y_sample = resample_arr(X_train,y_train,rng)
#         check_dist(y_sample.numpy()[:,0]);check_dist(y_sample.numpy()[:,1]); check_dist(y_sample.numpy()[:,2]) # Troubleshooting
#         loader = DataLoader(TensorDataset(X_sample.to(device),y_sample.to(device)),batch_size=batch_size)

        if sf == False:
            for X_batch, y_batch in loader:
                y_pred_batch = model(X_batch)
                loss = loss_fn(y_pred_batch,y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        else:
            for X_batch, sf_batch, y_batch in loader:
                y_pred_batch = model(X_batch,sf_batch)
                loss = loss_fn(y_pred_batch,y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        
        # Evaluation
        model.eval()
        with torch.no_grad():
            if sf == False:
                y_pred_nn = model(X_val)
            else:
                y_pred_nn = model(X_val,sf_val)
            y_pred = torch.argmax(y_pred_nn,1).detach().cpu()
            y_dist = check_dist(y_pred) # Troubleshooting 
            print(y_dist) # Troubleshooting
            bal_accuracy = balanced_accuracy_score(y_val,y_pred)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                precision, recall, fscore, support = precision_recall_fscore_support(y_val,y_pred,average='weighted')
            val_loss = loss_fn(y_pred_nn,y_val_nn)
            tqdm.write(f"Epoch{epoch+1}: val loss={val_loss}, balanced accuracy={bal_accuracy}, precision={precision}, recall={recall}, fscore={fscore}")
            
            # Early Stopping
            if val_loss < best_loss:
                best_loss = val_loss
                best_epoch = epoch + 1
                best_model_weights = copy.deepcopy(model.state_dict())
                patience = 10
            else:
                patience -= 1
                if patience == 0:
                    print(f'Early stopping triggered at epoch {epoch+1}. Best model is from epoch {best_epoch}.')
                    break
        
        # For memory saving
        try:
            del X_sample,y_sample
            gc.collect()
        except:
            True
    torch.save(best_model_weights,filename)
    return model

#### Naive RNN

In [28]:
# See documentation: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

class RNN_base(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN_base,self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size,hidden_size,batch_first=True)
        self.ff = nn.Linear(hidden_size,output_size)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self,x):
        h0 = torch.zeros(1,x.size(0),self.hidden_size).to(device)
        # 1: number of RNN layers. x.size(0): batch_size.
        _, hidden = self.rnn(x,h0)
        output = self.ff(hidden[-1])
        return output

In [49]:
# For reference
class0 = check_dist(y_train[:,0])[1][1]
class1 = check_dist(y_train[:,1])[1][1]
class2 = check_dist(y_train[:,2])[1][1]
print(class0/len(y_train),class1/len(y_train),class2/len(y_train))

0.5898660095487448 0.2782997073771754 0.13183428307407977


In [34]:
# HPs
hidden_size = 150
batch_size = 2
lr = 0.0004

# Save memory
for var in models_generated:
    if var in globals(): del var
gc.collect()

# # Build and train
# Building Approach 1: Using Module
rnn0 = RNN_base(input_size=vec_size,hidden_size=hidden_size,output_size=y_train.shape[1]).to(device)

# # Building Apprach 2: Using Sequential -> WIP: to enable weight analysis
# class generate_h(nn.Module):
#     def __init__(self,hidden_size):
#         super(generate_h,self).__init__()
#         self.hidden_size = hidden_size
#     def forward(self,x):
#         return torch.zeros(1,x.size(0),self.hidden_size).to(device)
# class generate

# rnn0 = nn.Sequential(
#     generate_h(hidden_size),
#     nn.RNN(input_size,hidden_size,batch_first=True),
#     nn.Linear(hidden_size,output_size)
# )

optimizer = optim.Adam(rnn0.parameters(),lr=lr)
# Weighted loss required for RNN to not assign all labels to one class
weights = torch.tensor([1, 2, 2.5]).to(device) #hp
loss_fn = nn.CrossEntropyLoss(weight=weights)
loader = DataLoader(TensorDataset(X_train.to(device),y_train.to(device)),batch_size=batch_size)
'''
train_and_save args: model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs
'''
rnn0 = train_and_save(rnn0,'RNN_v0.pt',
                      loss_fn,optimizer,
                      loader,
                      X_val.to(device),y_val_nn.to(device),y_val)

  0%|          | 0/40 [00:00<?, ?it/s]

[(0, 1623), (1, 1)]
Epoch1: val loss=0.9455477595329285, balanced accuracy=0.33408919123204833, precision=0.6154019801559479, recall=0.5868226600985221, fscore=0.434680364535915
[(0, 1623), (1, 1)]
Epoch2: val loss=0.9733210206031799, balanced accuracy=0.33408919123204833, precision=0.6154019801559479, recall=0.5868226600985221, fscore=0.434680364535915


KeyboardInterrupt: 

#### RNN with Attention

In [28]:
# Reference: https://github.com/mttk/rnn-classifier/blob/master/model.py

# Self attention
class Attention(nn.Module):
    def __init__(self):
        super(Attention,self).__init__()
    
    def forward(self,dec_hidden_state,enc_hidden_states):
        '''
        ('query': hidden from rnn) dec_hidden_states: 1 * batch_size * hidden_size
        ('key'/'value': output from rnn) enc_hidden_state: batch_size * seq_len * hidden_size
        '''
        attn_w = torch.bmm(dec_hidden_state.transpose(0,1),enc_hidden_states.transpose(1,2)) # (B*1*H,B*H*L)
        attn_w = torch.nn.functional.softmax(attn_w.squeeze(1),dim=1) # need to standardize with ^. (B*1*L -> B*L)
        context = torch.bmm(enc_hidden_states.transpose(1,2),attn_w.unsqueeze(2)).squeeze(2) # (B*H*L,B*L*1)
        return torch.cat((context,dec_hidden_state.squeeze(0)),dim=1) # batch_size * (2*hidden_size)
        
class RNN_Attn(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN_Attn,self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size,hidden_size,batch_first=True)
        self.attn = Attention()
        self.dec = nn.Linear(2*hidden_size,output_size) # define context_size
#         self.softmax = nn.Softmax(dim=1)
        
    def forward(self,x):
        h0 = torch.zeros(1,x.size(0),self.hidden_size).to(device) # 1: number of RNN layers. x.size(0): batch_size.
        output, hidden = self.rnn(x,h0) # hidden: enc_hidden. output: dec_hidden
        c_h = self.attn(hidden,output)
        output = self.dec(c_h)
#         output = self.softmax(output)
        return output

In [38]:
# HPs
hidden_size = 70
batch_size = 2 # OOM when batch_size >= 4
lr = 0.0002

# Save memory
for var in models_generated:
    if var in globals(): del var
gc.collect()

# Build and train
rnn_a = RNN_Attn(input_size=vec_size,hidden_size=hidden_size,output_size=y_train.shape[1]).to(device)
optimizer = optim.Adam(rnn_a.parameters(),lr=lr)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train.to(device),y_train.to(device)),batch_size=batch_size)
'''
train_and_save args: model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs
'''
rnn_a = train_and_save(rnn_a,'RNN_Attn_v0.pt',
                     loss_fn,optimizer,
                     loader,X_val.to(device),y_val_nn.to(device),y_val)

  0%|          | 0/40 [00:00<?, ?it/s]

[(0, 978), (1, 444), (2, 202)]
Epoch1: val loss=0.5450728535652161, balanced accuracy=0.7415880493611585, precision=0.7862471267481351, recall=0.7875615763546798, fscore=0.7863688411440127


KeyboardInterrupt: 

In [None]:
'''
(Reference) Getting model weights: 
https://stackoverflow.com/questions/44130851/simple-lstm-in-pytorch-with-sequential-module
https://discuss.pytorch.org/t/how-to-get-all-weights-of-rnn-in-pytorch/33794/2
'''
no_print = True

#### RNN_Attn with Static Features

Build and Train Model:

In [29]:
class RNN_Attn_StatFeat(nn.Module):
    def __init__(self,input_size,feat_size,hidden_size,hidden_size2,output_size):
        super(RNN_Attn_StatFeat,self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size,hidden_size,batch_first=True)
        self.attn = Attention()
        self.dec = nn.Linear(2*hidden_size,hidden_size2) # define context_size
        self.ff1 = nn.Linear(hidden_size2+feat_size,output_size) # combined layer
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self,x,sf): # x: embeddings. sf: static features
        h0 = torch.zeros(1,x.size(0),self.hidden_size).to(device) # 1: number of RNN layers. x.size(0): batch_size.
        output, hidden = self.rnn(x,h0) # hidden: enc_hidden. output: dec_hidden
        context_h = self.attn(hidden,output)
        context_h2 = self.dec(context_h)
        output = self.ff1(torch.cat((context_h2,sf),dim=1))
        output = self.softmax(output)
        return output

In [50]:
# HPs
hidden_size = 70
hidden_size2 = 50
batch_size = 2 # OOM when batch_size >= 4
feat_size = sf_train.shape[1]
lr = 0.0002

# Save memory
for var in models_generated:
    if var in globals(): del var
gc.collect()

# Build and train
'''
RNN_Attn_StatFeat args: input_size,feat_size,hidden_size,hidden_size2,output_size
'''
rnn_a_sf = RNN_Attn_StatFeat(input_size=vec_size,feat_size=feat_size,
                             hidden_size=hidden_size,hidden_size2=hidden_size2,
                             output_size=y_train.shape[1]).to(device)
optimizer = optim.Adam(rnn_a_sf.parameters(),lr=lr)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train.to(device),sf_train.to(device),y_train.to(device)),batch_size=batch_size)
'''
train_and_save args: model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs
'''
rnn_a_sf = train_and_save(rnn_a_sf,'RNN_Attn_sf.pt',
                     loss_fn,optimizer,
                     loader,y_val_nn.to(device),y_val,sf=True,X_val=X_val.to(device),sf_val=sf_val.to(device))

  0%|          | 0/40 [00:00<?, ?it/s]

[(0, 1617), (1, 7)]
Epoch1: val loss=0.946342945098877, balanced accuracy=0.3333333333333333, precision=0.3451261382295865, recall=0.5862068965517241, fscore=0.4344639669266185
[(0, 1440), (1, 183), (2, 1)]
Epoch2: val loss=0.9002696871757507, balanced accuracy=0.42468654128317995, precision=0.577262342189561, recall=0.6483990147783252, fscore=0.5685871006804291
[(0, 1329), (1, 294), (2, 1)]
Epoch3: val loss=0.8863833546638489, balanced accuracy=0.4574329731892757, precision=0.5705987501250988, recall=0.6631773399014779, fscore=0.5981036318441268
[(0, 1151), (1, 297), (2, 176)]
Epoch4: val loss=0.8215010166168213, balanced accuracy=0.6578262618178584, precision=0.7356138425584221, recall=0.7376847290640394, fscore=0.7245454724733096
[(0, 1023), (1, 428), (2, 173)]
Epoch5: val loss=0.7423332333564758, balanced accuracy=0.7528385091410302, precision=0.8094902690683033, recall=0.8084975369458128, fscore=0.8060462515042427
[(0, 948), (1, 494), (2, 182)]
Epoch6: val loss=0.7397065162658691,

#### RNN_Attn_StatFeat with Section Header

##### For Subsequent Runs (Tensors Saved), Ignore this Cell:

In [20]:
# Generic inclusion
df_train, df_test = add_sectionName(df_train), add_sectionName(df_test)
df_train = df_train.loc[df_train['tokens_lower'].str.len() <= 100]
df_train = df_train.reset_index(drop=True)
corpus_sn = df_train['tokens_lower_sn'].tolist()
ft_sn = FastText(corpus_sn,vector_size=vec_size,epochs=10)
# unchanged: sf_core, y_core. Note:
    # sf_core is not a tensor (sf_train and sf_val definitely are, but not sf_train_sn and sf_val_sn)
    # sf_test and y_test no change, as no changes to features and not implicated by the repeat of train-val splits
X_core_sn = generate_X(df_train,'tokens_lower_sn',
                   ft,seq_length,vec_size)
X_test_sn = generate_X(df_test,'tokens_lower_sn',
                   ft,seq_length,vec_size)
X_train_sn, X_val_sn, sf_train_sn, sf_val_sn, y_train_sn, y_val_sn = train_test_split(X_core_sn,sf_core,y_core,test_size=0.2,random_state= 1)
y_train_sn, y_val_sn_nn = convert_y(y_train_sn),convert_y(y_val_sn)
if torch.is_tensor(sf_test):
    X_train_sn, X_val_sn, X_test_sn, sf_train_sn, sf_val_sn = to_tensor(X_train_sn), to_tensor(X_val_sn), to_tensor(X_test_sn), to_tensor(sf_train_sn.toarray()), to_tensor(sf_val_sn.toarray())
else:
    X_train_sn, X_val_sn, X_test_sn, sf_train_sn, sf_val_sn, sf_test = to_tensor(X_train_sn), to_tensor(X_val_sn), to_tensor(X_test_sn), to_tensor(sf_train_sn.toarray()), to_tensor(sf_val_sn.toarray()), to_tensor(sf_test.toarray())
y_train_sn, y_val_sn_nn = to_tensor(y_train_sn), to_tensor(y_val_sn_nn)
torch.save([X_train_sn,X_val_sn,X_test_sn,sf_train_sn,sf_val_sn,sf_test,y_train_sn,y_val_sn_nn],'data_arrays_sn_v0.pt') # embeddings only
np.savez('y_arr_sn_v0.npz',y_val=y_val_sn,y_test=y_test)

##### For Subsequent Runs (Tensors Saved), Resume from this Cell:

In [11]:
X_train_sn,X_val_sn,X_test_sn,sf_train_sn,sf_val_sn,sf_test,y_train_sn,y_val_sn_nn = torch.load('data_arrays_sn_v0.pt')
data_sn = np.load('y_arr_sn_v0.npz')
y_val_sn = data_sn['y_val']
y_test = data_sn['y_test']

In [60]:
# HPs
hidden_size = 70
hidden_size2 = 50
batch_size = 2 # OOM when batch_size >= 4
feat_size = sf_train_sn.shape[1]
lr = 0.0002

# Save memory
for var in models_generated:
    if var in globals(): del var
gc.collect()

# Build and train
'''
RNN_Attn_StatFeat args: input_size,feat_size,hidden_size,hidden_size2,output_size
'''
rnn_a_sf1 = RNN_Attn_StatFeat(input_size=vec_size,feat_size=feat_size,
                             hidden_size=hidden_size,hidden_size2=hidden_size2,
                             output_size=y_train_sn.shape[1]).to(device)
optimizer = optim.Adam(rnn_a_sf1.parameters(),lr=lr)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train_sn.to(device),sf_train_sn.to(device),y_train_sn.to(device)),batch_size=batch_size)
'''
train_and_save args: model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs
'''
rnn_a_sf1 = train_and_save(rnn_a_sf1,'RNN_Attn_sf_v1.pt',
                     loss_fn,optimizer,
                     loader,y_val_sn_nn.to(device),y_val_sn,sf=True,X_val=X_val_sn.to(device),sf_val=sf_val_sn.to(device))

  0%|          | 0/40 [00:00<?, ?it/s]

[(0, 1285), (1, 338), (2, 1)]
Epoch1: val loss=0.8915425539016724, balanced accuracy=0.4710606464808145, precision=0.572199819063369, recall=0.6650246305418719, fscore=0.6056641833643287
[(0, 1249), (1, 374), (2, 1)]
Epoch2: val loss=0.8438905477523804, balanced accuracy=0.514917077942288, precision=0.6095374455566667, recall=0.7050492610837439, fscore=0.6474837485163338
[(0, 1266), (1, 357), (2, 1)]
Epoch3: val loss=0.8287297487258911, balanced accuracy=0.5219198790627363, precision=0.6230685361250237, recall=0.7173645320197044, fscore=0.6585172217412721
[(0, 1082), (1, 392), (2, 150)]
Epoch4: val loss=0.7485403418540955, balanced accuracy=0.7343482847684527, precision=0.8131044612694973, recall=0.8078817733990148, fscore=0.8028704986792734
[(0, 995), (1, 467), (2, 162)]
Epoch5: val loss=0.719699501991272, balanced accuracy=0.7801868222036289, precision=0.8365693028220017, recall=0.8318965517241379, fscore=0.8303569263537796
[(0, 953), (1, 500), (2, 171)]
Epoch6: val loss=0.7110909819

#### Word Order
Based on RNN_Attn_StatFeat with sectionName

##### For Subsequent Runs (Tensors Saved), Ignore this Cell:

In [34]:
# Generic inclusion
def randomize_tokens(tk_list,rng):
    '''
    Based on tokens with sectionName
    Keep positions of: sectionName bos, eos
    '''
    target_list = tk_list[2:-1]
    reordered_list = tk_list[:2] + rng.permutation(target_list).tolist() + [tk_list[-1]]
    return reordered_list

def generate_reordered_tokens(df):
    rng1 = np.random.default_rng()
    df['tokens_lower_sn_jumbled'] = df['tokens_lower_sn'].apply(lambda x:randomize_tokens(x,rng1))
    return df

# Regenerate col again (in case the cell above hasn't been run)
df_train, df_test = add_sectionName(df_train), add_sectionName(df_test)
df_train = df_train.loc[df_train['tokens_lower'].str.len() <= 100]
df_train = df_train.reset_index(drop=True)
df_train, df_test = generate_reordered_tokens(df_train), generate_reordered_tokens(df_test)
# unchanged: sf_core, y_core. Note:
    # sf_core is not a tensor (sf_train and sf_val definitely are, but not sf_train_sn and sf_val_sn)
    # y_test has already been loaded
    # sf_test and y_test no change, as no changes to features and not implicated by the repeat of train-val splits
X_core_sn_ro = generate_X(df_train,'tokens_lower_sn',
                   ft,seq_length,vec_size)
X_test_sn_ro = generate_X(df_test,'tokens_lower_sn',
                   ft,seq_length,vec_size)
X_train_sn_ro, X_val_sn_ro, sf_train_sn_ro, sf_val_sn_ro, y_train_sn_ro, y_val_sn_ro = train_test_split(X_core_sn_ro,sf_core,y_core,test_size=0.2,random_state= 1)
y_train_sn_ro, y_val_sn_ro_nn = convert_y(y_train_sn_ro),convert_y(y_val_sn_ro)
if torch.is_tensor(sf_test): # Depends on whether sf_test has been loaded as or converted to tensor previously
    X_train_sn_ro, X_val_sn_ro, X_test_sn_ro, sf_train_sn_ro, sf_val_sn_ro = to_tensor(X_train_sn_ro), to_tensor(X_val_sn_ro), to_tensor(X_test_sn_ro), to_tensor(sf_train_sn_ro.toarray()), to_tensor(sf_val_sn_ro.toarray())
else:
    X_train_sn_ro, X_val_sn_ro, X_test_sn_ro, sf_train_sn_ro, sf_val_sn_ro, sf_test = to_tensor(X_train_sn_ro), to_tensor(X_val_sn_ro), to_tensor(X_test_sn_ro), to_tensor(sf_train_sn_ro.toarray()), to_tensor(sf_val_sn_ro.toarray()), to_tensor(sf_test.toarray())
y_train_sn_ro, y_val_sn_ro_nn = to_tensor(y_train_sn_ro), to_tensor(y_val_sn_ro_nn)
torch.save([X_train_sn_ro,X_val_sn_ro,X_test_sn_ro,sf_train_sn_ro,sf_val_sn_ro,sf_test,y_train_sn_ro,y_val_sn_ro_nn],'data_arrays_sn_ro_v0.pt') # embeddings only
np.savez('y_arr_sn_ro_v0.npz',y_val=y_val_sn_ro,y_test=y_test)

  0%|          | 0/8117 [00:00<?, ?it/s]

  0%|          | 0/1861 [00:00<?, ?it/s]

##### For Subsequent Runs (Tensors Saved), Resume from this Cell:

In [36]:
X_train_sn_ro,X_val_sn_ro,X_test_sn_ro,sf_train_sn_ro,sf_val_sn_ro,sf_test,y_train_sn_ro,y_val_sn_ro_nn = torch.load('data_arrays_sn_ro_v0.pt')
data_sn = np.load('y_arr_sn_ro_v0.npz')
y_val_sn_ro = data_sn['y_val']
y_test = data_sn['y_test']

In [37]:
# HPs
hidden_size = 70
hidden_size2 = 50
batch_size = 2 # OOM when batch_size >= 4
feat_size = sf_train_sn_ro.shape[1]
lr = 0.0002

# Save memory
for var in models_generated:
    if var in globals(): del var
gc.collect()

# Build and train
'''
RNN_Attn_StatFeat args: input_size,feat_size,hidden_size,hidden_size2,output_size
'''
rnn_a_sf2 = RNN_Attn_StatFeat(input_size=vec_size,feat_size=feat_size,
                             hidden_size=hidden_size,hidden_size2=hidden_size2,
                             output_size=y_train_sn_ro.shape[1]).to(device)
optimizer = optim.Adam(rnn_a_sf2.parameters(),lr=lr)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train_sn_ro.to(device),sf_train_sn_ro.to(device),y_train_sn_ro.to(device)),batch_size=batch_size)
'''
train_and_save args: model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs
'''
rnn_a_sf2 = train_and_save(rnn_a_sf2,'RNN_Attn_sf_v2.pt',
                     loss_fn,optimizer,
                     loader,y_val_sn_ro_nn.to(device),y_val_sn_ro,sf=True,X_val=X_val_sn_ro.to(device),sf_val=sf_val_sn_ro.to(device))

  0%|          | 0/40 [00:00<?, ?it/s]

[(0, 1372), (1, 252)]
Epoch1: val loss=0.8767874240875244, balanced accuracy=0.456504824151883, precision=0.5842245149291243, recall=0.6779556650246306, fscore=0.6042967211434183
[(0, 1213), (1, 411)]
Epoch2: val loss=0.8053643107414246, balanced accuracy=0.5610577564359077, precision=0.6481448146796317, recall=0.7512315270935961, fscore=0.6923192215329085
[(0, 1194), (1, 430)]
Epoch3: val loss=0.7972503900527954, balanced accuracy=0.5648203725934818, precision=0.6459192346388452, recall=0.75, fscore=0.6917543476055908
[(0, 1212), (1, 412)]
Epoch4: val loss=0.8002452850341797, balanced accuracy=0.5637866257614157, precision=0.649815005563161, recall=0.7524630541871922, fscore=0.6938534790805186
[(0, 1177), (1, 447)]
Epoch5: val loss=0.7956586480140686, balanced accuracy=0.5702225334578276, precision=0.6474235604623604, recall=0.7530788177339901, fscore=0.6948112746052932


KeyboardInterrupt: 

#### Other Hyperparam-like Variables

In [None]:
'''
To test:
- 1 more hidden layer for RNN_Attn_StatFeat?
'''
no_print=True

In [70]:
'''
Results:
- RNN_Attn - Epoch13: val loss=0.45884597301483154, balanced accuracy=0.755593651602055, precision=0.8039308068696814, recall=0.8060344827586207, fscore=0.8028471981210324
- RNN_Attn_StatFeat - Epoch29: val loss=0.6968145370483398, balanced accuracy=0.819888056232594, precision=0.8485338182885594, recall=0.8497536945812808, fscore=0.8471742309103825
- RNN_Attn_StatFeat with sectionName - Epoch26: val loss=0.6687561273574829, balanced accuracy=0.8488238729835368, precision=0.8824253380976532, recall=0.8811576354679803, fscore=0.879580416187504

Observations:
- Results (esp for RNN_Attn) are worse with softmax than without

Actions taken to address problem of assigning all samples to one class:
- Removed outlier samples with extremely high word counts (>100) (worked for Attn. RNN unaffected)
- Resampling (tried for RNN; didn't work)
- Weighted loss (tried for RNN; didn't work)
'''
no_print = True

### Archives

In [None]:
'''
pytorch lightning, lightning fabric
'''
no_print=True