### Init

In [5]:
import pandas as pd
import numpy as np
import math
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support,balanced_accuracy_score
from gensim.models import Word2Vec,FastText
from nltk.tokenize import MWETokenizer,word_tokenize
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, AdamW
from tqdm.auto import tqdm
import copy
import gc
import os
import time
import warnings

path = r"data/scicite"
os.chdir(path)
if torch.cuda.is_available():
    device = 'cuda'
else: device = 'cpu'

#### Load data

In [6]:
df = pd.read_json('train.jsonl',lines=True)
df_test = pd.read_json('test.jsonl',lines=True)
df.head()

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,label_confidence,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label2_confidence
0,explicit,175.0,Introduction,168.0,"However, how frataxin interacts with the Fe-S ...",background,1.0,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,True,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,11,,
1,explicit,36.0,Novel Quantitative Trait Loci for Seminal Root...,16.0,"In the study by Hickey et al. (2012), spikes w...",background,1.0,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,True,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,2,,
2,explicit,228.0,Introduction,225.0,"The drug also reduces catecholamine secretion,...",background,1.0,9cdf605beb1aa1078f235c4332b3024daa8b31dc,4e6a17fb8d7a3cada601d942e22eb5da6d01adbd,False,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,0,,
3,explicit,110.0,Discussion,46.0,By clustering with lowly aggressive close kin ...,background,1.0,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,False,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,3,,
4,explicit,239.0,Discussion,234.0,Ophthalmic symptoms are rare manifestations of...,background,1.0,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,False,88b86556857f4374842d2af2e359576806239175>a5bb0...,88b86556857f4374842d2af2e359576806239175>a5bb0...,2,,


In [7]:
'''
Questions:
excerpt_index?
citeStart, citeEnd?
label2 (supportiveness)?
isKeyCitation?
'''
no_print = True

### Preprocessing

In [8]:
def null_test():
    null_cols = []
    for col in df.columns:
        if df[col].isnull().any(): null_cols.append(col)
    print(null_cols)

#### Processing functions

In [10]:
tk = MWETokenizer([('<','bos','>'),('<','eos','>')],separator='')
# tk.add_mwe([('<','bos','>'),('<','eos','>')])

def convert_label(value):
    if value == 'background':
        label = 0
    elif value == 'method':
        label = 1
    elif value == 'result':
        label = 2
    return label

def NA_impute(df):
    df['label2'] = df['label2'].fillna('cant_determine')
    try:
        print(len(df).df.columns)
        df = df.drop(columns='label2_confidence',axis=1)
        df_type = 'train'
    except:
        df_type = 'test'
    df['label_confidence'] = df['label_confidence'].fillna(df['label_confidence'].mean())
    df['citeStart'] = df['citeStart'].fillna(df['citeStart'].mean().astype(np.int64))
    df['citeEnd'] = df['citeEnd'].fillna(df['citeEnd'].mean().astype(np.int64))
    df['source'] = df['source'].fillna('unknown')
    df['sectionName'] = df['sectionName'].fillna('unknown')
    return df, df_type

def add_sectionName(df):
    df['string_lower_sn'] = df.apply(lambda x:x.sectionName.lower()+' '+x.string_lower,axis=1)
    df['tokens_lower_sn'] = df['string_lower_sn'].apply(lambda x: tk.tokenize(word_tokenize(x)))
    return df

def process_df(df):
    df, df_type = NA_impute(df)
    for col in ['citeStart','citeEnd']:
        df[col] = df[col].astype('int64')
    feature_cols = ['source', 'citeEnd', 'sectionName', 'citeStart', 'label_confidence', 'citingPaperId', 'citedPaperId', 'isKeyCitation', 'excerpt_index', 'label2', 'label2_confidence']
    if df_type == 'test':
        feature_cols.remove('label2_confidence')
    df['edited_string'] = ''
    for col in feature_cols:
        df['edited_string'] += col + ': ' + df[col].astype(str) + '[SEP]'
    df['edited_string'] += df['string']
    df['tagged_string'] = '<BOS>' + df['string'] + '<EOS>'
    df['label_num'] = df['label'].apply(lambda x: convert_label(x))
    df['string_lower'] = df['tagged_string'].apply(lambda x: x.lower())
    df['tokens_lower'] = df['string_lower'].apply(lambda x: tk.tokenize(word_tokenize(x)))  
    return df

In [11]:
df_train = process_df(df)
df_test = process_df(df_test)
# Remove outliers, i.e. lengthy sentences
df_train = df_train.loc[df_train['tokens_lower'].str.len() <= 100]
df_train = df_train.reset_index(drop=True)

In [12]:
'''
Hypotheses:
Word embeddings
Combine word embeddings with features
Attention mechanism
Include the sentences immediately before and after
Activation functions

Open questions
- Where can we get more contextual information? How to incorporate?
- variables:
    - lower case? stopwords? lemmatization? any words to include?
'''
no_print = True

#### Preparing Embedding Arrays

In [13]:
'''
input array (batch_first = True): N (batch size) * L (sequence length) * H_in (input size)
'''
def generate_X(df,token_col,model,seq_len,vec_size):
    '''
    token_col: col for tokens in dataframe
    model: word vectorization model
    '''
    X = []
    for i in tqdm(range(len(df))):
        i_arr = [model.wv[token] for token in df.at[i,token_col]]
        if len(i_arr) < seq_len:
            while len(i_arr) < seq_len:
                i_arr.append(np.zeros(vec_size))
        elif len(i_arr) > seq_len:
            i_arr = i_arr[:seq_len]
        X.append(i_arr)
    return np.array(X)

class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].to_numpy()

def generate_static_features(df):
    num_cols = ['citeStart','citeEnd','excerpt_index','isKeyCitation']
    cat_cols = ['label2']
    num_pipeline = Pipeline([
        ('selector',DataFrameSelector(num_cols)),
        ('scaler',StandardScaler())
    ])
    cat_pipeline = Pipeline([
        ('selector',DataFrameSelector(cat_cols)),
        ('ohe',OneHotEncoder())
    ])
    combined_pipeline = FeatureUnion([
        ('num',num_pipeline),
        ('cat',cat_pipeline)
    ])
    sf = combined_pipeline.fit_transform(df)
    return sf

def convert_y(y): # for training and validation labels. For y_val, keep one multi-target and one single-target version
    y_nn = np.zeros([y.shape[0],3])
    for i in range(len(y)):
        if y[i] == 0:
            y_nn[i,0] = 1
        elif y[i] == 1:
            y_nn[i,1] = 1
        elif y[i] == 2: 
            y_nn[i,2] = 1
    return y_nn

def to_tensor(arr,dtype='Float'):
    if dtype == "Float":
        arr = torch.from_numpy(arr).type(torch.FloatTensor)
    elif dtype == "Long":
        arr = torch.from_numpy(arr).type(torch.LongTensor)
    return arr

def check_dist_y(y,n=None):
    '''
    y: torch array
    n: last index of sample
    '''
    if n == None:
        print(torch.sum(y_train,dim=0))
    else:
        print(torch.sum(y_train[:n],dim=0))

In [14]:
vec_size = 200 # hp
seq_length = 100 # max word length based on tokenization, including padding tokens. Outliers removed (~ 1.5% of dataset)
corpus = df_train['tokens_lower'].tolist()
ft = FastText(corpus,vector_size=vec_size,epochs=10)
sf_core = generate_static_features(df_train)
sf_test = generate_static_features(df_test)
y_core = df_train['label_num'].to_numpy()
y_test = df_test['label_num'].to_numpy()

#####  For Subsequent Runs (with Tensors Saved), Ignore this Cell:
*i.e. Only run once*

In [15]:
X_core = generate_X(df_train,'tokens_lower',
                   ft,seq_length,vec_size)
X_test = generate_X(df_test,'tokens_lower',
                   ft,seq_length,vec_size)
X_train, X_val, sf_train, sf_val, y_train, y_val = train_test_split(X_core,sf_core,y_core,test_size=0.2,random_state= 1)
y_train, y_val_nn = convert_y(y_train),convert_y(y_val)
X_train, X_val, X_test, sf_train, sf_val, sf_test = to_tensor(X_train), to_tensor(X_val), to_tensor(X_test), to_tensor(sf_train.toarray()), to_tensor(sf_val.toarray()), to_tensor(sf_test.toarray()),
y_train, y_val_nn = to_tensor(y_train), to_tensor(y_val_nn)
torch.save([X_train,X_val,X_test,sf_train,sf_val,sf_test,y_train,y_val_nn],'data_arrays_v0.pt') # embeddings only
np.savez('y_arr_v0.npz',y_val=y_val,y_test=y_test)

100%|██████████| 8117/8117 [00:04<00:00, 2026.66it/s]
100%|██████████| 1861/1861 [00:02<00:00, 680.51it/s] 


##### For Subsequent Runs (Tensors Saved), Resume from this Cell:

In [16]:
# Load data
X_train,X_val,X_test,sf_train,sf_val,sf_test,y_train,y_val_nn = torch.load('data_arrays_v0.pt')
data = np.load('y_arr_v0.npz')
y_val = data['y_val']
y_test = data['y_test']

# Utility/Essential parameters
vec_size = 200 # hp
seq_length = 100 # max word length based on tokenization, including padding tokens
models_generated = ['rnn_0','rnn_a','rnn_a_sf','rnn_a_sf1']

In [17]:
def check_dist(arr):
    unique, counts = np.unique(arr,return_counts=True)
    return list(zip(unique,counts))
    
def resample_arr(X,y,rng): # Not used
    y_0 = y[:,0] == 1 # class 0
    y_non0 =  y[:,0] == 0 # class 1 or 2
    idx_0 = y_0.nonzero().numpy() # list of lists (numpy)
    idx_non0 = y_non0.nonzero() # list of lists (torch)
    idx_sample_0 = rng.choice(idx_0,len(idx_0)//3) # as size of class 0 ~ 4x that of other classes
    idx_sample_0 = torch.from_numpy(idx_sample_0).type(torch.LongTensor)
    idx_sample = torch.cat((idx_sample_0.squeeze(1),idx_non0.squeeze(1))) 
    del y_0,y_non0,idx_0,idx_non0,idx_sample_0
    return X[idx_sample], y[idx_sample]

def train_and_save(model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs):
    '''
    sf: [bool] whether static features are included or not
    '''
    X_val, sf_val = kwargs.get('X_val', None),kwargs.get('sf_val', None)
    best_loss = float('inf')
    best_epoch = 0
    for epoch in tqdm(range(n_epochs)):
        model.train()
        rng = np.random.default_rng()

        # Data resampling to offset class imbalance
#         X_sample, y_sample = resample_arr(X_train,y_train,rng)
#         check_dist(y_sample.numpy()[:,0]);check_dist(y_sample.numpy()[:,1]); check_dist(y_sample.numpy()[:,2]) # Troubleshooting
#         loader = DataLoader(TensorDataset(X_sample.to(device),y_sample.to(device)),batch_size=batch_size)

        if sf == False:
            for X_batch, y_batch in loader:
                y_pred_batch = model(X_batch)
                loss = loss_fn(y_pred_batch,y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        else:
            for X_batch, sf_batch, y_batch in loader:
                y_pred_batch = model(X_batch,sf_batch)
                loss = loss_fn(y_pred_batch,y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        
        # Evaluation
        model.eval()
        with torch.no_grad():
            if sf == False:
                y_pred_nn = model(X_val)
            else:
                y_pred_nn = model(X_val,sf_val)
            y_pred = torch.argmax(y_pred_nn,1).detach().cpu()
            y_dist = check_dist(y_pred) # Troubleshooting 
            print(y_dist) # Troubleshooting
            bal_accuracy = balanced_accuracy_score(y_val,y_pred)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                precision, recall, fscore, support = precision_recall_fscore_support(y_val,y_pred,average='weighted')
            val_loss = loss_fn(y_pred_nn,y_val_nn)
            tqdm.write(f"Epoch{epoch+1}: val loss={val_loss}, balanced accuracy={bal_accuracy}, precision={precision}, recall={recall}, fscore={fscore}")
            
            # Early Stopping
            if val_loss < best_loss:
                best_loss = val_loss
                best_epoch = epoch + 1
                best_model_weights = copy.deepcopy(model.state_dict())
                patience = 10
            else:
                patience -= 1
                if patience == 0:
                    print(f'Early stopping triggered at epoch {epoch+1}. Best model is from epoch {best_epoch}.')
                    break
        
        # For memory saving
        try:
            del X_sample,y_sample
            gc.collect()
        except:
            True
    torch.save(best_model_weights,filename)
    return model

#### Naive RNN

In [18]:
# See documentation: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

class RNN_base(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN_base,self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size,hidden_size,batch_first=True)
        self.ff = nn.Linear(hidden_size,output_size)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self,x):
        h0 = torch.zeros(1,x.size(0),self.hidden_size).to(device)
        # 1: number of RNN layers. x.size(0): batch_size.
        _, hidden = self.rnn(x,h0)
        output = self.ff(hidden[-1])
        return output

In [23]:
# For reference
class0 = check_dist(y_train[:,0])[1][1]
class1 = check_dist(y_train[:,1])[1][1]
class2 = check_dist(y_train[:,2])[1][1]
print(class0/len(y_train),class1/len(y_train),class2/len(y_train))
print(class0 + class1 + class2 == len(y_train))


0.5898660095487448 0.2782997073771754 0.13183428307407977


In [None]:
# HPs
hidden_size = 150
batch_size = 2
lr = 0.0004

# Save memory
for var in models_generated:
    if var in globals(): del var
gc.collect()

# # Build and train
# Building Approach 1: Using Module
rnn0 = RNN_base(input_size=vec_size,hidden_size=hidden_size,output_size=y_train.shape[1]).to(device)

# # Building Apprach 2: Using Sequential -> WIP: to enable weight analysis
# class generate_h(nn.Module):
#     def __init__(self,hidden_size):
#         super(generate_h,self).__init__()
#         self.hidden_size = hidden_size
#     def forward(self,x):
#         return torch.zeros(1,x.size(0),self.hidden_size).to(device)
# class generate

# rnn0 = nn.Sequential(
#     generate_h(hidden_size),
#     nn.RNN(input_size,hidden_size,batch_first=True),
#     nn.Linear(hidden_size,output_size)
# )

optimizer = optim.Adam(rnn0.parameters(),lr=lr)
# Weighted loss required for RNN to not assign all labels to one class
weights = torch.tensor([1, 2, 2.5]).to(device) #hp
loss_fn = nn.CrossEntropyLoss(weight=weights)
loader = DataLoader(TensorDataset(X_train.to(device),y_train.to(device)),batch_size=batch_size)
'''
train_and_save args: model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs
'''
rnn0 = train_and_save(rnn0,'RNN_v0.pt',
                      loss_fn,optimizer,
                      loader,y_val_nn.to(device),y_val,X_val=X_val.to(device))

#### RNN with Attention

In [None]:
# Reference: https://github.com/mttk/rnn-classifier/blob/master/model.py

# Self attention
class Attention(nn.Module):
    def __init__(self):
        super(Attention,self).__init__()
    
    def forward(self,dec_hidden_state,enc_hidden_states):
        '''
        ('query': hidden from rnn) dec_hidden_states: 1 * batch_size * hidden_size
        ('key'/'value': output from rnn) enc_hidden_state: batch_size * seq_len * hidden_size
        '''
        attn_w = torch.bmm(dec_hidden_state.transpose(0,1),enc_hidden_states.transpose(1,2)) # (B*1*H,B*H*L)
        attn_w = torch.nn.functional.softmax(attn_w.squeeze(1),dim=1) # need to standardize with ^. (B*1*L -> B*L)
        context = torch.bmm(enc_hidden_states.transpose(1,2),attn_w.unsqueeze(2)).squeeze(2) # (B*H*L,B*L*1)
        return torch.cat((context,dec_hidden_state.squeeze(0)),dim=1) # batch_size * (2*hidden_size)
        
class RNN_Attn(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN_Attn,self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size,hidden_size,batch_first=True)
        self.attn = Attention()
        self.dec = nn.Linear(2*hidden_size,output_size) # define context_size
#         self.softmax = nn.Softmax(dim=1)
        
    def forward(self,x):
        h0 = torch.zeros(1,x.size(0),self.hidden_size).to(device) # 1: number of RNN layers. x.size(0): batch_size.
        output, hidden = self.rnn(x,h0) # hidden: enc_hidden. output: dec_hidden
        c_h = self.attn(hidden,output)
        output = self.dec(c_h)
#         output = self.softmax(output)
        return output

In [None]:
# HPs
hidden_size = 70
batch_size = 2 # OOM when batch_size >= 4
lr = 0.0002

# Save memory
for var in models_generated:
    if var in globals(): del var
gc.collect()

# Build and train
rnn_a = RNN_Attn(input_size=vec_size,hidden_size=hidden_size,output_size=y_train.shape[1]).to(device)
optimizer = optim.Adam(rnn_a.parameters(),lr=lr)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train.to(device),y_train.to(device)),batch_size=batch_size)
'''
train_and_save args: model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs
'''
rnn_a = train_and_save(rnn_a,'RNN_Attn_v0.pt',
                     loss_fn,optimizer,
                     loader,X_val.to(device),y_val_nn.to(device),y_val)

In [None]:
'''
(Reference) Getting model weights: 
https://stackoverflow.com/questions/44130851/simple-lstm-in-pytorch-with-sequential-module
https://discuss.pytorch.org/t/how-to-get-all-weights-of-rnn-in-pytorch/33794/2
'''
no_print = True

#### RNN_Attn with Static Features

Build and Train Model:

In [33]:
class RNN_Attn_StatFeat(nn.Module):
    def __init__(self,input_size,feat_size,hidden_size,hidden_size2,output_size):
        super(RNN_Attn_StatFeat,self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size,hidden_size,batch_first=True)
        self.attn = Attention()
        self.dec = nn.Linear(2*hidden_size,hidden_size2) # define context_size
        self.ff1 = nn.Linear(hidden_size2+feat_size,output_size) # combined layer
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self,x,sf): # x: embeddings. sf: static features
        h0 = torch.zeros(1,x.size(0),self.hidden_size).to(device) # 1: number of RNN layers. x.size(0): batch_size.
        output, hidden = self.rnn(x,h0) # hidden: enc_hidden. output: dec_hidden
        context_h = self.attn(hidden,output)
        context_h2 = self.dec(context_h)
        output = self.ff1(torch.cat((context_h2,sf),dim=1))
        output = self.softmax(output)
        return output

In [50]:
# HPs
hidden_size = 70
hidden_size2 = 50
batch_size = 2 # OOM when batch_size >= 4
feat_size = sf_train.shape[1]
lr = 0.0002

# Save memory
for var in models_generated:
    if var in globals(): del var
gc.collect()

# Build and train
'''
RNN_Attn_StatFeat args: input_size,feat_size,hidden_size,hidden_size2,output_size
'''
rnn_a_sf = RNN_Attn_StatFeat(input_size=vec_size,feat_size=feat_size,
                             hidden_size=hidden_size,hidden_size2=hidden_size2,
                             output_size=y_train.shape[1]).to(device)
optimizer = optim.Adam(rnn_a_sf.parameters(),lr=lr)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train.to(device),sf_train.to(device),y_train.to(device)),batch_size=batch_size)
'''
train_and_save args: model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs
'''
rnn_a_sf = train_and_save(rnn_a_sf,'RNN_Attn_sf.pt',
                     loss_fn,optimizer,
                     loader,y_val_nn.to(device),y_val,sf=True,X_val=X_val.to(device),sf_val=sf_val.to(device))

  0%|          | 0/40 [00:00<?, ?it/s]

[(0, 1617), (1, 7)]
Epoch1: val loss=0.946342945098877, balanced accuracy=0.3333333333333333, precision=0.3451261382295865, recall=0.5862068965517241, fscore=0.4344639669266185
[(0, 1440), (1, 183), (2, 1)]
Epoch2: val loss=0.9002696871757507, balanced accuracy=0.42468654128317995, precision=0.577262342189561, recall=0.6483990147783252, fscore=0.5685871006804291
[(0, 1329), (1, 294), (2, 1)]
Epoch3: val loss=0.8863833546638489, balanced accuracy=0.4574329731892757, precision=0.5705987501250988, recall=0.6631773399014779, fscore=0.5981036318441268
[(0, 1151), (1, 297), (2, 176)]
Epoch4: val loss=0.8215010166168213, balanced accuracy=0.6578262618178584, precision=0.7356138425584221, recall=0.7376847290640394, fscore=0.7245454724733096
[(0, 1023), (1, 428), (2, 173)]
Epoch5: val loss=0.7423332333564758, balanced accuracy=0.7528385091410302, precision=0.8094902690683033, recall=0.8084975369458128, fscore=0.8060462515042427
[(0, 948), (1, 494), (2, 182)]
Epoch6: val loss=0.7397065162658691,

#### RNN_Attn_StatFeat with Section Header

##### For Subsequent Runs (Tensors Saved), Ignore this Cell:

In [30]:
# Generic inclusion
df_train, df_test = add_sectionName(df_train), add_sectionName(df_test)
df_train = df_train.loc[df_train['tokens_lower'].str.len() <= 100]
df_train = df_train.reset_index(drop=True)
corpus_sn = df_train['tokens_lower_sn'].tolist()
ft_sn = FastText(corpus_sn,vector_size=vec_size,epochs=10)
# unchanged: sf_core, y_core. Note:
    # sf_core is not a tensor (sf_train and sf_val definitely are, but not sf_train_sn and sf_val_sn)
    # sf_test and y_test no change, as no changes to features and not implicated by the repeat of train-val splits
X_core_sn = generate_X(df_train,'tokens_lower_sn',
                   ft,seq_length,vec_size)
X_test_sn = generate_X(df_test,'tokens_lower_sn',
                   ft,seq_length,vec_size)
X_train_sn, X_val_sn, sf_train_sn, sf_val_sn, y_train_sn, y_val_sn = train_test_split(X_core_sn,sf_core,y_core,test_size=0.2,random_state= 1)
y_train_sn, y_val_sn_nn = convert_y(y_train_sn),convert_y(y_val_sn)
if torch.is_tensor(sf_test):
    X_train_sn, X_val_sn, X_test_sn, sf_train_sn, sf_val_sn = to_tensor(X_train_sn), to_tensor(X_val_sn), to_tensor(X_test_sn), to_tensor(sf_train_sn.toarray()), to_tensor(sf_val_sn.toarray())
else:
    X_train_sn, X_val_sn, X_test_sn, sf_train_sn, sf_val_sn, sf_test = to_tensor(X_train_sn), to_tensor(X_val_sn), to_tensor(X_test_sn), to_tensor(sf_train_sn.toarray()), to_tensor(sf_val_sn.toarray()), to_tensor(sf_test.toarray())
y_train_sn, y_val_sn_nn = to_tensor(y_train_sn), to_tensor(y_val_sn_nn)
torch.save([X_train_sn,X_val_sn,X_test_sn,sf_train_sn,sf_val_sn,sf_test,y_train_sn,y_val_sn_nn],'data_arrays_sn_v0.pt') # embeddings only
np.savez('y_arr_sn_v0.npz',y_val=y_val_sn,y_test=y_test)

100%|██████████| 8117/8117 [00:16<00:00, 485.72it/s]
100%|██████████| 1861/1861 [00:01<00:00, 961.34it/s] 


##### For Subsequent Runs (Tensors Saved), Resume from this Cell:

In [31]:
X_train_sn,X_val_sn,X_test_sn,sf_train_sn,sf_val_sn,sf_test,y_train_sn,y_val_sn_nn = torch.load('data_arrays_sn_v0.pt')
data_sn = np.load('y_arr_sn_v0.npz')
y_val_sn = data_sn['y_val']
y_test = data_sn['y_test']

In [34]:
# HPs
hidden_size = 70
hidden_size2 = 50
batch_size = 2 # OOM when batch_size >= 4
feat_size = sf_train_sn.shape[1]
lr = 0.0002

# Save memory
for var in models_generated:
    if var in globals(): del var
gc.collect()

# Build and train
'''
RNN_Attn_StatFeat args: input_size,feat_size,hidden_size,hidden_size2,output_size
'''
rnn_a_sf1 = RNN_Attn_StatFeat(input_size=vec_size,feat_size=feat_size,
                             hidden_size=hidden_size,hidden_size2=hidden_size2,
                             output_size=y_train_sn.shape[1]).to(device)
optimizer = optim.Adam(rnn_a_sf1.parameters(),lr=lr)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train_sn.to(device),sf_train_sn.to(device),y_train_sn.to(device)),batch_size=batch_size)
'''
train_and_save args: model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs
'''
rnn_a_sf1 = train_and_save(rnn_a_sf1,'RNN_Attn_sf_v1.pt',
                     loss_fn,optimizer,
                     loader,y_val_sn_nn.to(device),y_val_sn,sf=True,X_val=X_val_sn.to(device),sf_val=sf_val_sn.to(device))

  0%|          | 0/40 [00:00<?, ?it/s]

  2%|▎         | 1/40 [00:32<21:04, 32.43s/it]

[(0, 1149), (1, 475)]
Epoch1: val loss=0.8001357316970825, balanced accuracy=0.5732848695033569, precision=0.6441212748717025, recall=0.750615763546798, fscore=0.6928264623775358


  5%|▌         | 2/40 [01:02<19:49, 31.29s/it]

[(0, 1147), (1, 477)]
Epoch2: val loss=0.7956319451332092, balanced accuracy=0.5743908674580943, precision=0.6450996035597374, recall=0.7518472906403941, fscore=0.6939665238703127


  8%|▊         | 3/40 [01:59<26:24, 42.81s/it]

[(0, 1183), (1, 441)]
Epoch3: val loss=0.7899011969566345, balanced accuracy=0.5727290916366546, precision=0.6512045646661031, recall=0.7567733990147784, fscore=0.6982778809658402


 10%|█         | 4/40 [02:50<27:31, 45.89s/it]

[(0, 1153), (1, 470), (2, 1)]
Epoch4: val loss=0.7921645641326904, balanced accuracy=0.5757914276821839, precision=0.6479180440319966, recall=0.7543103448275862, fscore=0.6964723977264257


 12%|█▎        | 5/40 [03:25<24:30, 42.01s/it]

[(0, 1146), (1, 477), (2, 1)]
Epoch5: val loss=0.7927519083023071, balanced accuracy=0.5736905873460495, precision=0.6444596406726004, recall=0.750615763546798, fscore=0.6930775696097446


 15%|█▌        | 6/40 [04:04<23:16, 41.07s/it]

[(0, 1199), (1, 424), (2, 1)]
Epoch6: val loss=0.7916719913482666, balanced accuracy=0.5691332088390912, precision=0.6523632572660041, recall=0.7561576354679803, fscore=0.6977316593595723


 18%|█▊        | 7/40 [04:43<22:12, 40.38s/it]

[(0, 1197), (1, 426), (2, 1)]
Epoch7: val loss=0.7899195551872253, balanced accuracy=0.5750689164554711, precision=0.6578384655584241, recall=0.7623152709359606, fscore=0.703624550956153


 20%|██        | 8/40 [05:20<20:56, 39.28s/it]

[(0, 1014), (1, 440), (2, 170)]
Epoch8: val loss=0.7084484100341797, balanced accuracy=0.7910997732426304, precision=0.8467024116272245, recall=0.8448275862068966, fscore=0.8426619667663816


 22%|██▎       | 9/40 [05:58<20:11, 39.07s/it]

[(0, 1012), (1, 427), (2, 185)]
Epoch9: val loss=0.700186014175415, balanced accuracy=0.8093974963722862, precision=0.8543016746349507, recall=0.853448275862069, fscore=0.8519193146592459


 25%|██▌       | 10/40 [06:35<19:08, 38.28s/it]

[(0, 991), (1, 455), (2, 178)]
Epoch10: val loss=0.6967411637306213, balanced accuracy=0.8109319485369905, precision=0.8563989369590223, recall=0.854064039408867, fscore=0.8530201056332176


 28%|██▊       | 11/40 [07:09<17:53, 37.01s/it]

[(0, 1002), (1, 436), (2, 186)]
Epoch11: val loss=0.6950381398200989, balanced accuracy=0.8174926536271073, precision=0.8584395071589022, recall=0.8571428571428571, fscore=0.8560683070382221


 28%|██▊       | 11/40 [07:37<20:05, 41.58s/it]


KeyboardInterrupt: 

#### Word Order
Based on RNN_Attn_StatFeat with sectionName

##### For Subsequent Runs (Tensors Saved), Ignore this Cell:

In [76]:
# Generic inclusion
def randomize_tokens(tk_list,rng):
    '''
    Based on tokens with sectionName
    Keep positions of: sectionName bos, eos
    '''
    target_list = tk_list[2:-1]
    reordered_list = tk_list[:2] + rng.permutation(target_list).tolist() + [tk_list[-1]]
    return reordered_list

def generate_reordered_tokens(df):
    rng1 = np.random.default_rng()
    df['tokens_lower_sn_jumbled'] = df['tokens_lower_sn'].apply(lambda x:randomize_tokens(x,rng1))
    return df

def randomize_ngrams(tk_list, rng, n):
    target_list = tk_list[2:-1]
    ngrams = [target_list[i:i + n] for i in range(0, len(target_list), n)]
    last_ngram = ngrams[-1]
    last_ngram.extend(['[BUF]']*(n-len(last_ngram))) # ensure last ngram same size as others for rng to work
    reordered_ngrams = rng.permutation(ngrams).tolist()
    flattened_ngrams = [token for ngram in reordered_ngrams for token in ngram]
    filtered_list = list(filter(lambda x: x != 'BUF', flattened_ngrams))
    reordered_list = tk_list[:2] + filtered_list + [tk_list[-1]]
    return reordered_list

def generate_ngrams(df, n=5):
    rng1 = np.random.default_rng()
    df['tokens_lower_sn_ngrams'] = df['tokens_lower_sn'].apply(lambda x:randomize_ngrams(x, rng1, n))
    return df

# Regenerate col again (in case the cell above hasn't been run)
df_train, df_test = add_sectionName(df_train), add_sectionName(df_test)
df_train = df_train.loc[df_train['tokens_lower'].str.len() <= 100]
df_train = df_train.reset_index(drop=True)
# df_train, df_test = generate_reordered_tokens(df_train), generate_reordered_tokens(df_test)
print(len(df_train))
df_train, df_test = generate_ngrams(df_train, n=len(df_train)//5), generate_ngrams(df_test, n=len(df_test)//5)
# unchanged: sf_core, y_core. Note:
    # sf_core is not a tensor (sf_train and sf_val definitely are, but not sf_train_sn and sf_val_sn)
    # y_test has already been loaded
    # sf_test and y_test no change, as no changes to features and not implicated by the repeat of train-val splits
X_core_sn_ro = generate_X(df_train,'tokens_lower_sn',
                   ft,seq_length,vec_size)
X_test_sn_ro = generate_X(df_test,'tokens_lower_sn',
                   ft,seq_length,vec_size)
X_train_sn_ro, X_val_sn_ro, sf_train_sn_ro, sf_val_sn_ro, y_train_sn_ro, y_val_sn_ro = train_test_split(X_core_sn_ro,sf_core,y_core,test_size=0.2,random_state= 1)
y_train_sn_ro, y_val_sn_ro_nn = convert_y(y_train_sn_ro),convert_y(y_val_sn_ro)
if torch.is_tensor(sf_test): # Depends on whether sf_test has been loaded as or converted to tensor previously
    X_train_sn_ro, X_val_sn_ro, X_test_sn_ro, sf_train_sn_ro, sf_val_sn_ro = to_tensor(X_train_sn_ro), to_tensor(X_val_sn_ro), to_tensor(X_test_sn_ro), to_tensor(sf_train_sn_ro.toarray()), to_tensor(sf_val_sn_ro.toarray())
else:
    X_train_sn_ro, X_val_sn_ro, X_test_sn_ro, sf_train_sn_ro, sf_val_sn_ro, sf_test = to_tensor(X_train_sn_ro), to_tensor(X_val_sn_ro), to_tensor(X_test_sn_ro), to_tensor(sf_train_sn_ro.toarray()), to_tensor(sf_val_sn_ro.toarray()), to_tensor(sf_test.toarray())
y_train_sn_ro, y_val_sn_ro_nn = to_tensor(y_train_sn_ro), to_tensor(y_val_sn_ro_nn)
torch.save([X_train_sn_ro,X_val_sn_ro,X_test_sn_ro,sf_train_sn_ro,sf_val_sn_ro,sf_test,y_train_sn_ro,y_val_sn_ro_nn],'data_arrays_sn_ro_v0.pt') # embeddings only
np.savez('y_arr_sn_ro_v0.npz',y_val=y_val_sn_ro,y_test=y_test)

8117


100%|██████████| 8117/8117 [00:11<00:00, 719.97it/s] 
100%|██████████| 1861/1861 [00:03<00:00, 533.54it/s]


##### For Subsequent Runs (Tensors Saved), Resume from this Cell:

In [77]:
X_train_sn_ro,X_val_sn_ro,X_test_sn_ro,sf_train_sn_ro,sf_val_sn_ro,sf_test,y_train_sn_ro,y_val_sn_ro_nn = torch.load('data_arrays_sn_ro_v0.pt')
data_sn = np.load('y_arr_sn_ro_v0.npz')
y_val_sn_ro = data_sn['y_val']
y_test = data_sn['y_test']


In [78]:
# HPs
hidden_size = 70
hidden_size2 = 50
batch_size = 2 # OOM when batch_size >= 4
feat_size = sf_train_sn_ro.shape[1]
lr = 0.0002

# Save memory
for var in models_generated:
    if var in globals(): del var
gc.collect()

# Build and train
'''
RNN_Attn_StatFeat args: input_size,feat_size,hidden_size,hidden_size2,output_size
'''
rnn_a_sf2 = RNN_Attn_StatFeat(input_size=vec_size,feat_size=feat_size,
                             hidden_size=hidden_size,hidden_size2=hidden_size2,
                             output_size=y_train_sn_ro.shape[1]).to(device)
optimizer = optim.Adam(rnn_a_sf2.parameters(),lr=lr)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train_sn_ro.to(device),sf_train_sn_ro.to(device),y_train_sn_ro.to(device)),batch_size=batch_size)
'''
train_and_save args: model,filename,
                   loss_fn,optimizer,
                   loader,
                   y_val_nn,y_val,n_epochs=40,sf=False,**kwargs
'''
rnn_a_sf2 = train_and_save(rnn_a_sf2,'RNN_Attn_sf_v2.pt',
                     loss_fn,optimizer,
                     loader,y_val_sn_ro_nn.to(device),y_val_sn_ro,sf=True,X_val=X_val_sn_ro.to(device),sf_val=sf_val_sn_ro.to(device))

  0%|          | 0/40 [00:00<?, ?it/s]

  2%|▎         | 1/40 [00:36<23:29, 36.14s/it]

[(0, 1621), (1, 3)]
Epoch1: val loss=0.9522787928581238, balanced accuracy=0.33484504913076346, precision=0.52530898338616, recall=0.5874384236453202, fscore=0.4362349587750636


  5%|▌         | 2/40 [01:11<22:43, 35.88s/it]

[(0, 1425), (1, 199)]
Epoch2: val loss=0.9170416593551636, balanced accuracy=0.43218954248366015, precision=0.5778227647614965, recall=0.6551724137931034, fscore=0.5770383300179159


  8%|▊         | 3/40 [01:44<21:14, 34.44s/it]

[(0, 1355), (1, 269)]
Epoch3: val loss=0.8779624104499817, balanced accuracy=0.46052865590680714, precision=0.5839049989333376, recall=0.6693349753694581, fscore=0.6029886920584837


 10%|█         | 4/40 [02:17<20:14, 33.75s/it]

[(0, 1187), (1, 249), (2, 188)]
Epoch4: val loss=0.7965524792671204, balanced accuracy=0.673834180136701, precision=0.7567907746528606, recall=0.7536945812807881, fscore=0.7358628590470336


 12%|█▎        | 5/40 [02:52<19:59, 34.28s/it]

[(0, 951), (1, 495), (2, 178)]
Epoch5: val loss=0.7290163636207581, balanced accuracy=0.7817404739673647, precision=0.8273126238039697, recall=0.8226600985221675, fscore=0.8224611618213806


 15%|█▌        | 6/40 [03:29<20:01, 35.33s/it]

[(0, 980), (1, 449), (2, 195)]
Epoch6: val loss=0.7077786326408386, balanced accuracy=0.8081798375916023, precision=0.8453236933889068, recall=0.8448275862068966, fscore=0.8441331919214358


 18%|█▊        | 7/40 [04:03<19:10, 34.85s/it]

[(0, 929), (1, 502), (2, 193)]
Epoch7: val loss=0.7107329964637756, balanced accuracy=0.8152043645741124, precision=0.8453462016024249, recall=0.8392857142857143, fscore=0.840411200937267


 20%|██        | 8/40 [04:36<18:11, 34.10s/it]

[(0, 974), (1, 444), (2, 206)]
Epoch8: val loss=0.6967216730117798, balanced accuracy=0.827639136462666, precision=0.8544053850236907, recall=0.854064039408867, fscore=0.8537732354500772


 22%|██▎       | 9/40 [05:09<17:31, 33.91s/it]

[(0, 979), (1, 429), (2, 216)]
Epoch9: val loss=0.6910523176193237, balanced accuracy=0.8405412670118553, precision=0.8628149114757987, recall=0.8633004926108374, fscore=0.86277746475446


 25%|██▌       | 10/40 [05:43<16:53, 33.79s/it]

[(0, 996), (1, 410), (2, 218)]
Epoch10: val loss=0.6881459951400757, balanced accuracy=0.8441179502103872, precision=0.8669095313205506, recall=0.8676108374384236, fscore=0.8665988454698561


 28%|██▊       | 11/40 [06:19<16:40, 34.52s/it]

[(0, 986), (1, 420), (2, 218)]
Epoch11: val loss=0.6857755184173584, balanced accuracy=0.8483585353333253, precision=0.869469708952863, recall=0.8700738916256158, fscore=0.8693755934301448


 30%|███       | 12/40 [06:59<16:52, 36.15s/it]

[(0, 989), (1, 410), (2, 225)]
Epoch12: val loss=0.6884109377861023, balanced accuracy=0.8408423975650866, precision=0.861104264232036, recall=0.8620689655172413, fscore=0.8610879341590677


 32%|███▎      | 13/40 [07:37<16:32, 36.76s/it]

[(0, 1004), (1, 407), (2, 213)]
Epoch13: val loss=0.6836774349212646, balanced accuracy=0.8451946435139712, precision=0.8715423421223824, recall=0.8719211822660099, fscore=0.8708014492533174


 35%|███▌      | 14/40 [08:15<16:06, 37.18s/it]

[(0, 984), (1, 425), (2, 215)]
Epoch14: val loss=0.6831976175308228, balanced accuracy=0.8520822470402303, precision=0.8733798629863274, recall=0.8737684729064039, fscore=0.8731948017665765


 38%|███▊      | 15/40 [08:52<15:25, 37.01s/it]

[(0, 1028), (1, 393), (2, 203)]
Epoch15: val loss=0.6813697218894958, balanced accuracy=0.8372273151684917, precision=0.8719812851545262, recall=0.8713054187192119, fscore=0.8696226365483174


 40%|████      | 16/40 [09:28<14:44, 36.87s/it]

[(0, 992), (1, 455), (2, 177)]
Epoch16: val loss=0.6902587413787842, balanced accuracy=0.8202771007392856, precision=0.8616532946936549, recall=0.8577586206896551, fscore=0.857306512496974


 42%|████▎     | 17/40 [10:08<14:24, 37.61s/it]

[(0, 1015), (1, 419), (2, 190)]
Epoch17: val loss=0.6842361092567444, balanced accuracy=0.828634484096669, precision=0.8665861909362926, recall=0.8651477832512315, fscore=0.8640210709519502


 45%|████▌     | 18/40 [10:55<14:51, 40.51s/it]

[(0, 1053), (1, 387), (2, 184)]
Epoch18: val loss=0.6849590539932251, balanced accuracy=0.8214689916370589, precision=0.8697786233680351, recall=0.8663793103448276, fscore=0.8642303463312061


 48%|████▊     | 19/40 [11:36<14:16, 40.81s/it]

[(0, 1005), (1, 414), (2, 205)]
Epoch19: val loss=0.6799502968788147, balanced accuracy=0.8408525026172086, precision=0.8689437429992684, recall=0.8688423645320197, fscore=0.8678470579981464


 50%|█████     | 20/40 [12:12<13:03, 39.20s/it]

[(0, 1031), (1, 410), (2, 183)]
Epoch20: val loss=0.685183048248291, balanced accuracy=0.8211708925994641, precision=0.8647301467720212, recall=0.8620689655172413, fscore=0.8606240103769516


 52%|█████▎    | 21/40 [12:48<12:07, 38.27s/it]

[(0, 1067), (1, 383), (2, 174)]
Epoch21: val loss=0.6900904774665833, balanced accuracy=0.8058086871112081, precision=0.8639091704574781, recall=0.8589901477832512, fscore=0.856278406946492


 55%|█████▌    | 22/40 [13:27<11:34, 38.58s/it]

[(0, 1004), (1, 438), (2, 182)]
Epoch22: val loss=0.6891644597053528, balanced accuracy=0.8184480863052291, precision=0.8585789300091985, recall=0.8559113300492611, fscore=0.8551638634986899


 57%|█████▊    | 23/40 [14:11<11:20, 40.05s/it]

[(0, 1044), (1, 407), (2, 173)]
Epoch23: val loss=0.6907511949539185, balanced accuracy=0.8084915784495617, precision=0.8626643186312725, recall=0.8589901477832512, fscore=0.8568911802666381


 60%|██████    | 24/40 [14:47<10:22, 38.90s/it]

[(0, 1057), (1, 396), (2, 171)]
Epoch24: val loss=0.6887561082839966, balanced accuracy=0.8046698477370745, precision=0.8623716442229741, recall=0.8577586206896551, fscore=0.8553329031913185


 62%|██████▎   | 25/40 [15:19<09:13, 36.90s/it]

[(0, 1066), (1, 376), (2, 182)]
Epoch25: val loss=0.6914238333702087, balanced accuracy=0.8073825489791876, precision=0.8609184308345948, recall=0.8571428571428571, fscore=0.8542941065519691


 65%|██████▌   | 26/40 [16:02<09:02, 38.72s/it]

[(0, 1073), (1, 395), (2, 156)]
Epoch26: val loss=0.7031577825546265, balanced accuracy=0.7799326801427641, precision=0.8519535841718596, recall=0.8454433497536946, fscore=0.8418384883363288


 68%|██████▊   | 27/40 [17:31<11:39, 53.83s/it]

[(0, 1035), (1, 414), (2, 175)]
Epoch27: val loss=0.6921879649162292, balanced accuracy=0.8060537346251632, precision=0.8586225934651722, recall=0.8559113300492611, fscore=0.8538838514193922


 70%|███████   | 28/40 [18:11<09:54, 49.58s/it]

[(0, 1032), (1, 427), (2, 165)]
Epoch28: val loss=0.697123646736145, balanced accuracy=0.8012174566796416, precision=0.8580630672082945, recall=0.853448275862069, fscore=0.8515480163144233


 70%|███████   | 28/40 [18:41<08:00, 40.07s/it]

[(0, 982), (1, 457), (2, 185)]
Epoch29: val loss=0.6982401609420776, balanced accuracy=0.8136835542297728, precision=0.8512005186494405, recall=0.8485221674876847, fscore=0.8482130875705938
Early stopping triggered at epoch 29. Best model is from epoch 19.





#### Other Hyperparam-like Variables

In [None]:
'''
To test:
- 1 more hidden layer for RNN_Attn_StatFeat?
'''
no_print=True

In [70]:
'''
Results:
- RNN_Attn - Epoch13: val loss=0.45884597301483154, balanced accuracy=0.755593651602055, precision=0.8039308068696814, recall=0.8060344827586207, fscore=0.8028471981210324
- RNN_Attn_StatFeat - Epoch29: val loss=0.6968145370483398, balanced accuracy=0.819888056232594, precision=0.8485338182885594, recall=0.8497536945812808, fscore=0.8471742309103825
- RNN_Attn_StatFeat with sectionName - Epoch26: val loss=0.6687561273574829, balanced accuracy=0.8488238729835368, precision=0.8824253380976532, recall=0.8811576354679803, fscore=0.879580416187504
- RNN_Attn_StatFeat with sectionName, randomized ngrams - Epoch40: val loss=0.6920244693756104, balanced accuracy=0.8208106474913198, precision=0.8647667864909244, recall=0.8608374384236454, fscore=0.8600928393653015
- RNN_Attn_StatFeat with sectionName, randomized ngrams, no test data randomization - Epoch24: val loss=0.6812471151351929, balanced accuracy=0.8263280059498547, precision=0.8717889618210934, recall=0.8682266009852216, fscore=0.8665472144701037
- RNN_Attn_StatFeat with sectionName, randomized ngrams, n/5-sized ngrams - Epoch19: val loss=0.6799502968788147, balanced accuracy=0.8408525026172086, precision=0.8689437429992684, recall=0.8688423645320197, fscore=0.8678470579981464



Observations:
- Results (esp for RNN_Attn) are worse with softmax than without

Actions taken to address problem of assigning all samples to one class:
- Removed outlier samples with extremely high word counts (>100) (worked for Attn. RNN unaffected)
- Resampling (tried for RNN; didn't work)
- Weighted loss (tried for RNN; didn't work)
'''
no_print = True

### Archives

In [None]:
'''
pytorch lightning, lightning fabric
'''
no_print=True