### Init

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support,balanced_accuracy_score
from gensim.models import Word2Vec,FastText
from nltk.tokenize import MWETokenizer,word_tokenize
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, AdamW
from tqdm.auto import tqdm
import copy
import gc
import os
import time
import warnings

path = r"C:\YZC\NUS\Semester 2\CS4248 Natural Language Processing\Project\scicite"
os.chdir(path)
if torch.cuda.is_available():
    device = 'cuda'
else: device = 'cpu'

#### Load data

In [2]:
df = pd.read_json('train.jsonl',lines=True)
df_test = pd.read_json('test.jsonl',lines=True)
df.head()

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,label_confidence,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label2_confidence
0,explicit,175.0,Introduction,168.0,"However, how frataxin interacts with the Fe-S ...",background,1.0,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,True,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,11,,
1,explicit,36.0,Novel Quantitative Trait Loci for Seminal Root...,16.0,"In the study by Hickey et al. (2012), spikes w...",background,1.0,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,True,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,2,,
2,explicit,228.0,Introduction,225.0,"The drug also reduces catecholamine secretion,...",background,1.0,9cdf605beb1aa1078f235c4332b3024daa8b31dc,4e6a17fb8d7a3cada601d942e22eb5da6d01adbd,False,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,0,,
3,explicit,110.0,Discussion,46.0,By clustering with lowly aggressive close kin ...,background,1.0,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,False,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,3,,
4,explicit,239.0,Discussion,234.0,Ophthalmic symptoms are rare manifestations of...,background,1.0,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,False,88b86556857f4374842d2af2e359576806239175>a5bb0...,88b86556857f4374842d2af2e359576806239175>a5bb0...,2,,


In [None]:
'''
Questions:
excerpt_index?
citeStart, citeEnd?
label2 (supportiveness)?
isKeyCitation?
'''
no_print = True

### Preprocessing

In [3]:
def null_test():
    null_cols = []
    for col in df.columns:
        if df[col].isnull().any(): null_cols.append(col)
    print(null_cols)

#### Processing functions

In [4]:
tk = MWETokenizer([('<','bos','>'),('<','eos','>')],separator='')
# tk.add_mwe([('<','bos','>'),('<','eos','>')])

def convert_label(value):
    if value == 'background':
        label = 0
    elif value == 'method':
        label = 1
    elif value == 'result':
        label = 2
    return label

def NA_impute(df):
    df['label2'] = df['label2'].fillna('cant_determine')
    try:
        print(len(df).df.columns)
        df = df.drop(columns='label2_confidence',axis=1)
        df_type = 'train'
    except:
        df_type = 'test'
    df['label_confidence'] = df['label_confidence'].fillna(df['label_confidence'].mean())
    df['citeStart'] = df['citeStart'].fillna(df['citeStart'].mean().astype(np.int64))
    df['citeEnd'] = df['citeEnd'].fillna(df['citeEnd'].mean().astype(np.int64))
    df['source'] = df['source'].fillna('unknown')
    df['sectionName'] = df['sectionName'].fillna('unknown')
    return df, df_type

def process_df(df):
    df, df_type = NA_impute(df)
    for col in ['citeStart','citeEnd']:
        df[col] = df[col].astype('int64')
    feature_cols = ['source', 'citeEnd', 'sectionName', 'citeStart', 'label_confidence', 'citingPaperId', 'citedPaperId', 'isKeyCitation', 'excerpt_index', 'label2', 'label2_confidence']
    if df_type == 'test':
        feature_cols.remove('label2_confidence')
    df['edited_string'] = ''
    for col in feature_cols:
        df['edited_string'] += col + ': ' + df[col].astype(str) + '[SEP]'
    df['edited_string'] += df['string']
    df['tagged_string'] = '<BOS>' + df['string'] + '<EOS>'
    df['label_num'] = df['label'].apply(lambda x: convert_label(x))
    df['string_lower'] = df['tagged_string'].apply(lambda x: x.lower())
    df['tokens_lower'] = df['string_lower'].apply(lambda x: tk.tokenize(word_tokenize(x)))  
    return df

In [17]:
df_train = process_df(df)
df_test = process_df(df_test)

In [18]:
# Remove outliers - KIV
df_train = df_train.loc[df_train['tokens_lower'].str.len() <= 100]
df_train = df_train.reset_index(drop=True)

### RNN

In [None]:
'''
Hypotheses:
Word embeddings
Combine word embeddings with features
Attention mechanism
Include the sentences immediately before and after
Activation functions

Open questions
- Where can we get more contextual information? How to incorporate?
- variables:
    - lower case? stopwords? lemmatization? any words to include?
'''
no_print = True

In [None]:
'''
Generate word_embeddings (ok)
Convert labels (ok)
Generate other features?
'''

#### Preparing Arrays

In [22]:
'''
input array (batch_first = True): N (batch size) * L (sequence length) * H_in (input size)
'''
def generate_X(df,token_col,model,seq_len,vec_size):
    '''
    token_col: col for tokens in dataframe
    model: word vectorization model
    '''
    X = []
    for i in tqdm(range(len(df))):
        i_arr = [model.wv[token] for token in df.at[i,token_col]]
        if len(i_arr) < seq_len:
            while len(i_arr) < seq_len:
                i_arr.append(np.zeros(vec_size))
        elif len(i_arr) > seq_len:
            i_arr = i_arr[:seq_len]
        X.append(i_arr)
    return np.array(X)

def convert_y(y): # for training and validation labels. For y_val, keep one multi-target and one single-target version
    y_nn = np.zeros([y.shape[0],3])
    for i in range(len(y)):
        if y[i] == 0:
            y_nn[i,0] = 1
        elif y[i] == 1:
            y_nn[i,1] = 1
        elif y[i] == 2: 
            y_nn[i,2] = 1
    return y_nn

def to_tensor(arr,dtype='Float'):
    if dtype == "Float":
        arr = torch.from_numpy(arr).type(torch.FloatTensor)
    elif dtype == "Long":
        arr = torch.from_numpy(arr).type(torch.LongTensor)
    return arr

def check_dist_y(y,n=None):
    '''
    y: torch array
    n: last index of sample
    '''
    if n == None:
        print(torch.sum(y_train,dim=0))
    else:
        print(torch.sum(y_train[:n],dim=0))

In [45]:
vec_size = 200 # hp
seq_length = 100 # max word length based on tokenization, including padding tokens. Outliers removed (~ 1.5% of dataset)
corpus = df_train['tokens_lower'].tolist()
ft = FastText(corpus,vector_size=vec_size,epochs=10)

In [46]:
X_core = generate_X(df_train,'tokens_lower',
                   ft,seq_length,vec_size)
X_test = generate_X(df_test,'tokens_lower',
                   ft,seq_length,vec_size)
y_core = df_train['label_num'].to_numpy()
y_test = df_test['label_num'].to_numpy()
X_train, X_val, y_train, y_val = train_test_split(X_core,y_core,test_size=0.2,random_state= 1)
y_train, y_val_nn = convert_y(y_train),convert_y(y_val)
X_train, X_val, X_test, y_train, y_val_nn = to_tensor(X_train), to_tensor(X_val), to_tensor(X_test), to_tensor(y_train), to_tensor(y_val_nn)

  0%|          | 0/8117 [00:00<?, ?it/s]

  0%|          | 0/1861 [00:00<?, ?it/s]

In [71]:
torch.save([X_train,X_val,X_test,y_train,y_val_nn],'data_arrays_v0.pt') # embeddings only
np.savez('y_arr_v0.npz',y_val=y_val,y_test=y_test)

#### **[Note] Start from this Cell for Subsequent Runs:**

In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support,balanced_accuracy_score
from gensim.models import Word2Vec,FastText
from nltk.tokenize import MWETokenizer,word_tokenize
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, AdamW
from tqdm.auto import tqdm
import copy
import gc
import os
import time
import warnings

path = r"C:\YZC\NUS\Semester 2\CS4248 Natural Language Processing\Project\scicite"
os.chdir(path)
if torch.cuda.is_available():
    device = 'cuda'
else: device = 'cpu'

In [None]:
X_train,X_val,X_test,y_train,y_val_nn = torch.load('data_arrays_v0.pt')
data = np.load('y_arr_v0.npz')
y_val = data['y_val']
y_test = data['y_test']
vec_size = 300 # hp
seq_length = 100 # max word length based on tokenization, including padding tokens

In [54]:
def check_dist(arr):
    unique, counts = np.unique(arr,return_counts=True)
    return list(zip(unique,counts))
    
def resample_arr(X,y,rng):
    y_0 = y[:,0] == 1 # class 0
    y_non0 =  y[:,0] == 0 # class 1 or 2
    idx_0 = y_0.nonzero().numpy() # list of lists (numpy)
    idx_non0 = y_non0.nonzero() # list of lists (torch)
    idx_sample_0 = rng.choice(idx_0,len(idx_0)//3) # as size of class 0 ~ 4x that of other classes
    idx_sample_0 = torch.from_numpy(idx_sample_0).type(torch.LongTensor)
    idx_sample = torch.cat((idx_sample_0.squeeze(1),idx_non0.squeeze(1))) 
    del y_0,y_non0,idx_0,idx_non0,idx_sample_0
    return X[idx_sample], y[idx_sample]

def train_and_save(model,filename,
                   loss_fn,optimizer,
                   loader,
                   X_val,y_val_nn,y_val,n_epochs=40):
    best_loss = float('inf')
    best_epoch = 0
    for epoch in tqdm(range(n_epochs)):
        model.train()
        rng = np.random.default_rng()

        # Data resampling to offset class imbalance
#         X_sample, y_sample = resample_arr(X_train,y_train,rng)
#         check_dist(y_sample.numpy()[:,0]);check_dist(y_sample.numpy()[:,1]); check_dist(y_sample.numpy()[:,2]) # Troubleshooting
#         loader = DataLoader(TensorDataset(X_sample.to(device),y_sample.to(device)),batch_size=batch_size)

        for X_batch, y_batch in loader:
            y_pred_batch = model(X_batch)
            loss = loss_fn(y_pred_batch,y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        # Evaluation
        model.eval()
        with torch.no_grad():
            y_pred_nn = model(X_val)
            y_pred = torch.argmax(y_pred_nn,1).detach().cpu()
            y_dist = check_dist(y_pred) # Troubleshooting 
            print(y_dist) # Troubleshooting
            bal_accuracy = balanced_accuracy_score(y_val,y_pred)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                precision, recall, fscore, support = precision_recall_fscore_support(y_val,y_pred,average='weighted')
            val_loss = loss_fn(y_pred_nn,y_val_nn)
            tqdm.write(f"Epoch{epoch+1}: val loss={val_loss}, balanced accuracy={bal_accuracy}, precision={precision}, recall={recall}, fscore={fscore}")
            
            # Early Stopping
            if val_loss < best_loss:
                best_loss = val_loss
                best_epoch = epoch + 1
                best_model_weights = copy.deepcopy(model.state_dict())
                patience = 10
            else:
                patience -= 1
                if patience == 0:
                    print(f'Early stopping triggered at epoch {epoch+1}. Best model is from epoch {best_epoch}.')
                    break
        
        # For memory saving
        try:
            del X_sample,y_sample
            gc.collect()
        except:
            True
    torch.save(best_model_weights,filename)
    return model

#### Naive RNN

In [48]:
# See documentation: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

class RNN_base(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN_base,self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size,hidden_size,batch_first=True)
        self.ff = nn.Linear(hidden_size,output_size)
    
    def forward(self,x):
        h0 = torch.zeros(1,x.size(0),self.hidden_size).to(device)
        # 1: number of RNN layers. x.size(0): batch_size.
        _, hidden = self.rnn(x,h0)
        output = self.ff(hidden[-1])
        return output

In [49]:
# For reference
class0 = check_dist(y_train[:,0])[1][1]
class1 = check_dist(y_train[:,1])[1][1]
class2 = check_dist(y_train[:,2])[1][1]
print(class0/len(y_train),class1/len(y_train),class2/len(y_train))

0.5898660095487448 0.2782997073771754 0.13183428307407977


In [67]:
# HPs
hidden_size = 150
batch_size = 2
lr = 0.0004

# Save memory
try:
    del rnn0
    gc.collect()
except:
    True

# # Build and run
# Using Module
rnn0 = RNN_base(input_size=vec_size,hidden_size=hidden_size,output_size=y_train.shape[1]).to(device)

# # Using Sequential -> TBD; for weight-pulling
# class generate_h(nn.Module):
#     def __init__(self,hidden_size):
#         super(generate_h,self).__init__()
#         self.hidden_size = hidden_size
#     def forward(self,x):
#         return torch.zeros(1,x.size(0),self.hidden_size).to(device)
# class generate

# rnn0 = nn.Sequential(
#     generate_h(hidden_size),
#     nn.RNN(input_size,hidden_size,batch_first=True),
#     nn.Linear(hidden_size,output_size)
# )

optimizer = optim.Adam(rnn0.parameters(),lr=lr)
weights = torch.tensor([1, 2, 2.5]).to(device) #hp
loss_fn = nn.CrossEntropyLoss(weight=weights)
loader = DataLoader(TensorDataset(X_train.to(device),y_train.to(device)),batch_size=batch_size)
rnn0 = train_and_save(rnn0,'RNN_v0.pt',
                      loss_fn,optimizer,
                      loader,
                      X_val.to(device),y_val_nn.to(device),y_val)

  0%|          | 0/40 [00:00<?, ?it/s]

[(0, 1624)]
Epoch1: val loss=1.5997633934020996, balanced accuracy=0.3333333333333333, precision=0.34363852556480373, recall=0.5862068965517241, fscore=0.43328335832083953
[(0, 777), (1, 847)]
Epoch2: val loss=1.602116346359253, balanced accuracy=0.34521586412342714, precision=0.43121543866057965, recall=0.43657635467980294, fscore=0.41947899857185345
[(0, 807), (1, 817)]
Epoch3: val loss=1.6013455390930176, balanced accuracy=0.34628295762749545, precision=0.4304754223676248, recall=0.44273399014778325, fscore=0.42452083498060317
[(0, 815), (1, 809)]
Epoch4: val loss=1.6110438108444214, balanced accuracy=0.3436652438753279, precision=0.4260496404030917, recall=0.44027093596059114, fscore=0.4217310852213961
[(0, 809), (1, 814), (2, 1)]
Epoch5: val loss=1.6050264835357666, balanced accuracy=0.34215352807789784, precision=0.4274954661020898, recall=0.43903940886699505, fscore=0.4215638727667801
[(0, 1623), (2, 1)]
Epoch6: val loss=1.6029331684112549, balanced accuracy=0.3333333333333333, 

KeyboardInterrupt: 

#### RNN with Attention

In [68]:
# Reference: https://github.com/mttk/rnn-classifier/blob/master/model.py

# Self attention
class Attention(nn.Module):
    def __init__(self):
        super(Attention,self).__init__()
    
    def forward(self,dec_hidden_state,enc_hidden_states):
        '''
        ('query': hidden from rnn) dec_hidden_states: 1 * batch_size * hidden_size
        ('key'/'value': output from rnn) enc_hidden_state: batch_size * seq_len * hidden_size
        '''
        attn_w = torch.bmm(dec_hidden_state.transpose(0,1),enc_hidden_states.transpose(1,2)) # (B*1*H,B*H*L)
        attn_w = torch.nn.functional.softmax(attn_w.squeeze(1),dim=1) # need to standardize with ^. (B*1*L -> B*L)
        context = torch.bmm(enc_hidden_states.transpose(1,2),attn_w.unsqueeze(2)).squeeze(2) # (B*H*L,B*L*1)
        return torch.cat((context,dec_hidden_state.squeeze(0)),dim=1) # batch_size * (2*hidden_size)
        
class RNN_Attn(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN_Attn,self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size,hidden_size,batch_first=True)
        self.attn = Attention()
        self.dec = nn.Linear(2*hidden_size,output_size) # define context_size
        
    def forward(self,x):
        h0 = torch.zeros(1,x.size(0),self.hidden_size).to(device)
        # 1: number of RNN layers. x.size(0): batch_size.
        output, hidden = self.rnn(x,h0) # hidden: enc_hidden. output: dec_hidden
        c_h = self.attn(hidden,output)
        output = self.dec(c_h)
        return output

In [69]:
# HPs
hidden_size = 70
batch_size = 2 # OOM when batch_size >= 4
lr = 0.0002

# Build and run
try:
    del rnn0
    gc.collect()
except:
    True

rnn_a = RNN_Attn(input_size=vec_size,hidden_size=hidden_size,output_size=y_train.shape[1]).to(device)
optimizer = optim.Adam(rnn_a.parameters(),lr=lr)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train.to(device),y_train.to(device)),batch_size=batch_size)
rnn_a = train_and_save(rnn_a,'RNN_Attn_v0.pt',
                     loss_fn,optimizer,
                     loader,X_val.to(device),y_val_nn.to(device),y_val)

  0%|          | 0/40 [00:00<?, ?it/s]

[(0, 938), (1, 474), (2, 212)]
Epoch1: val loss=0.5653529167175293, balanced accuracy=0.7284039878577694, precision=0.7708903609352612, recall=0.7697044334975369, fscore=0.7698472628980741
[(0, 1012), (1, 428), (2, 184)]
Epoch2: val loss=0.5234354734420776, balanced accuracy=0.7367851180876391, precision=0.7920420431776465, recall=0.7937192118226601, fscore=0.7910819673189762
[(0, 1022), (1, 417), (2, 185)]
Epoch3: val loss=0.5023232102394104, balanced accuracy=0.7470958080201777, precision=0.8014673023109354, recall=0.8029556650246306, fscore=0.800157423128351
[(0, 1019), (1, 405), (2, 200)]
Epoch4: val loss=0.4853055477142334, balanced accuracy=0.758420539933145, precision=0.8029498108731045, recall=0.8048029556650246, fscore=0.8023871071899555
[(0, 1033), (1, 395), (2, 196)]
Epoch5: val loss=0.4759170114994049, balanced accuracy=0.7572427961083422, precision=0.8068594362351644, recall=0.8084975369458128, fscore=0.8055238614792247
[(0, 1029), (1, 392), (2, 203)]
Epoch6: val loss=0.46

In [None]:
'''
(Reference) Getting model weights: 
https://stackoverflow.com/questions/44130851/simple-lstm-in-pytorch-with-sequential-module
https://discuss.pytorch.org/t/how-to-get-all-weights-of-rnn-in-pytorch/33794/2
'''
no_print = True

#### RNN_Attn with Static Features

Process Static Features:

In [72]:
df_train.columns

Index(['source', 'citeEnd', 'sectionName', 'citeStart', 'string', 'label',
       'label_confidence', 'citingPaperId', 'citedPaperId', 'isKeyCitation',
       'id', 'unique_id', 'excerpt_index', 'label2', 'label2_confidence',
       'edited_string', 'tagged_string', 'label_num', 'string_lower',
       'tokens_lower'],
      dtype='object')

In [79]:
'''
Feature Ideas: 
Include sectionName in text
Num: citeStart, citeEnd, excerpt_index, isKeyCitation
OHE: label2
'''
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].to_numpy()

num_cols = ['citeStart','citeEnd','excerpt_index','isKeyCitation']
cat_cols = ['label2']
num_pipeline = Pipeline([
    ('selector',DataFrameSelector(num_cols)),
    ('scaler',StandardScaler())
])
cat_pipeline = Pipeline([
    ('selector',DataFrameSelector(cat_cols)),
    ('ohe',OneHotEncoder())
])
combined_pipeline = FeatureUnion([
    ('num',num_pipeline),
    ('cat',cat_pipeline)
])
feat_train,feat_test = combined_pipeline.fit_transform(df_train),combined_pipeline.fit_transform(df_test)

In [82]:
temp

array(<8117x7 sparse matrix of type '<class 'numpy.float64'>'
	with 40585 stored elements in Compressed Sparse Row format>, dtype=object)

Build and Train Model:

In [68]:
class RNN_Attn_StatFeat(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN_Attn,self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size,hidden_size,batch_first=True)
        self.attn = Attention()
        self.dec = nn.Linear(2*hidden_size,output_size) # define context_size
        
    def forward(self,x):
        h0 = torch.zeros(1,x.size(0),self.hidden_size).to(device)
        # 1: number of RNN layers. x.size(0): batch_size.
        output, hidden = self.rnn(x,h0) # hidden: enc_hidden. output: dec_hidden
        c_h = self.attn(hidden,output)
        output = self.dec(c_h)
        return output

In [None]:
# HPs
hidden_size = 70
batch_size = 2 # OOM when batch_size >= 4
lr = 0.0002

# Build and run
try:
    del rnn0
    gc.collect()
except:
    True

rnn_a = RNN_Attn(input_size=vec_size,hidden_size=hidden_size,output_size=y_train.shape[1]).to(device)
optimizer = optim.Adam(rnn_a.parameters(),lr=lr)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train.to(device),y_train.to(device)),batch_size=batch_size)
rnn_a = train_and_save(rnn_a,'RNN_Attn_v0.pt',
                     loss_fn,optimizer,
                     loader,X_val.to(device),y_val_nn.to(device),y_val)

In [70]:
'''
Actions taken to address problem of assigning all samples to one class:
- Removed outlier samples with extremely high word counts (>100) (worked for Attn. RNN unaffected)
- Resampling (tried for RNN; didn't work)
- Weighted loss (tried for RNN; didn't work)
'''
no_print = True

### BERT

In [None]:
'''
Variables:
- lower case tokeniation
- bert models
- hyperparams
'''

In [None]:
# Init model
batch_size = 32
learning_rate = 1e-5
epochs = 4
max_len1 = 512
seed_val = 28
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

config = BertConfig(
    max_length = max_len1,
    max_position_embeddings = max_len1,
) # Doesn't work?
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True,
                                          config = config
                                         )
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 3
)
model = model.cuda()

In [None]:
# For reference

max_len = 0
for sent in df['edited_string'].to_numpy():
    input_ids = tokenizer.encode(sent,add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

In [None]:
def df_to_torchDS(df):
    input_ids = []
    attention_masks = []
    for row in tqdm(df['edited_string'].to_numpy()):
        encoded_dict = tokenizer.encode_plus(
            row,
            add_special_tokens = True,
            max_length = max_len1,
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        if len(encoded_dict['input_ids'][0]) > 512:
            row_input_ids = encoded_dict['input_ids'][0][:512]
            row_attention_mask = encoded_dict['input_ids'][0][:512]
        else:
            row_input_ids = encoded_dict['input_ids'][0]
            row_attention_mask = encoded_dict['input_ids'][0]
        input_ids.append(row_input_ids)
        attention_masks.append(row_attention_mask)
    input_ids = torch.from_numpy(np.array(input_ids))
    attention_masks = torch.from_numpy(np.array(attention_masks))
    labels = torch.tensor(df['label_num'].to_numpy())
    return input_ids, attention_masks, labels

In [None]:
train_idx, val_idx = train_test_split(np.arange(len(df)),test_size=0.2,random_state=28)
input_ids_core, attention_masks_core, labels_core = df_to_torchDS(df_train)
input_ids_test, attention_masks_test, labels_test = df_to_torchDS(df_test)
val_inputs, val_attention, val_labels = input_ids_core[val_idx], attention_masks_core[val_idx], labels_core[val_idx] 
val_labels = val_labels.numpy()
train_data = TensorDataset(input_ids_core[train_idx],attention_masks_core[train_idx],labels_core[train_idx])
test_data = TensorDataset(input_ids_test,attention_masks_test,labels_test)

train_dataloader = DataLoader(
    train_data,
    sampler = RandomSampler(train_data),
    batch_size = batch_size
)

In [None]:
# Pre-training Init
optimizer = torch.optim.Adam(
    model.parameters(),
    lr = learning_rate,
    eps = 1e-8 #epsilon, to prevent division by zero
)

In [None]:
def train_model(model,optimizer,train_dataloader,val_inputs=None,val_attention=None,val_labels=None):
    start_time = time.perf_counter()
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    for epoch in tqdm(range(0,epochs)):
        # Training
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch_input_ids = batch[0].to(device)
            batch_attention_masks = batch[1].to(device)
            batch_labels = batch[2].to(device)
            result = model(
                batch_input_ids,
                token_type_ids = None, #KIV
                attention_mask = batch_attention_masks,
                labels = batch_labels,
                return_dict = True
            )
            loss = result.loss
            logits = result.logits #KIV
            total_train_loss += loss.item() #KIV 
            loss.backward()
            clip_grad_norm_(model.parameters(),1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader) #KIV
        
        # Evaluation
        model.eval()      
        result = model(
                    val_inputs,
                    token_type_ids = None, #KIV
                    attention_mask = val_attention,
                    labels = val_labels,
                    return_dict = True
                )
        val_loss = result.loss.item()
        val_pred = result.logits.detach.cpu().numpy()
        val_pred = np.argmax(val_pred,axis=1).flatten()
        accuracy = accuracy_score(val_labels,val_pred)
        precision,recall,f1,_ = precision_recall_fscore_support(val_labels,val_pred)
        print(f'Epoch:{epoch}, val_loss:{val_loss}, accuracy:{accuracy}, precision:{precision}, recall:{recall}, f1:{f1}')
    new_path = r'C:\YZC\NUS\Semester 2\CS4248 Natural Language Processing\Project\scicite\bert'
    if not os.path.exists(new_path):
        model.save_pretrained(new_path)     

In [None]:
train_model(model,optimizer,train_dataloader,val_inputs=val_inputs,val_attention=val_attention,val_labels=val_labels)

### Archives

In [None]:
'''
pytorch lightning, lightning fabric
'''
no_print=True

In [None]:
#         # Draft 1
#         total_eval_loss = 0
#         for batch in val_dataloader:
#             batch_input_ids = batch[0].to(device)
#             batch_attention_masks = batch[1].to(device)
#             batch_labels = batch[2].to(device)
#             with torch.no_grad():
#                 result = model(
#                     batch_input_ids,
#                     token_type_ids = None, #KIV
#                     attention_mask = batch_attention_masks,
#                     labels = batch_labels,
#                     return_dict = True
#                 )
#             loss = result.loss
#             logits = result.logits #KIV
#             total_val_loss += loss.item()
#             logits = logits.detach().cpu().numpy() #KIV
#             labels = batch_labels.to(device).numpy()
        
#         avg_val_loss = total_val_loss / len(val_dataloader)