In [151]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support,balanced_accuracy_score
from gensim.models import Word2Vec,FastText
from nltk.tokenize import MWETokenizer,word_tokenize
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, AdamW
from tqdm.auto import tqdm
import copy
import os
import time
import warnings

path = r"C:\YZC\NUS\Semester 2\CS4248 Natural Language Processing\Project\scicite"
os.chdir(path)
if torch.cuda.is_available():
    device = 'cuda'
else: device = 'cpu'

In [23]:
df = pd.read_json('train.jsonl',lines=True)
df_test = pd.read_json('test.jsonl',lines=True)
df.head()

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,label_confidence,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label2_confidence
0,explicit,175.0,Introduction,168.0,"However, how frataxin interacts with the Fe-S ...",background,1.0,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,True,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,11,,
1,explicit,36.0,Novel Quantitative Trait Loci for Seminal Root...,16.0,"In the study by Hickey et al. (2012), spikes w...",background,1.0,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,True,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,2,,
2,explicit,228.0,Introduction,225.0,"The drug also reduces catecholamine secretion,...",background,1.0,9cdf605beb1aa1078f235c4332b3024daa8b31dc,4e6a17fb8d7a3cada601d942e22eb5da6d01adbd,False,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,0,,
3,explicit,110.0,Discussion,46.0,By clustering with lowly aggressive close kin ...,background,1.0,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,False,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,3,,
4,explicit,239.0,Discussion,234.0,Ophthalmic symptoms are rare manifestations of...,background,1.0,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,False,88b86556857f4374842d2af2e359576806239175>a5bb0...,88b86556857f4374842d2af2e359576806239175>a5bb0...,2,,


In [5]:
'''
Questions:
excerpt_index?
citeStart, citeEnd?
label2 (supportiveness)?
isKeyCitation?
'''
no_print = True

### Preprocessing

In [3]:
def null_test():
    null_cols = []
    for col in df.columns:
        if df[col].isnull().any(): null_cols.append(col)
    print(null_cols)

#### Processing functions

In [61]:
tk = MWETokenizer([('<','bos','>'),('<','eos','>')],separator='')
# tk.add_mwe([('<','bos','>'),('<','eos','>')])

def convert_label(value):
    if value == 'background':
        label = 0
    elif value == 'method':
        label = 1
    elif value == 'result':
        label = 2
    return label

def NA_impute(df):
    df['label2'] = df['label2'].fillna('cant_determine')
    try:
        print(len(df).df.columns)
        df = df.drop(columns='label2_confidence',axis=1)
        df_type = 'train'
    except:
        df_type = 'test'
    df['label_confidence'] = df['label_confidence'].fillna(df['label_confidence'].mean())
    df['citeStart'] = df['citeStart'].fillna(df['citeStart'].mean().astype(np.int64))
    df['citeEnd'] = df['citeEnd'].fillna(df['citeEnd'].mean().astype(np.int64))
    df['source'] = df['source'].fillna('unknown')
    df['sectionName'] = df['sectionName'].fillna('unknown')
    return df, df_type

def process_df(df):
    df, df_type = NA_impute(df)
    for col in ['citeStart','citeEnd']:
        df[col] = df[col].astype('int64')
    feature_cols = ['source', 'citeEnd', 'sectionName', 'citeStart', 'label_confidence', 'citingPaperId', 'citedPaperId', 'isKeyCitation', 'excerpt_index', 'label2', 'label2_confidence']
    if df_type == 'test':
        feature_cols.remove('label2_confidence')
    df['edited_string'] = ''
    for col in feature_cols:
        df['edited_string'] += col + ': ' + df[col].astype(str) + '[SEP]'
    df['edited_string'] += df['string']
    df['tagged_string'] = '<BOS>' + df['string'] + '<EOS>'
    df['label_num'] = df['label'].apply(lambda x: convert_label(x))
    df['string_lower'] = df['tagged_string'].apply(lambda x: x.lower())
    df['tokens_lower'] = df['string_lower'].apply(lambda x: tk.tokenize(word_tokenize(x)))  
    return df

In [62]:
df_train = process_df(df)
df_test = process_df(df_test)

In [63]:
df_train.head()

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,label_confidence,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label2_confidence,edited_string,tagged_string,label_num,string_lower,tokens_lower
0,explicit,175,Introduction,168,"However, how frataxin interacts with the Fe-S ...",background,1.0,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,True,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,11,cant_determine,,source: explicit[SEP]citeEnd: 175[SEP]sectionN...,"<BOS>However, how frataxin interacts with the ...",0,"<bos>however, how frataxin interacts with the ...","[<bos>, however, ,, how, frataxin, interacts, ..."
1,explicit,36,Novel Quantitative Trait Loci for Seminal Root...,16,"In the study by Hickey et al. (2012), spikes w...",background,1.0,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,True,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,2,cant_determine,,source: explicit[SEP]citeEnd: 36[SEP]sectionNa...,"<BOS>In the study by Hickey et al. (2012), spi...",0,"<bos>in the study by hickey et al. (2012), spi...","[<bos>, in, the, study, by, hickey, et, al, .,..."
2,explicit,228,Introduction,225,"The drug also reduces catecholamine secretion,...",background,1.0,9cdf605beb1aa1078f235c4332b3024daa8b31dc,4e6a17fb8d7a3cada601d942e22eb5da6d01adbd,False,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,0,cant_determine,,source: explicit[SEP]citeEnd: 228[SEP]sectionN...,<BOS>The drug also reduces catecholamine secre...,0,<bos>the drug also reduces catecholamine secre...,"[<bos>, the, drug, also, reduces, catecholamin..."
3,explicit,110,Discussion,46,By clustering with lowly aggressive close kin ...,background,1.0,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,False,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,3,cant_determine,,source: explicit[SEP]citeEnd: 110[SEP]sectionN...,<BOS>By clustering with lowly aggressive close...,0,<bos>by clustering with lowly aggressive close...,"[<bos>, by, clustering, with, lowly, aggressiv..."
4,explicit,239,Discussion,234,Ophthalmic symptoms are rare manifestations of...,background,1.0,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,False,88b86556857f4374842d2af2e359576806239175>a5bb0...,88b86556857f4374842d2af2e359576806239175>a5bb0...,2,cant_determine,,source: explicit[SEP]citeEnd: 239[SEP]sectionN...,<BOS>Ophthalmic symptoms are rare manifestatio...,0,<bos>ophthalmic symptoms are rare manifestatio...,"[<bos>, ophthalmic, symptoms, are, rare, manif..."


### RNN

In [None]:
'''
Hypotheses:
Word embeddings
Combine word embeddings with features
Attention mechanism

Open questions
- Where can we get more contextual information? How to incorporate?
- variables:
    - lower case? stopwords? lemmatization? any words to include?
'''
no_print = True

In [None]:
'''
Generate word_embeddings (ok)
Convert labels (ok)
Generate other features?
'''

#### Preparing Arrays

In [136]:
'''
input array (batch_first = True): N (batch size) * L (sequence length) * H_in (input size)
'''
def generate_X(df,token_col,model,seq_len,vec_size):
    '''
    token_col: col for tokens in dataframe
    model: word vectorization model
    '''
    X = []
    for i in tqdm(range(len(df))):
        i_arr = [model.wv[token] for token in df.at[i,token_col]]
        if len(i_arr) < seq_len:
            while len(i_arr) < seq_len:
                i_arr.append(np.zeros(vec_size))
        X.append(i_arr)
    return np.array(X)

def convert_y(y): # for training and validation labels. For y_val, keep one multi-target and one single-target version
    y_nn = np.zeros([y.shape[0],3])
    for i in range(len(y)):
        if y[i] == 0:
            y_nn[i,0] = 1
        elif y[i] == 1:
            y_nn[i,1] = 1
        elif y[i] == 2: 
            y_nn[i,2] = 1
    return y_nn

def to_tensor(arr,dtype='Float'):
    if dtype == "Float":
        arr = torch.from_numpy(arr).type(torch.FloatTensor)
    elif dtype == "Long":
        arr = torch.from_numpy(arr).type(torch.LongTensor)
    return arr

In [None]:
corpus = df_train['tokens_lower'].tolist()
vec_size = 100 # hp
seq_length = 591 # max word length based on tokenization, including padding tokens
ft = FastText(corpus,vector_size=vec_size,epochs=10)

In [143]:
X_core = generate_X(df_train,'tokens_lower',
                   ft,seq_length,vec_size)
X_test = generate_X(df_test,'tokens_lower',
                   ft,seq_length,vec_size)
y_core = df_train['label_num'].to_numpy()
X_train, X_val, y_train, y_val = train_test_split(X_core,y_core,test_size=0.2,random_state= 1)
y_train, y_val_nn = convert_y(y_train),convert_y(y_val)
X_train, X_val, X_test, y_train, y_val_nn = to_tensor(X_train), to_tensor(X_val), to_tensor(X_test), to_tensor(y_train), to_tensor(y_val_nn)

  0%|          | 0/8243 [00:00<?, ?it/s]

  0%|          | 0/1861 [00:00<?, ?it/s]

In [152]:
def check_dist(arr):
    unique, counts = np.unique(arr,return_counts=True)
    print(list(zip(unique,counts)))

def train_and_save(model,filename,
                     loss_fn,optimizer,
                     dataloader,X_val,y_val_nn,y_val,n_epochs=40):
    best_loss = float('inf')
    for epoch in tqdm(range(n_epochs)):
        model.train()
        for X_batch, y_batch in loader:
            y_pred_batch = model(X_batch)
            loss = loss_fn(y_pred_batch,y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        model.eval()
        with torch.no_grad():
            y_pred_nn = model(X_val)
            y_pred = torch.argmax(y_pred_nn,1)
            check_dist (y_pred) # Troubleshooting
            bal_accuracy = balanced_accuracy_score(y_val,y_pred)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                precision, recall, fscore, support = precision_recall_fscore_support(y_val,y_pred,average='weighted')
            val_loss = loss_fn(y_pred_nn,y_val_nn)
            tqdm.write(f"Epoch{epoch+1}: val loss={val_loss}, balanced accuracy={bal_accuracy}, precision={precision}, recall={recall}, fscore={fscore}")
            
            # Early Stopping
            if val_loss < best_loss:
                best_loss = val_loss
                best_model_weights = copy.deepcopy(model.state_dict())
                patience = 10
            else:
                patience -= 1
                if patience == 0:
                    break
    torch.save(best_model_weights,file_name)
    return model

#### Naive RNN

In [149]:
# See documentation: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

class RNN_base(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN_base,self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size,hidden_size,batch_first=True)
        self.ff = nn.Linear(hidden_size,output_size)
    
    def forward(self,x):
        h0 = torch.zeros(1,x.size(0),self.hidden_size)
        # 1: number of RNN layers. x.size(0): batch_size.
        _, hidden = self.rnn(x,h0)
        output = self.ff(hidden[-1])
        return output

In [150]:
# HPs
hidden_size = 80
batch_size = 2
lr = 0.004

## Note:
    # Arrays should be tensors
    # Do not unsqueeze arrays

# Build and run
rnn0 = RNN_base(input_size=vec_size,hidden_size = 80,output_size=y_train.shape[1])
optimizer = optim.Adam(rnn0.parameters(),lr=0.004)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(TensorDataset(X_train,y_train),batch_size=2)
rnn0 = train_and_save(rnn0,'RNN_v0.pt',
                     loss_fn,optimizer,
                     loader,X_val,y_val_nn,y_val)

  0%|          | 0/40 [00:00<?, ?it/s]

[(0, 1281), (1, 368)]
Epoch1: val loss=1.0422667264938354, balanced accuracy=0.34137852149708053, precision=0.41522012983280027, recall=0.5154639175257731, fscore=0.45561506654752876


  _warn_prf(average, modifier, msg_start, len(result))


[(0, 1463), (1, 186)]
Epoch2: val loss=1.042863130569458, balanced accuracy=0.3320060308204403, precision=0.40595627745786156, recall=0.5372953305033353, fscore=0.44248688213572945


  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

#### RNN with Attention

### BERT

In [11]:
'''
Variables:
- lower case tokeniation
- bert models
- hyperparams
'''

'\nVariables:\n- lower case tokeniation\n- bert models\n- hyperparams\n'

In [41]:
# Init model
batch_size = 32
learning_rate = 1e-5
epochs = 4
max_len1 = 512
seed_val = 28
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

config = BertConfig(
    max_length = max_len1,
    max_position_embeddings = max_len1,
) # Doesn't work?
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True,
                                          config = config
                                         )
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 3
)
model = model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# For reference

max_len = 0
for sent in df['edited_string'].to_numpy():
    input_ids = tokenizer.encode(sent,add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

Token indices sequence length is longer than the specified maximum sequence length for this model (648 > 512). Running this sequence through the model will result in indexing errors


In [48]:
def df_to_torchDS(df):
    input_ids = []
    attention_masks = []
    for row in tqdm(df['edited_string'].to_numpy()):
        encoded_dict = tokenizer.encode_plus(
            row,
            add_special_tokens = True,
            max_length = max_len1,
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        if len(encoded_dict['input_ids'][0]) > 512:
            row_input_ids = encoded_dict['input_ids'][0][:512]
            row_attention_mask = encoded_dict['input_ids'][0][:512]
        else:
            row_input_ids = encoded_dict['input_ids'][0]
            row_attention_mask = encoded_dict['input_ids'][0]
        input_ids.append(row_input_ids)
        attention_masks.append(row_attention_mask)
    input_ids = torch.from_numpy(np.array(input_ids))
    attention_masks = torch.from_numpy(np.array(attention_masks))
    labels = torch.tensor(df['label_num'].to_numpy())
    return input_ids, attention_masks, labels

In [49]:
train_idx, val_idx = train_test_split(np.arange(len(df)),test_size=0.2,random_state=28)
input_ids_core, attention_masks_core, labels_core = df_to_torchDS(df_train)
input_ids_test, attention_masks_test, labels_test = df_to_torchDS(df_test)
val_inputs, val_attention, val_labels = input_ids_core[val_idx], attention_masks_core[val_idx], labels_core[val_idx] 
val_labels = val_labels.numpy()
train_data = TensorDataset(input_ids_core[train_idx],attention_masks_core[train_idx],labels_core[train_idx])
test_data = TensorDataset(input_ids_test,attention_masks_test,labels_test)

train_dataloader = DataLoader(
    train_data,
    sampler = RandomSampler(train_data),
    batch_size = batch_size
)

  0%|          | 0/8243 [00:00<?, ?it/s]

  0%|          | 0/1861 [00:00<?, ?it/s]

In [21]:
# Pre-training Init
optimizer = torch.optim.Adam(
    model.parameters(),
    lr = learning_rate,
    eps = 1e-8 #epsilon, to prevent division by zero
)



In [51]:
def train_model(model,optimizer,train_dataloader,val_inputs=None,val_attention=None,val_labels=None):
    start_time = time.perf_counter()
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    for epoch in tqdm(range(0,epochs)):
        # Training
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch_input_ids = batch[0].to(device)
            batch_attention_masks = batch[1].to(device)
            batch_labels = batch[2].to(device)
            result = model(
                batch_input_ids,
                token_type_ids = None, #KIV
                attention_mask = batch_attention_masks,
                labels = batch_labels,
                return_dict = True
            )
            loss = result.loss
            logits = result.logits #KIV
            total_train_loss += loss.item() #KIV 
            loss.backward()
            clip_grad_norm_(model.parameters(),1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader) #KIV
        
        # Evaluation
        model.eval()      
        result = model(
                    val_inputs,
                    token_type_ids = None, #KIV
                    attention_mask = val_attention,
                    labels = val_labels,
                    return_dict = True
                )
        val_loss = result.loss.item()
        val_pred = result.logits.detach.cpu().numpy()
        val_pred = np.argmax(val_pred,axis=1).flatten()
        accuracy = accuracy_score(val_labels,val_pred)
        precision,recall,f1,_ = precision_recall_fscore_support(val_labels,val_pred)
        print(f'Epoch:{epoch}, val_loss:{val_loss}, accuracy:{accuracy}, precision:{precision}, recall:{recall}, f1:{f1}')
    new_path = r'C:\YZC\NUS\Semester 2\CS4248 Natural Language Processing\Project\scicite\bert'
    if not os.path.exists(new_path):
        model.save_pretrained(new_path)     

In [52]:
train_model(model,optimizer,train_dataloader,val_inputs=val_inputs,val_attention=val_attention,val_labels=val_labels)

  0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.47 GiB is allocated by PyTorch, and 100.40 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Archives

In [26]:
#         # Draft 1
#         total_eval_loss = 0
#         for batch in val_dataloader:
#             batch_input_ids = batch[0].to(device)
#             batch_attention_masks = batch[1].to(device)
#             batch_labels = batch[2].to(device)
#             with torch.no_grad():
#                 result = model(
#                     batch_input_ids,
#                     token_type_ids = None, #KIV
#                     attention_mask = batch_attention_masks,
#                     labels = batch_labels,
#                     return_dict = True
#                 )
#             loss = result.loss
#             logits = result.logits #KIV
#             total_val_loss += loss.item()
#             logits = logits.detach().cpu().numpy() #KIV
#             labels = batch_labels.to(device).numpy()
        
#         avg_val_loss = total_val_loss / len(val_dataloader)