In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.nn.utils import clip_grad_norm_
import transformers
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification
from tqdm.auto import tqdm
import os
import time

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
# Load the SciCite train/test splits (JSON Lines: one record per line).
train_path = '/home/jupyter/CS4248-Project/data/scicite/train.jsonl'
test_path = '/home/jupyter/CS4248-Project/data/scicite/test.jsonl'
df = pd.read_json(train_path, lines=True)
df_test = pd.read_json(test_path, lines=True)
# Preview the first rows of the training split.
df.head()

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,label_confidence,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label2_confidence
0,explicit,175.0,Introduction,168.0,"However, how frataxin interacts with the Fe-S ...",background,1.0,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,True,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,11,,
1,explicit,36.0,Novel Quantitative Trait Loci for Seminal Root...,16.0,"In the study by Hickey et al. (2012), spikes w...",background,1.0,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,True,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,2,,
2,explicit,228.0,Introduction,225.0,"The drug also reduces catecholamine secretion,...",background,1.0,9cdf605beb1aa1078f235c4332b3024daa8b31dc,4e6a17fb8d7a3cada601d942e22eb5da6d01adbd,False,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,0,,
3,explicit,110.0,Discussion,46.0,By clustering with lowly aggressive close kin ...,background,1.0,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,False,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,3,,
4,explicit,239.0,Discussion,234.0,Ophthalmic symptoms are rare manifestations of...,background,1.0,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,False,88b86556857f4374842d2af2e359576806239175>a5bb0...,88b86556857f4374842d2af2e359576806239175>a5bb0...,2,,


In [3]:
# Open questions about individual dataset fields, kept as a bare string literal.
# It is not the last statement in the cell, so nothing is displayed for it.
'''
Questions:
excerpt_index?
citeStart, citeEnd?
label2 (supportiveness)?
isKeyCitation?
'''
# NOTE(review): `no_print` is not read by any cell visible here — presumably a
# debug-output switch; confirm it is still needed.
no_print = True

### Preprocessing

In [4]:
def null_test(frame=None):
    """Print the names of the columns that contain at least one null value.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used,
        which is what the original zero-argument call did.

    Returns
    -------
    None
        The list of column names is printed, not returned.
    """
    if frame is None:
        # Fall back to the global training frame so existing `null_test()` calls keep working.
        frame = df
    null_cols = [col for col in frame.columns if frame[col].isnull().any()]
    print(null_cols)

#### Processing functions

In [5]:
def convert_label(value):
    """Map a citation-intent label string to its integer class id.

    Parameters
    ----------
    value : str
        One of ``'background'``, ``'method'`` or ``'result'``.

    Returns
    -------
    int
        ``0`` for background, ``1`` for method, ``2`` for result.

    Raises
    ------
    ValueError
        If ``value`` is not one of the three known labels (including missing
        values). The original if/elif chain left ``label`` unassigned in that
        case and failed with an opaque ``UnboundLocalError``.
    """
    label_ids = {'background': 0, 'method': 1, 'result': 2}
    try:
        return label_ids[value]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs; both cases mean "not a known label".
        raise ValueError(
            f'Unknown label {value!r}; expected one of {sorted(label_ids)}'
        ) from None

def NA_impute(df):
    """Fill missing values and drop the ``label2_confidence`` column if present.

    The input frame is copied first, so the caller's frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label2``, ``label_confidence``, ``citeStart``,
        ``citeEnd``, ``source`` and ``sectionName``.

    Returns
    -------
    (pandas.DataFrame, str)
        The imputed copy, and ``'train'`` when the input carried a
        ``label2_confidence`` column (it is dropped) or ``'test'`` otherwise.

    Notes
    -----
    The previous version probed for the column with a statement that always
    raised inside a bare ``except``, so every frame was reported as ``'test'``
    and the column was never dropped.
    """
    df = df.copy()
    df['label2'] = df['label2'].fillna('cant_determine')
    if 'label2_confidence' in df.columns:
        df = df.drop(columns='label2_confidence')
        df_type = 'train'
    else:
        df_type = 'test'
    df['label_confidence'] = df['label_confidence'].fillna(df['label_confidence'].mean())
    for col in ('citeStart', 'citeEnd'):
        col_mean = df[col].mean()
        # The mean is NaN for an empty or all-null column; there is nothing sensible to
        # impute then, so the column is left as is instead of being filled with a garbage integer.
        if pd.notna(col_mean):
            df[col] = df[col].fillna(int(col_mean))
    df['source'] = df['source'].fillna('unknown')
    df['sectionName'] = df['sectionName'].fillna('unknown')
    return df, df_type

def process_data(df):
    """Impute ``df`` and add the ``edited_string`` and ``label_num`` columns.

    ``edited_string`` is every metadata feature rendered as ``"<name>: <value>[SEP]"``
    followed by the raw citation text from ``string``. ``label_num`` is the integer
    class id produced by ``convert_label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A processed copy of ``df``.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If ``citeStart``/``citeEnd`` still contain nulls after imputation, or a
        label is not recognised by ``convert_label``.
    """
    df, _ = NA_impute(df)
    for col in ['citeStart','citeEnd']:
        df[col] = df[col].astype('int64')
    # 'label2_confidence' is deliberately absent: NA_impute removes that column, and the
    # previous code also always excluded it from the string, so the model input is unchanged.
    feature_cols = ['source', 'citeEnd', 'sectionName', 'citeStart', 'label_confidence', 'citingPaperId', 'citedPaperId', 'isKeyCitation', 'excerpt_index', 'label2']
    df['edited_string'] = ''
    for col in feature_cols:
        df['edited_string'] += col + ': ' + df[col].astype(str) + '[SEP]'
    df['edited_string'] += df['string']
    df['label_num'] = df['label'].apply(convert_label)
    return df

In [6]:
# Run the preprocessing on both splits; the results carry the extra
# `edited_string` and `label_num` columns consumed by the tokenisation step.
df_train = process_data(df)
# Note: this rebinds `df_test` from the raw frame to its processed version.
df_test = process_data(df_test)

### Model

In [7]:
'''
Variables:
- lower case tokenization
- bert models
- hyperparams
'''

'\nVariables:\n- lower case tokenization\n- bert models\n- hyperparams\n'

In [8]:
# Init model
# Hyperparameters and reproducibility settings for the fine-tuning run.
batch_size = 16
learning_rate = 1e-5
epochs = 4
max_len1 = 512  # maximum sequence length (in tokens) used when encoding each row
seed_val = 28
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# NOTE(review): this config is only handed to the tokenizer below; the model is loaded
# without it. The original author flagged it with "Doesn't work?" — confirm whether it
# has any effect or can be removed.
config = BertConfig(
    max_length = max_len1,
    max_position_embeddings = max_len1,
)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True,
                                          config = config
                                         )
# Three output classes: background / method / result.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 3
)
# Use the device chosen in the first cell; an unconditional `.cuda()` raises on
# machines without CUDA even though `device` already falls back to 'cpu'.
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# # For reference

# max_len = 0
# for sent in df['edited_string'].to_numpy():
#     input_ids = tokenizer.encode(sent,add_special_tokens=True)
#     max_len = max(max_len, len(input_ids))

In [10]:
def df_to_torchDS(df):
    """Tokenise ``df['edited_string']`` and return tensors ready for BERT.

    Uses the notebook-level ``tokenizer`` and ``max_len1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``edited_string`` (text to encode) and ``label_num``
        (integer class ids).

    Returns
    -------
    (torch.Tensor, torch.Tensor, torch.Tensor)
        ``input_ids`` and ``attention_masks``, each with one row of ``max_len1``
        entries per input row, and ``labels`` with one entry per input row.

    Notes
    -----
    Fixes relative to the previous version:
    * the attention mask is now the tokenizer's ``attention_mask``; it used to be
      a second copy of ``input_ids``;
    * ``truncation=True`` lets the tokenizer shorten over-long rows itself, instead
      of slicing the encoded ids at a hard-coded 512 afterwards (which cut off the
      closing special token);
    * per-row tensors are concatenated with ``torch.cat`` rather than round-tripped
      through ``np.array``.
    """
    input_ids = []
    attention_masks = []
    for row in tqdm(df['edited_string'].to_numpy()):
        encoded_dict = tokenizer.encode_plus(
            row,
            add_special_tokens = True,
            max_length = max_len1,
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        # With return_tensors='pt' each entry has a leading batch dimension of 1.
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    if input_ids:
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
    else:
        # torch.cat rejects an empty list; return correctly shaped empty tensors instead.
        input_ids = torch.empty((0, max_len1), dtype=torch.long)
        attention_masks = torch.empty((0, max_len1), dtype=torch.long)
    labels = torch.tensor(df['label_num'].to_numpy())
    return input_ids, attention_masks, labels

In [11]:
# Tokenise both splits once; everything below indexes into these tensors.
input_ids_core, attention_masks_core, labels_core = df_to_torchDS(df_train)
input_ids_test, attention_masks_test, labels_test = df_to_torchDS(df_test)

# Hold out 20% of the training rows for validation. The indices are drawn over the
# tensors that are actually indexed (previously they were sized from the raw `df`,
# which only matched by coincidence) and reuse `seed_val` instead of repeating 28.
train_idx, val_idx = train_test_split(np.arange(len(labels_core)),test_size=0.2,random_state=seed_val)
val_inputs, val_attention, val_labels = input_ids_core[val_idx], attention_masks_core[val_idx], labels_core[val_idx] 
# Validation labels are kept as a numpy array for the sklearn metric functions.
val_labels = val_labels.numpy()
train_data = TensorDataset(input_ids_core[train_idx],attention_masks_core[train_idx],labels_core[train_idx])
test_data = TensorDataset(input_ids_test,attention_masks_test,labels_test)

# Shuffle the training examples each epoch.
train_dataloader = DataLoader(
    train_data,
    sampler = RandomSampler(train_data),
    batch_size = batch_size
)

  0%|          | 0/8243 [00:00<?, ?it/s]

  0%|          | 0/1861 [00:00<?, ?it/s]

In [12]:
# Pre-training Init
optimizer = torch.optim.Adam(
    model.parameters(),
    lr = learning_rate,
    eps = 1e-8 #epsilon, to prevent division by zero
)

In [14]:
def train_model(model,optimizer,train_dataloader,val_inputs=None,val_attention=None,val_labels=None,val_batch_size=32):
    """Fine-tune ``model`` and report validation metrics after every epoch.

    The number of epochs and the target device are read from the notebook-level
    ``epochs`` and ``device`` variables.

    Parameters
    ----------
    model : sequence-classification model called as
        ``model(input_ids, token_type_ids=..., attention_mask=..., labels=..., return_dict=True)``.
    optimizer : torch.optim.Optimizer
        Optimizer over ``model``'s parameters.
    train_dataloader : iterable of ``(input_ids, attention_mask, labels)`` batches.
    val_inputs, val_attention : torch.Tensor, optional
        Validation input ids and attention masks.
    val_labels : array-like or torch.Tensor, optional
        Validation class ids. Validation is skipped unless all three validation
        arguments are given and non-empty.
    val_batch_size : int, default 32
        Number of validation rows per forward pass.

    Notes
    -----
    Fixes relative to the previous version:
    * gradients are cleared before every step (they used to accumulate across
      steps because ``zero_grad`` was never called);
    * validation runs in mini-batches under ``torch.no_grad()`` instead of one
      forward pass over the whole set with autograd enabled;
    * validation labels are sent to the model as a tensor on ``device``
      (a numpy array was passed before);
    * ``logits.detach`` was referenced without being called;
    * the function no longer fails when the validation arguments are left at ``None``;
    * a message is printed when the save directory already exists and saving is skipped.
    """
    start_time = time.perf_counter()
    # Only needed by the (currently disabled) linear warm-up scheduler below.
    total_steps = len(train_dataloader) * epochs
    # scheduler = transformers.get_linear_schedule_with_warmup(
    #     optimizer,
    #     num_warmup_steps=0,
    #     num_training_steps=total_steps
    # )

    run_validation = (
        val_inputs is not None
        and val_attention is not None
        and val_labels is not None
        and len(val_labels) > 0
    )
    if run_validation:
        # Accept numpy arrays as well as tensors; keep a numpy copy for sklearn.
        val_labels_t = torch.as_tensor(val_labels, dtype=torch.long)
        val_labels_np = val_labels_t.cpu().numpy()

    for epoch in tqdm(range(0,epochs)):
        # Training
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch_input_ids = batch[0].to(device)
            batch_attention_masks = batch[1].to(device)
            batch_labels = batch[2].to(device)
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            result = model(
                batch_input_ids,
                token_type_ids = None, #KIV
                attention_mask = batch_attention_masks,
                labels = batch_labels,
                return_dict = True
            )
            loss = result.loss
            loss.backward()
            # Cap the gradient norm at 1.0 before the update.
            clip_grad_norm_(model.parameters(),1.0)
            optimizer.step()
            # scheduler.step()
            if step % 100 == 0:
                print(f'Step {step}, time elapsed:{time.perf_counter()-start_time}')

        if not run_validation:
            continue

        # Evaluation
        model.eval()
        total_val_loss = 0.0
        pred_chunks = []
        with torch.no_grad():
            for start in range(0, len(val_labels_t), val_batch_size):
                stop = start + val_batch_size
                chunk_labels = val_labels_t[start:stop].to(device)
                result = model(
                    val_inputs[start:stop].to(device),
                    token_type_ids = None, #KIV
                    attention_mask = val_attention[start:stop].to(device),
                    labels = chunk_labels,
                    return_dict = True
                )
                # Weight each chunk's mean loss by its size so the total is a per-row mean.
                total_val_loss += result.loss.item() * chunk_labels.size(0)
                pred_chunks.append(result.logits.argmax(dim=1).cpu())
        val_loss = total_val_loss / len(val_labels_t)
        val_pred = torch.cat(pred_chunks).numpy()
        accuracy = accuracy_score(val_labels_np,val_pred)
        precision,recall,f1,_ = precision_recall_fscore_support(val_labels_np,val_pred)
        print(f'Epoch:{epoch}, val_loss:{val_loss}, accuracy:{accuracy}, precision:{precision}, recall:{recall}, f1:{f1}')

    new_path = r'/home/jupyter/CS4248-Project/trained_models'
    if not os.path.exists(new_path):
        model.save_pretrained(new_path)
    else:
        # Keep the no-overwrite behaviour, but do not discard the result silently.
        print(f'{new_path} already exists; trained model was NOT saved to avoid overwriting it')

In [15]:
# Fine-tune on the training split, passing the held-out 20% split for per-epoch validation.
train_model(model,optimizer,train_dataloader,val_inputs=val_inputs,val_attention=val_attention,val_labels=val_labels)

  0%|          | 0/4 [00:00<?, ?it/s]

Step 0, time elapsed:1.7209734150001168
Step 100, time elapsed:125.8626514790003


KeyboardInterrupt: 

### Archives

In [26]:
#         # Draft 1
#         total_eval_loss = 0
#         for batch in val_dataloader:
#             batch_input_ids = batch[0].to(device)
#             batch_attention_masks = batch[1].to(device)
#             batch_labels = batch[2].to(device)
#             with torch.no_grad():
#                 result = model(
#                     batch_input_ids,
#                     token_type_ids = None, #KIV
#                     attention_mask = batch_attention_masks,
#                     labels = batch_labels,
#                     return_dict = True
#                 )
#             loss = result.loss
#             logits = result.logits #KIV
#             total_val_loss += loss.item()
#             logits = logits.detach().cpu().numpy() #KIV
#             labels = batch_labels.to(device).numpy()
        
#         avg_val_loss = total_val_loss / len(val_dataloader)