https://colab.research.google.com/drive/1Y4o3jh3ZH70tl6mCd76vz_IxX23biCPP#scrollTo=JhUZO9vc_l6T

In [0]:
# Mount Google Drive at /content/drive; the paths configured below point
# into this mount.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Install the json-lines and transformers packages into the runtime.
# NOTE(review): versions are not pinned, so a fresh runtime may pull a release
# whose API differs from what the cells below use — consider pinning.
!pip install json-lines
!pip install transformers

# Colab magic selecting the TensorFlow 1.x runtime.
%tensorflow_version 1.x

In [0]:
#@title Config
"""
Configuration file for the project.
"""
import os

# Base directory on the mounted Google Drive. The trailing slash is kept on
# purpose: other cells build paths by plain concatenation (PWD + name).
PWD = '/content/drive/My Drive/KY, FYP/Code/'

# Directory for the word embeddings. os.path.join avoids the doubled '//'
# that concatenating PWD (which ends in '/') with '/glove.6B' produced.
GLOVE_DIR = os.path.join(PWD, 'glove.6B')

# Directory for storing citation function data.
DATA_DIR = os.path.join(PWD, 'data/data')

# Data files: the citation and provenance dataset, as paths relative to
# DATA_DIR. 'mtl' refers to the aligned dataset.
DATA_FILES = {
    'func': {
        'golden_train': 'processed/golden_train.func.json',
        'golden_test': 'processed/golden_test.func.json',
    },
    'scicite': {
        'train': 'scicite/train.jsonl',
        'test': 'scicite/test.jsonl',
        'dev': 'scicite/dev.jsonl'
    },
    'acl-arc': {
        'train': 'acl-arc/train.jsonl',
        'test': 'acl-arc/test.jsonl',
        'dev': 'acl-arc/dev.jsonl'
    },
    'prov': {
        'golden_train': 'processed/golden_train.prov.json',
        'golden_test': 'processed/golden_test.prov.json',
    },
    'mtl': {
        'golden_train': 'processed/golden_train.mtl.json',
        'golden_test': 'processed/golden_test.mtl.json'
    }
}


In [0]:
#@title Data
"""
Common data operations.
"""
import json
import os
import json_lines

import numpy as np

def read_json_data(filename):
    """
    Load a JSON document stored under DATA_DIR.

    `filename` is joined onto DATA_DIR, the file is opened in binary mode and
    the parsed content is returned.
    """
    full_path = os.path.join(DATA_DIR, filename)
    with open(full_path, 'rb') as handle:
        parsed = json.load(handle)
    return parsed

def read_jsonl_data(filename):
    """
    Read the given JSONL file from DATA_DIR.

    `filename` is joined onto DATA_DIR and the file is opened in binary mode.
    Returns a list holding every item yielded by `json_lines.reader`, in the
    order the reader produced them.
    """
    path = os.path.join(DATA_DIR, filename)
    with open(path, 'rb') as fp:
        # The previous version also printed type(content) on every call,
        # which was leftover debugging output.
        return list(json_lines.reader(fp))

"""
Custom cross validation.
"""


def compress_y(ys):
    """
    Collapse one-hot label vectors into integer class indices.

    If the first element of `ys` is a numpy array, every element is treated as
    a vector of the form [0 0 ... 1 ... 0] and replaced by the position of its
    first 1. Otherwise, or when `ys` is empty, `ys` is returned unchanged.

    In the one-hot case a list is returned (not a lazy `map` object), so the
    result supports len(), indexing and repeated iteration.

    Raises ValueError if a vector contains no 1.
    """
    if len(ys) < 1:
        return ys

    if isinstance(ys[0], np.ndarray):
        return [y.tolist().index(1) for y in ys]
    return ys


In [0]:
import tensorflow as tf

# Ask TensorFlow for the GPU device name and stop the notebook early when it
# is not the expected '/device:GPU:0'.
device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [0]:
import torch

# Choose the compute device once; later cells move batches to `device`.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Use the GPU and report what is available.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P4


In [0]:
#@title ACL dataset (4 Classes) Initialization

directory = DATA_DIR

# Integer class id for each of the four citation-function labels.
funcs_index = {'Neut': 3, 'Pos': 2, 'CoCo': 1, 'Weak': 0}

# Load the golden test and train splits of the citation-function dataset.
datafiles = DATA_FILES['func']
test = read_json_data(datafiles['golden_test'])
train = read_json_data(datafiles['golden_train'])

# Discard records labelled 'Error', then pool both splits; the pooled data is
# re-split further down in this cell.
dataset_train = [record for record in train if record['label'] != 'Error']
dataset_test = [record for record in test if record['label'] != 'Error']
dataset = dataset_train + dataset_test

# Model input is the first entry of each record's 'context'; the target is
# the class id of the record's label.
texts = [record['context'][0] for record in dataset]
ys = [funcs_index[record['label']] for record in dataset]

# Seed numpy and TensorFlow.
seed = 2
np.random.seed(seed)
tf.compat.v1.set_random_seed(seed)

from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Encode every text as a list of token IDs. With add_special_tokens=True the
# tokenizer adds the '[CLS]' and '[SEP]' markers. No truncation or tensor
# conversion happens here because padding is applied in a later step.
input_ids = [
    tokenizer.encode(sample, add_special_tokens=True)
    for sample in texts
]

# Show the first example before and after encoding.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

print('Max sentence length: ', max(len(ids) for ids in input_ids))

# Keras' `pad_sequences` utility is borrowed to bring every example to one length.
from keras.preprocessing.sequence import pad_sequences

# Fixed length every encoded example is padded or truncated to.
MAX_LEN = 200

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad with 0 and truncate at the END of each sequence ("post").
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    value=0,
    padding="post",
    truncating="post",
)

print('\nDone.')

# Attention masks: 1 for a real token (ID > 0), 0 for padding (ID 0).
attention_masks = [
    [int(token_id > 0) for token_id in sent]
    for sent in input_ids
]

from sklearn.model_selection import train_test_split

# Hold out 20% as the test set. Token IDs, attention masks and labels go
# through ONE call, so all three are indexed by the same shuffle.
x_train, x_test, train_masks, test_masks, y_train, y_test = train_test_split(
    input_ids, attention_masks, ys, random_state=seed, test_size=0.2)

# Split the remaining data again into train (80%) and validation (20%).
x_train, x_val, train_masks, val_masks, y_train, y_val = train_test_split(
    x_train, train_masks, y_train, random_state=seed, test_size=0.2)

# Convert every split to torch tensors, the datatype the model consumes.
x_train, x_test, x_val = (
    torch.tensor(part) for part in (x_train, x_test, x_val))
y_train, y_test, y_val = (
    torch.tensor(part) for part in (y_train, y_test, y_val))
train_masks, test_masks, val_masks = (
    torch.tensor(part) for part in (train_masks, test_masks, val_masks))

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Mini-batch size shared by the training and validation loaders.
batch_size = 32

# Training set: (input ids, attention masks, labels), drawn in random order.
train_data = TensorDataset(x_train, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation set: same tensor layout, served in sequential order.
validation_data = TensorDataset(x_val, val_masks, y_val)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)


In [0]:
#@title Model Structure

from transformers import BertForSequenceClassification, AdamW, BertConfig

# Transfer learning: start from the checkpoint imported from SciCite
# (3 classes), then swap in a classification head sized for this task.

# The architecture is built with num_labels=3 so that it matches the
# checkpoint being loaded.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

transfered_model = PWD + 'Sciicite.pt'
# map_location loads the weights onto whichever device this runtime uses.
model.load_state_dict(torch.load(transfered_model, map_location=device))

# The labels of this task come from `funcs_index`, which has more entries than
# the 3-way head loaded above; a label id outside the head's range makes
# CrossEntropyLoss fail. Replace the head with a freshly initialised linear
# layer with one output per label. Seeding first makes that initialisation
# repeatable.
torch.manual_seed(seed)
num_target_labels = len(funcs_index)
model.classifier = torch.nn.Linear(model.config.hidden_size, num_target_labels)
model.num_labels = num_target_labels
model.config.num_labels = num_target_labels

# Every batch is moved to `device` in the training loop, so the model must
# live there too.
model = model.to(device)

In [0]:
#@title Training + Validation

from transformers import get_linear_schedule_with_warmup
import torch.nn as nn

# AdamW optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Loss applied to the logits in the training loop.
criterion = nn.CrossEntropyLoss()

# Number of passes over the training set.
epochs = 4

# The scheduler is stepped once per batch, so it needs batches * epochs steps.
total_steps = len(train_dataloader) * epochs

# Learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

import numpy as np
import time
import datetime

def flat_accuracy(preds, labels):
    """
    Return the fraction of rows in `preds` whose argmax (over axis 1) equals
    the corresponding entry of the flattened `labels`.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)
    
def format_time(elapsed):
    '''
    Render a duration given in seconds as the string form of a
    datetime.timedelta (h:mm:ss), after rounding to whole seconds.
    '''
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every random number generator in play so runs are repeatable.
seed_val = 40

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, kept for plotting the learning curve.
loss_values = []

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Start time of this epoch and running sum of batch losses.
    t0 = time.time()
    total_loss = 0

    # Switch to training mode; this changes the mode only, it does not train.
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds (input ids, attention masks, labels); move each
        # tensor to the compute device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Gradients accumulate across backward passes, so clear them first.
        model.zero_grad()

        # No `labels` are passed to the model, so the first element of its
        # output is the logits; the loss is computed explicitly below.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
        loss = criterion(outputs[0], b_labels)

        # `.item()` extracts the Python number from the one-element tensor.
        total_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Average loss over all training batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure accuracy on the validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers behave differently than in training.
    model.eval()

    # eval_accuracy accumulates the number of correct predictions;
    # nb_eval_examples counts the examples seen.
    eval_accuracy = 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_dataloader:

        # Move the batch to the compute device and unpack it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # The logits are the first element of the model output.
        logits = outputs[0]

        # Move logits and labels to CPU as numpy arrays.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch accuracy by its size. Averaging per-batch
        # accuracies directly over-weights a final batch that is smaller
        # than batch_size.
        batch_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

        # Track the number of batches.
        nb_eval_steps += 1

    # Accuracy over all validation examples.
    print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")



In [0]:
#@title Test
# Create the DataLoader for our test set (served in sequential order).
test_data = TensorDataset(x_test, test_masks, y_test)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


# ========================================
#               Testing
# ========================================
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(x_test)))

# Put model in evaluation mode
model.eval()

# Per-batch logits and labels, collected as numpy arrays.
predictions , true_labels = [], []

for batch in test_dataloader:
    # Move the batch to the compute device and unpack it.
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

print('DONE.')

import pandas as pd

import sklearn.metrics as metrics

# One predicted class per example: argmax over the logits of every batch.
y_pred = np.argmax(np.concatenate(predictions, axis=0), axis=1).tolist()

# Use the labels collected from the dataloader instead of rebinding `y_test`
# to a list. `y_test` stays a tensor, so this cell can be re-run without
# breaking the TensorDataset construction above.
y_true = np.concatenate(true_labels, axis=0).tolist()

print(metrics.confusion_matrix(y_true, y_pred))
print(metrics.classification_report(y_true, y_pred, digits=4))

In [0]:
# Save the model's state dict to 'Model.pt', a path relative to the current
# working directory.
# NOTE(review): checkpoints elsewhere in this notebook are read from PWD on
# Drive; confirm that saving outside of it is intended.
torch.save(model.state_dict(), 'Model.pt')

###################################### END ##################################

In [0]:
# Checkpoint to restore.
path = '/content/drive/My Drive/KY, FYP/Code/bert_models/bert_663.pt'

# Build a 3-label BERT sequence classifier, then load the checkpoint's
# weights into it.
model2 = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)

model2.load_state_dict(torch.load(path))

In [0]:
# Replace model2's classification head with a newly constructed linear layer
# taking 768 input features and producing 6 outputs.
model2.classifier = torch.nn.Linear(768,6)

In [0]:
# `model3` is never defined in this notebook, so displaying it raised a
# NameError on a fresh kernel and stopped "Run All" before the SciCite
# section. The display is disabled; `model2` is shown in the next cell.
# model3


In [0]:
# Display model2 as the cell's output.
model2

In [0]:
#@title SciCite

# Integer class id for each of the three SciCite labels.
funcs_index = {'background': 0, 'method': 1, 'result': 2}

# Load the SciCite test and train splits (JSONL files).
datafiles = DATA_FILES['scicite']
test = read_jsonl_data(datafiles['test'])
train = read_jsonl_data(datafiles['train'])

# Discard records labelled 'Error', then pool both splits; the pooled data is
# re-split further down in this cell.
dataset_train = [record for record in train if record['label'] != 'Error']
dataset_test = [record for record in test if record['label'] != 'Error']
dataset = dataset_train + dataset_test

# Model input is each record's 'string'; the target is the class id of its label.
texts = [record['string'] for record in dataset]
ys = [funcs_index[record['label']] for record in dataset]

# Seed numpy and TensorFlow.
seed = 3
np.random.seed(seed)
tf.compat.v1.set_random_seed(seed)


from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Encode every text as a list of token IDs. With add_special_tokens=True the
# tokenizer adds the '[CLS]' and '[SEP]' markers. No truncation or tensor
# conversion happens here because padding is applied in a later step.
input_ids = [
    tokenizer.encode(sample, add_special_tokens=True)
    for sample in texts
]

# Show the first example before and after encoding.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

# Drop every instance with more than 511 token IDs, keeping texts, input_ids
# and ys aligned. The kept indices are collected first and the lists rebuilt
# afterwards: popping from a list while enumerating it skips the element that
# follows each removal, so back-to-back over-long instances were not removed.
kept_indices = []
for i, sample in enumerate(input_ids):
    if len(sample) > 511:
        print("removed Index: " +  str(i))
    else:
        kept_indices.append(i)

texts = [texts[i] for i in kept_indices]
input_ids = [input_ids[i] for i in kept_indices]
ys = [ys[i] for i in kept_indices]

print('Max sentence length: ', max([len(sen) for sen in input_ids]))

# Keras' `pad_sequences` utility is borrowed to bring every example to one length.
from keras.preprocessing.sequence import pad_sequences

# Fixed length every encoded example is padded or truncated to.
MAX_LEN = 500

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad with 0 and truncate at the END of each sequence ("post").
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    value=0,
    padding="post",
    truncating="post",
)

print('\nDone.')

# Attention masks: 1 for a real token (ID > 0), 0 for padding (ID 0).
attention_masks = [
    [int(token_id > 0) for token_id in sent]
    for sent in input_ids
]

from sklearn.model_selection import train_test_split

# Hold out 20% as the test set. Token IDs, attention masks and labels go
# through ONE call, so all three are indexed by the same shuffle.
x_train, x_test, train_masks, test_masks, y_train, y_test = train_test_split(
    input_ids, attention_masks, ys, random_state=seed, test_size=0.2)

# Split the remaining data again into train (80%) and validation (20%).
x_train, x_val, train_masks, val_masks, y_train, y_val = train_test_split(
    x_train, train_masks, y_train, random_state=seed, test_size=0.2)

# Convert every split to torch tensors, the datatype the model consumes.
x_train, x_test, x_val = (
    torch.tensor(part) for part in (x_train, x_test, x_val))
y_train, y_test, y_val = (
    torch.tensor(part) for part in (y_train, y_test, y_val))
train_masks, test_masks, val_masks = (
    torch.tensor(part) for part in (train_masks, test_masks, val_masks))


from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Mini-batch size shared by the training and validation loaders.
batch_size = 16

# Training set: (input ids, attention masks, labels), drawn in random order.
train_data = TensorDataset(x_train, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation set: same tensor layout, served in sequential order.
validation_data = TensorDataset(x_val, val_masks, y_val)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)

from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

# Pretrained BERT with a classification head of 3 outputs, one per entry of
# this cell's `funcs_index`.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the GPU.
model.cuda()

# AdamW optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Number of training epochs.
# NOTE(review): with epochs = 0 the training loop below never runs and
# total_steps is 0 — presumably set to skip training; confirm before relying
# on this cell to train.
epochs = 0

# The scheduler is stepped once per batch, so it needs batches * epochs steps.
total_steps = len(train_dataloader) * epochs

# Learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

import numpy as np
import time
import datetime

def flat_accuracy(preds, labels):
    """
    Return the fraction of rows in `preds` whose argmax (over axis 1) equals
    the corresponding entry of the flattened `labels`.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)
    
def format_time(elapsed):
    '''
    Render a duration given in seconds as the string form of a
    datetime.timedelta (h:mm:ss), after rounding to whole seconds.
    '''
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed Python, NumPy and PyTorch (CPU and every CUDA device) so that what
# follows is repeatable.
# NOTE(review): `model` was already created above, so anything initialised
# randomly while loading it is not covered by these seeds.
seed_val = 40

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of every epoch, kept so it can be plotted later.
loss_values = []

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # NOTE(review): this resets TensorFlow's default graph; the model trained
    # here is a PyTorch module. Kept as-is because `tf` state may be shared with
    # other cells -- confirm whether it is still needed.
    tf.reset_default_graph()

    # Wall-clock start of this epoch and running sum of the batch losses.
    t0 = time.time()
    total_loss = 0

    # Switch to training mode (this only changes the mode, e.g. for dropout;
    # it does not perform any training by itself).
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds three tensors: [0] input ids, [1] attention masks,
        # [2] labels. Copy each one to `device`.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Gradients accumulate across backward passes unless cleared.
        model.zero_grad()

        # Because `labels` is supplied, the first element of the returned
        # tuple is the loss.
        outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)
        loss = outputs[0]

        # `.item()` extracts the Python number from the single-value tensor.
        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the parameter update, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode (dropout behaves differently from training mode).
    model.eval()

    # Correct predictions, examples and batches seen in this validation pass.
    nb_eval_correct, nb_eval_examples, nb_eval_steps = 0, 0, 0

    for batch in validation_dataloader:

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation.
        with torch.no_grad():
            # Without `labels`, the first element of the output is the logits.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy().flatten()

        # Count hits per example instead of averaging per-batch accuracies, so
        # a smaller final batch is not over-weighted in the reported figure.
        batch_predictions = np.argmax(logits, axis=1).flatten()
        nb_eval_correct += int(np.sum(batch_predictions == label_ids))
        nb_eval_examples += len(label_ids)
        nb_eval_steps += 1

    # Accuracy over every validation example (nan if the loader was empty).
    if nb_eval_examples:
        eval_accuracy = nb_eval_correct / nb_eval_examples
    else:
        eval_accuracy = float('nan')

    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

# Create the DataLoader for our test set. The sequential sampler keeps the
# predictions in the same order as `y_test`.
test_data = TensorDataset(x_test, test_masks, y_test)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(x_test)))

# Put model in evaluation mode
model.eval()

# Per-batch logits and labels, as NumPy arrays.
predictions , true_labels = [], []

for batch in test_dataloader:
  # Copy the batch to `device` and unpack it.
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch

  # No gradients are needed for prediction.
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  # Without `labels`, the first element of the output is the logits.
  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  predictions.append(logits)
  true_labels.append(label_ids)

print('DONE.')

y_test = y_test.tolist()

import pandas as pd

# Predicted class = index of the largest logit in each row, computed in one
# vectorized call over all batches.
y_pred = np.argmax(np.concatenate(predictions, axis=0), axis=1).tolist()

import sklearn.metrics as metrics

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, digits=4))

In [0]:
#@title Method for Training scicite scicite(seed,filepath,text_path)

def scicite(seed,filepath,text_path):
  """
  Fine-tune `bert-base-uncased` on SciCite and report test-set metrics.

  The SciCite train and test files are merged, tokenized and re-split with
  `seed` into roughly 64% train / 16% validation / 20% test. A three-class
  `BertForSequenceClassification` model is trained for two epochs, validation
  accuracy is printed after every epoch, and the confusion matrix and
  classification report for the test portion are appended to `text_path`.
  Finally the model's `state_dict` is saved to `filepath`.

  Relies on names defined elsewhere in the notebook: `DATA_FILES`,
  `read_jsonl_data`, `torch`, `tf` and `device`.

  Args:
    seed: Seeds the Python, NumPy and PyTorch random number generators and is
      used as `random_state` for both data splits.
    filepath: Destination handed to `torch.save` for the model weights.
    text_path: File the evaluation report is appended to.
  """
  import numpy as np
  import time
  import datetime
  import random

  from transformers import BertTokenizer
  from transformers import BertForSequenceClassification, AdamW
  from transformers import get_linear_schedule_with_warmup
  from keras.preprocessing.sequence import pad_sequences
  from sklearn.model_selection import train_test_split
  import sklearn.metrics as metrics
  from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

  def format_time(elapsed):
    '''
    Round `elapsed` seconds and return the `str()` of the matching timedelta.
    '''
    return str(datetime.timedelta(seconds=int(round(elapsed))))

  # Seed everything up front so that model loading (which happens further
  # down) is covered as well as shuffling and dropout.
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  funcs_index = {'background': 0, 'method': 1, 'result': 2}

  # ----------------------------------------
  # Load and merge the dataset
  # ----------------------------------------
  datafiles = DATA_FILES['scicite']
  test = read_jsonl_data(datafiles['test'])
  train = read_jsonl_data(datafiles['train'])

  # Train samples first, then test samples; entries labelled 'Error' are dropped.
  dataset = [d for d in train + test if d['label'] != 'Error']
  texts = [d['string'] for d in dataset]
  ys = [funcs_index[d['label']] for d in dataset]

  # ----------------------------------------
  # Tokenize
  # ----------------------------------------
  print('Loading BERT tokenizer...')
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  # `encode` tokenizes the text, adds the special tokens and maps every token
  # to its ID. Padding is done separately below.
  input_ids = [tokenizer.encode(sample, add_special_tokens = True)
               for sample in texts]

  # Print sentence 0, now as a list of IDs.
  print('Original: ', texts[0])
  print('Token IDs:', input_ids[0])

  # Drop every sample with more than 511 token IDs. The surviving indices are
  # collected first and the lists rebuilt afterwards; removing items from a
  # list while iterating over it would skip the element after each removal.
  max_tokens = 511
  kept = []
  for i, sample in enumerate(input_ids):
    if len(sample) > max_tokens:
      print("removed Index: " +  str(i))
    else:
      kept.append(i)
  texts = [texts[i] for i in kept]
  input_ids = [input_ids[i] for i in kept]
  ys = [ys[i] for i in kept]

  print('Max sentence length: ', max([len(sen) for sen in input_ids]))

  # ----------------------------------------
  # Pad / truncate and build attention masks
  # ----------------------------------------
  # Every sequence is padded (with 0) or truncated at the end to MAX_LEN IDs.
  # Note that sequences of 501-511 IDs pass the filter above and are truncated
  # here.
  MAX_LEN = 500

  print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

  print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                            value=0, truncating="post", padding="post")

  print('\nDone.')

  # Mask is 1 for real tokens (ID > 0) and 0 for padding (ID 0).
  attention_masks = (input_ids > 0).astype('int64')

  # ----------------------------------------
  # Split: 80/20 into train+val / test, then 80/20 into train / val
  # ----------------------------------------
  # Splitting ids, masks and labels in a single call keeps them aligned.
  (x_train, x_test,
   train_masks, test_masks,
   y_train, y_test) = train_test_split(input_ids, attention_masks, ys,
                                       random_state=seed, test_size=0.2)

  (x_train, x_val,
   train_masks, val_masks,
   y_train, y_val) = train_test_split(x_train, train_masks, y_train,
                                      random_state=seed, test_size=0.2)

  # Convert all inputs and labels into torch tensors, the required datatype
  # for our model.
  x_train = torch.tensor(x_train)
  x_test = torch.tensor(x_test)
  x_val = torch.tensor(x_val)

  y_train = torch.tensor(y_train)
  y_test = torch.tensor(y_test)
  y_val = torch.tensor(y_val)

  train_masks = torch.tensor(train_masks)
  test_masks = torch.tensor(test_masks)
  val_masks = torch.tensor(val_masks)

  # ----------------------------------------
  # DataLoaders
  # ----------------------------------------
  batch_size = 16

  # Training batches are drawn in random order.
  train_data = TensorDataset(x_train, train_masks, y_train)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  # Validation and test batches keep their order.
  validation_data = TensorDataset(x_val, val_masks, y_val)
  validation_sampler = SequentialSampler(validation_data)
  validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  test_data = TensorDataset(x_test, test_masks, y_test)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

  # ----------------------------------------
  # Model, optimizer, scheduler
  # ----------------------------------------
  # Pretrained BERT with a single linear classification layer on top.
  model = BertForSequenceClassification.from_pretrained(
      "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
      num_labels = 3, # One output per entry of `funcs_index`.
      output_attentions = False, # Whether the model returns attentions weights.
      output_hidden_states = False, # Whether the model returns all hidden-states.
  )

  # Put the model on the same device the batches are moved to below.
  model.to(device)

  optimizer = AdamW(model.parameters(),
                    lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                    eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                  )

  # Number of training epochs (authors recommend between 2 and 4)
  epochs =  2

  # Total number of training steps is number of batches * number of epochs.
  total_steps = len(train_dataloader) * epochs

  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = 0, # Default value in run_glue.py
                                              num_training_steps = total_steps)

  # ----------------------------------------
  # Fine-tuning (based on the `run_glue.py` example script)
  # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
  # ----------------------------------------
  for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # NOTE(review): this resets TensorFlow's default graph; the model trained
    # here is a PyTorch module. Kept as-is because `tf` state may be shared
    # with other cells -- confirm whether it is still needed.
    tf.reset_default_graph()

    t0 = time.time()
    total_loss = 0

    # Training mode only switches layer behaviour (e.g. dropout); it does not
    # train anything by itself.
    model.train()

    for step, batch in enumerate(train_dataloader):

      # Progress update every 40 batches.
      if step % 40 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

      # `batch` holds three tensors: [0] input ids, [1] attention masks,
      # [2] labels.
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)

      # Gradients accumulate across backward passes unless cleared.
      model.zero_grad()

      # Because `labels` is supplied, the first element of the returned tuple
      # is the loss.
      outputs = model(b_input_ids,
                      token_type_ids=None,
                      attention_mask=b_input_mask,
                      labels=b_labels)
      loss = outputs[0]
      total_loss += loss.item()

      loss.backward()

      # Clip the gradient norm to 1.0 to guard against exploding gradients.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Apply the parameter update, then advance the learning-rate schedule.
      optimizer.step()
      scheduler.step()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Correct predictions and examples seen in this validation pass.
    n_correct, n_seen = 0, 0

    for batch in validation_dataloader:
      b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

      # No gradients are needed for evaluation. Without `labels`, the first
      # element of the output is the logits.
      with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

      logits = outputs[0].detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy().flatten()

      # Count hits per example rather than averaging per-batch accuracies, so
      # a smaller final batch is not over-weighted.
      n_correct += int(np.sum(np.argmax(logits, axis=1).flatten() == label_ids))
      n_seen += len(label_ids)

    val_accuracy = n_correct / n_seen if n_seen else float('nan')
    print("  Accuracy: {0:.2f}".format(val_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

  print("")
  print("Training complete!")

  # ----------------------------------------
  # Prediction on the test set
  # ----------------------------------------
  print('Predicting labels for {:,} test sentences...'.format(len(x_test)))

  model.eval()

  # Per-batch logits and labels, as NumPy arrays.
  predictions , true_labels = [], []

  for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

  print('DONE.')

  # The test loader is sequential, so the concatenated batches line up.
  # Predicted class = index of the largest logit in each row.
  y_true = np.concatenate(true_labels).tolist()
  y_pred = np.argmax(np.concatenate(predictions, axis=0), axis=1).tolist()

  # Append this run's report to the shared results file.
  with open(text_path, "a") as text_file:
    print("Scicite ",file=text_file)
    print(str(seed),file=text_file)
    print(metrics.confusion_matrix(y_true, y_pred),file=text_file)
    print(metrics.classification_report(y_true, y_pred, digits=4),file=text_file)
    print("=======================================================================",file=text_file)

  torch.save(model.state_dict(), filepath)
  

In [0]:
#@title Method for aclarc dataset (6 Classes) aclarc(seed,filepath,text_path) -- 0 Shot



def aclarc(seed,filepath,text_path):

  import numpy as np
  import time
  import datetime

  def flat_accuracy(preds, labels):
      """
      Fraction of rows of `preds` whose highest-scoring column equals the
      matching entry of `labels` (both NumPy arrays; `labels` is flattened).
      """
      gold = labels.flatten()
      hits = np.argmax(preds, axis=1).flatten() == gold
      return np.sum(hits) / len(gold)
      
  def format_time(elapsed):
      '''
      Takes a time in seconds and returns it as a string h:mm:ss
      (the `str()` form of a `datetime.timedelta`).
      '''
      # Round to the nearest second.
      elapsed_rounded = int(round((elapsed)))

      # Format as h:mm:ss. (A stray bare name `fl` used to sit here and raised
      # NameError on every call.)
      return str(datetime.timedelta(seconds=elapsed_rounded))

  import random

  directory = DATA_DIR
  funcs_index = {'CompareOrContrast': 0, 'Background': 1, 'Motivation': 2, 'Uses': 3, 'Future': 4,
                'Extends': 5}

  # Function dataset start
  datafiles = DATA_FILES['acl-arc']
  test = read_jsonl_data(datafiles['test'])
  train = read_jsonl_data((datafiles['train']))


  dataset = list(filter(lambda x: x['intent'] != 'Error', test + train))

  texts = list(map(lambda d: d['text'], dataset))

  ys = list(map(lambda d: funcs_index[d['intent']], dataset))

  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  from transformers import BertTokenizer

  # Load the BERT tokenizer.
  print('Loading BERT tokenizer...')
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  # Tokenize all of the sentences and map the tokens to thier word IDs.
  input_ids = []

  # For every sentence...
  for sample in texts:
      # `encode` will:
      #   (1) Tokenize the sentence.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      encoded_sent = tokenizer.encode(
                          sample,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'

                          # This function also supports truncation and conversion
                          # to pytorch tensors, but we need to do padding, so we
                          # can't use these features :( .
                          #max_length = 128,          # Truncate all sentences.
                          #return_tensors = 'pt',     # Return pytorch tensors.
                    )
      
      # Add the encoded sentence to the list.
      input_ids.append(encoded_sent)

  # Print sentence 0, now as a list of IDs.
  print('Original: ', texts[0])
  print('Token IDs:', input_ids[0])

  for i, sample in enumerate(input_ids):
    if len(sample) > 511:
      texts.pop(i)
      input_ids.pop(i)
      ys.pop(i)
      print("removed Index: " +  str(i))

  print('Max sentence length: ', max([len(sen) for sen in input_ids]))

  # We'll borrow the `pad_sequences` utility function to do this.
  from keras.preprocessing.sequence import pad_sequences

  # Set the maximum sequence length.
  # I've chosen 64 somewhat arbitrarily. It's slightly larger than the
  # maximum training sentence length of 47...
  MAX_LEN = 200

  print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

  print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

  # Pad our input tokens with value 0.
  # "post" indicates that we want to pad and truncate at the end of the sequence,
  # as opposed to the beginning.
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                            value=0, truncating="post", padding="post")

  print('\nDone.')

  # Create attention masks
  attention_masks = []

  # For each sentence...
  for sent in input_ids:
      
      # Create the attention mask.
      #   - If a token ID is 0, then it's padding, set the mask to 0.
      #   - If a token ID is > 0, then it's a real token, set the mask to 1.
      att_mask = [int(token_id > 0) for token_id in sent]
      
      # Store the attention mask for this sentence.
      attention_masks.append(att_mask)

  # Use train_test_split to split our data into train and validation sets for
  # training

  from sklearn.model_selection import train_test_split

  # Split into train(80%) and test(20%) sets
  x_train, x_test , y_train, y_test = train_test_split(input_ids, ys, 
                                                  random_state=seed, test_size=0.2)
  # Do the same for the masks.
  train_masks, test_masks, _, _ = train_test_split(attention_masks, ys,
                                              random_state=seed, test_size=0.2)

  y_train_unique, indices = np.unique(y_train, return_index=True)

  # Proportional Reduction
  # ------------------------------------

  new_x_train = []
  new_y_train = []
  new_train_mask = []
  arr = {}
  for index in range(len(funcs_index)):
      arr[index] = []
      for i, value in enumerate(y_train):
          if (value == index):
              arr[index].append(i)
      # print(index, ":", len(arr[index]))
      # sample_length = len(arr[index]) / 20  # 5% of data
      # sample_length = int(sample_length)
      sample_length = 5
      for j in range(sample_length):
          new_x_train.append(x_train[arr[index][j]])
          new_y_train.append(y_train[arr[index][j]])
          new_train_mask.append(train_masks[arr[index][j]])

  new_x_train = np.asarray(new_x_train)
  new_y_train = np.asarray(new_y_train)
  new_train_mask = np.asarray(new_train_mask)

  indices = np.arange(new_x_train.shape[0])
  np.random.shuffle(indices)

  new_x_train = new_x_train[indices]
  new_y_train = new_y_train[indices]
  new_train_mask = new_train_mask[indices]
  x_train = new_x_train
  y_train = new_y_train
  train_masks = new_train_mask
  #----------------------------------------------

  # Futher split train data into train(80%) and validation(20%) sets
  train_masks, val_masks , _, _ = train_test_split(train_masks, x_train,
                                                    random_state=seed, test_size=0.2)

  x_train, x_val , y_train, y_val = train_test_split(x_train, y_train,
                                                    random_state=seed, test_size=0.2)

  # Convert all inputs and labels into torch tensors, the required datatype 
  # for our model.
  x_train = torch.tensor(x_train)
  x_test = torch.tensor(x_test)
  x_val = torch.tensor(x_val)

  y_train = torch.tensor(y_train)
  y_test = torch.tensor(y_test)
  y_val = torch.tensor(y_val)

  train_masks = torch.tensor(train_masks)
  test_masks = torch.tensor(test_masks)
  val_masks = torch.tensor(val_masks)


  from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

  # The DataLoader needs to know our batch size for training, so we specify it 
  # here.
  # For fine-tuning BERT on a specific task, the authors recommend a batch size of
  # 16 or 32.

  batch_size = 16

  # Create the DataLoader for our training set.
  train_data = TensorDataset(x_train, train_masks, y_train)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  # Create the DataLoader for our validation set.
  validation_data = TensorDataset(x_val, val_masks, y_val)
  validation_sampler = SequentialSampler(validation_data)
  validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  from transformers import BertForSequenceClassification, AdamW, BertConfig

  # Load BertForSequenceClassification, the pretrained BERT model with a single 
  # linear classification layer on top. 
  model = BertForSequenceClassification.from_pretrained(
      "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
      num_labels = 6, # The number of output labels--2 for binary classification.
                      # You can increase this for multi-class tasks.   
      output_attentions = False, # Whether the model returns attentions weights.
      output_hidden_states = False, # Whether the model returns all hidden-states.
  )

  model.classifier = torch.nn.Linear(768,3)
  path = '/content/drive/My Drive/KY, FYP/Code/bert_models/bert_663.pt'

  # model.load_state_dict(torch.load(filepath))

  model.classifier = torch.nn.Linear(768,6)

  # # Tell pytorch to run this model on the GPU.

  # model = BertForSequenceClassification.from_pretrained(
  #     "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
  #     num_labels = 3, # The number of output labels--2 for binary classification.
  #                     # You can increase this for multi-class tasks.   
  #     output_attentions = False, # Whether the model returns attentions weights.
  #     output_hidden_states = False, # Whether the model returns all hidden-states.
  # )
  # path = '/content/drive/My Drive/KY, FYP/Code/bert_models/bert_663.pt'

  # # model.load_state_dict(torch.load(path))

  # model.classifier = torch.nn.Linear(768,6)

  model.cuda()

  optimizer = AdamW(model.parameters(),
                    lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                    eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                  )

  from transformers import get_linear_schedule_with_warmup

  # Number of training epochs (authors recommend between 2 and 4)
  epochs =  0

  # Total number of training steps is number of batches * number of epochs.
  total_steps = len(train_dataloader) * epochs

  # Create the learning rate scheduler.
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps = 0, # Default value in run_glue.py
                                              num_training_steps = total_steps)



  # This training code is based on the `run_glue.py` script here:
  # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128




  # Store the average loss after each epoch so we can plot them.
  loss_values = []

  # For each epoch...
  for epoch_i in range(0, epochs):
      
      # ========================================
      #               Training
      # ========================================
      
      # Perform one full pass over the training set.

      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
      print('Training...')
      tf.reset_default_graph()

      # Measure how long the training epoch takes.
      t0 = time.time()

      # Reset the total loss for this epoch.
      total_loss = 0

      # Put the model into training mode. Don't be mislead--the call to 
      # `train` just changes the *mode*, it doesn't *perform* the training.
      # `dropout` and `batchnorm` layers behave differently during training
      # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
      model.train()

      # For each batch of training data...
      for step, batch in enumerate(train_dataloader):

          # Progress update every 40 batches.
          if step % 40 == 0 and not step == 0:
              # Calculate elapsed time in minutes.
              elapsed = format_time(time.time() - t0)
              
              # Report progress.
              print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

          # Unpack this training batch from our dataloader. 
          #
          # As we unpack the batch, we'll also copy each tensor to the GPU using the 
          # `to` method.
          #
          # `batch` contains three pytorch tensors:
          #   [0]: input ids 
          #   [1]: attention masks
          #   [2]: labels 
          b_input_ids = batch[0].to(device)
          b_input_mask = batch[1].to(device)
          b_labels = batch[2].to(device)

          # Always clear any previously calculated gradients before performing a
          # backward pass. PyTorch doesn't do this automatically because 
          # accumulating the gradients is "convenient while training RNNs". 
          # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
          model.zero_grad()        

          # Perform a forward pass (evaluate the model on this training batch).
          # This will return the loss (rather than the model output) because we
          # have provided the `labels`.
          # The documentation for this `model` function is here: 
          # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
          outputs = model(b_input_ids, 
                      token_type_ids=None, 
                      attention_mask=b_input_mask, 
                      labels=b_labels)
          
          # The call to `model` always returns a tuple, so we need to pull the 
          # loss value out of the tuple.
          loss = outputs[0]

          # Accumulate the training loss over all of the batches so that we can
          # calculate the average loss at the end. `loss` is a Tensor containing a
          # single value; the `.item()` function just returns the Python value 
          # from the tensor.
          total_loss += loss.item()

          # Perform a backward pass to calculate the gradients.
          loss.backward()

          # Clip the norm of the gradients to 1.0.
          # This is to help prevent the "exploding gradients" problem.
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

          # Update parameters and take a step using the computed gradient.
          # The optimizer dictates the "update rule"--how the parameters are
          # modified based on their gradients, the learning rate, etc.
          optimizer.step()

          # Update the learning rate.
          scheduler.step()

      # Calculate the average loss over the training data.
      avg_train_loss = total_loss / len(train_dataloader)            
      
      # Store the loss value for plotting the learning curve.
      loss_values.append(avg_train_loss)

      print("")
      print("  Average training loss: {0:.2f}".format(avg_train_loss))
      print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
          
      # ========================================
      #               Validation
      # ========================================
      # After the completion of each training epoch, measure our performance on
      # our validation set.

      print("")
      print("Running Validation...")

      t0 = time.time()

      # Put the model in evaluation mode--the dropout layers behave differently
      # during evaluation.
      model.eval()

      # Tracking variables 
      eval_loss, eval_accuracy = 0, 0
      nb_eval_steps, nb_eval_examples = 0, 0

      # Evaluate data for one epoch
      for batch in validation_dataloader:
          
          # Add batch to GPU
          batch = tuple(t.to(device) for t in batch)
          
          # Unpack the inputs from our dataloader
          b_input_ids, b_input_mask, b_labels = batch
          
          # Telling the model not to compute or store gradients, saving memory and
          # speeding up validation
          with torch.no_grad():        

              # Forward pass, calculate logit predictions.
              # This will return the logits rather than the loss because we have
              # not provided labels.
              # token_type_ids is the same as the "segment ids", which 
              # differentiates sentence 1 and 2 in 2-sentence tasks.
              # The documentation for this `model` function is here: 
              # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
              outputs = model(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask)
          
          # Get the "logits" output by the model. The "logits" are the output
          # values prior to applying an activation function like the softmax.
          logits = outputs[0]

          # Move logits and labels to CPU
          logits = logits.detach().cpu().numpy()
          label_ids = b_labels.to('cpu').numpy()
          
          # Calculate the accuracy for this batch of test sentences.
          tmp_eval_accuracy = flat_accuracy(logits, label_ids)
          
          # Accumulate the total accuracy.
          eval_accuracy += tmp_eval_accuracy

          # Track the number of batches
          nb_eval_steps += 1

      # Report the final accuracy for this validation run.
      print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
      print("  Validation took: {:}".format(format_time(time.time() - t0)))

  print("")
  print("Training complete!")

  # Create the DataLoader for our validation set.
  test_data = TensorDataset(x_test, test_masks, y_test)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

  # Prediction on test set

  print('Predicting labels for {:,} test sentences...'.format(len(x_test)))

  # Put model in evaluation mode
  model.eval()

  # Tracking variables 
  predictions , true_labels = [], []

  # Predict 
  for batch in test_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    
    # Telling the model not to compute or store gradients, saving memory and 
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, 
                        attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

  print('DONE.')

  y_test = y_test.tolist()

  y_pred = []
  for i in predictions:
    y_pred += i.tolist()

  import pandas as pd

  y_pred = list(map(lambda x: pd.Series(x).idxmax(), y_pred))

  import sklearn.metrics as metrics

  path = text_path

  with open(path, "a") as text_file:
    print("Aclarc test zero shot ",file=text_file)
    print(str(seed),file=text_file)
    print(metrics.confusion_matrix(y_test, y_pred),file=text_file)
    print(metrics.classification_report(y_test, y_pred, digits=4),file=text_file)
    print("=======================================================================",file=text_file)


In [0]:
#@title Method for aclarc dataset (6 Classes) aclarcf(seed,filepath,text_path) -- 5 Shot


def aclarcf(seed,filepath,text_path):
  """
  Fine-tune BERT on a 5-shot sample of ACL-ARC and append test metrics to a file.

  Pipeline:
    1. Load the ACL-ARC `train` and `test` splits, pool them and drop samples
       whose intent is 'Error'.
    2. Tokenize with the `bert-base-uncased` tokenizer, drop samples longer
       than 511 tokens, pad/truncate the rest to 200 tokens.
    3. Split 80/20 into train/test, keep the first 5 training samples of each
       of the 6 classes, then split those 80/20 into train/validation.
    4. Train `BertForSequenceClassification` for 15 epochs and evaluate on the
       held-out test split.

  Args:
    seed: Integer used to seed `random`, NumPy and torch, and as the
      `random_state` of both data splits.
    filepath: Path of a saved model state dict. Currently unused: the
      `load_state_dict` call is disabled, so training starts from the
      stock pretrained encoder.
    text_path: File that the confusion matrix and classification report are
      appended to.

  Returns:
    None. Progress is printed; results are appended to `text_path`.

  Relies on notebook-level names: `DATA_FILES`, `read_jsonl_data`, `device`
  and `tf`.
  """
  import datetime
  import random
  import time

  import numpy as np
  import sklearn.metrics as metrics
  import torch
  from keras.preprocessing.sequence import pad_sequences
  from sklearn.model_selection import train_test_split
  from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
  from transformers import (AdamW, BertForSequenceClassification, BertTokenizer,
                            get_linear_schedule_with_warmup)

  def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose argmax equals `labels`."""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

  def format_time(elapsed):
    """Take a time in seconds and return a string h:mm:ss."""
    # Round to the nearest second before formatting.
    return str(datetime.timedelta(seconds=int(round(elapsed))))

  # Experiment settings.
  shots_per_class = 5     # few-shot budget: training samples kept per label
  max_token_count = 511   # samples that encode to more tokens are dropped
  MAX_LEN = 200           # every sequence is padded/truncated to this length
  batch_size = 16         # BERT authors recommend 16 or 32 for fine-tuning
  epochs = 15

  funcs_index = {'CompareOrContrast': 0, 'Background': 1, 'Motivation': 2, 'Uses': 3, 'Future': 4,
                'Extends': 5}

  # ----------------------------------------------------------------------
  # Load data. The published test and train splits are pooled here and
  # re-split below; the dev split is not read.
  # ----------------------------------------------------------------------
  datafiles = DATA_FILES['acl-arc']
  test = read_jsonl_data(datafiles['test'])
  train = read_jsonl_data(datafiles['train'])

  dataset = [d for d in test + train if d['intent'] != 'Error']
  texts = [d['text'] for d in dataset]
  ys = [funcs_index[d['intent']] for d in dataset]

  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  # ----------------------------------------------------------------------
  # Tokenize.
  # ----------------------------------------------------------------------
  print('Loading BERT tokenizer...')
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  # `encode` tokenizes the sentence, adds the `[CLS]`/`[SEP]` special tokens
  # and maps the tokens to their vocabulary IDs. Truncation and padding are
  # done separately below.
  input_ids = [tokenizer.encode(sample, add_special_tokens=True) for sample in texts]

  # Print sentence 0, now as a list of IDs.
  print('Original: ', texts[0])
  print('Token IDs:', input_ids[0])

  # Drop over-long samples. The surviving positions are collected first and
  # the three parallel lists are rebuilt afterwards: popping from a list while
  # enumerating it skips the element that follows every removal.
  kept = []
  for i, sample in enumerate(input_ids):
    if len(sample) > max_token_count:
      print("removed Index: " +  str(i))
    else:
      kept.append(i)
  texts = [texts[i] for i in kept]
  input_ids = [input_ids[i] for i in kept]
  ys = [ys[i] for i in kept]

  print('Max sentence length: ', max([len(sen) for sen in input_ids]))

  print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

  print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

  # Pad with 0 and truncate at the end ("post") of each sequence.
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                            value=0, truncating="post", padding="post")

  print('\nDone.')

  # Attention mask: 1 for real tokens (ID > 0), 0 for padding (ID == 0).
  attention_masks = (input_ids > 0).astype(np.int64)

  # ----------------------------------------------------------------------
  # Split into train (80%) and test (20%). Splitting the three arrays in one
  # call keeps inputs, masks and labels aligned by construction.
  # ----------------------------------------------------------------------
  x_train, x_test, train_masks, test_masks, y_train, y_test = train_test_split(
      input_ids, attention_masks, ys, random_state=seed, test_size=0.2)

  # ----------------------------------------------------------------------
  # Few-shot reduction: keep the first `shots_per_class` training samples of
  # every class. Slicing tolerates classes with fewer samples than requested.
  # ----------------------------------------------------------------------
  y_train = np.asarray(y_train)
  picked = np.concatenate([
      np.flatnonzero(y_train == label)[:shots_per_class]
      for label in range(len(funcs_index))
  ])

  # Shuffle so the classes are no longer grouped together.
  order = np.arange(picked.shape[0])
  np.random.shuffle(order)
  picked = picked[order]

  x_train = x_train[picked]
  y_train = y_train[picked]
  train_masks = train_masks[picked]

  # Further split the few-shot data into train (80%) and validation (20%).
  x_train, x_val, train_masks, val_masks, y_train, y_val = train_test_split(
      x_train, train_masks, y_train, random_state=seed, test_size=0.2)

  # Convert all inputs, labels and masks into torch tensors.
  x_train = torch.tensor(x_train)
  x_test = torch.tensor(x_test)
  x_val = torch.tensor(x_val)

  y_train = torch.tensor(y_train)
  y_test = torch.tensor(y_test)
  y_val = torch.tensor(y_val)

  train_masks = torch.tensor(train_masks)
  test_masks = torch.tensor(test_masks)
  val_masks = torch.tensor(val_masks)

  # Training batches are drawn in random order; validation and test batches
  # are read sequentially.
  train_data = TensorDataset(x_train, train_masks, y_train)
  train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data),
                                batch_size=batch_size)

  validation_data = TensorDataset(x_val, val_masks, y_val)
  validation_dataloader = DataLoader(validation_data,
                                     sampler=SequentialSampler(validation_data),
                                     batch_size=batch_size)

  # ----------------------------------------------------------------------
  # Model: pretrained BERT with a single linear classification layer on top.
  # ----------------------------------------------------------------------
  model = BertForSequenceClassification.from_pretrained(
      "bert-base-uncased",            # 12-layer BERT model, uncased vocab.
      num_labels = len(funcs_index),  # One output per ACL-ARC class.
      output_attentions = False,      # Do not return attention weights.
      output_hidden_states = False,   # Do not return all hidden states.
  )

  # NOTE(review): the 3-way head presumably matches the checkpoint behind
  # `filepath`, whose loading is disabled
  # (`model.load_state_dict(torch.load(filepath))`). The head is still built
  # because its initialization draws from the torch RNG; dropping it would
  # change the seeded initialization of the final head below.
  model.classifier = torch.nn.Linear(model.config.hidden_size, 3)

  # Fresh classification head for the ACL-ARC classes.
  model.classifier = torch.nn.Linear(model.config.hidden_size, len(funcs_index))

  # Run on the same device the batches are sent to.
  model.to(device)

  optimizer = AdamW(model.parameters(),
                    lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                    eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                  )

  # Total number of training steps is number of batches * number of epochs.
  total_steps = len(train_dataloader) * epochs

  # Linear learning-rate decay without warmup.
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = 0, # Default value in run_glue.py
                                              num_training_steps = total_steps)

  # This training code is based on the `run_glue.py` script here:
  # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
  for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # NOTE(review): TensorFlow call inside a PyTorch loop; kept because it
    # resets global TF state that other cells may depend on. Confirm
    # whether it is still needed.
    tf.reset_default_graph()

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Summed batch loss for this epoch.
    total_loss = 0

    # Switch to training mode (enables dropout); this does not train by itself.
    model.train()

    for step, batch in enumerate(train_dataloader):

      # Progress update every 40 batches.
      if step % 40 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

      # `batch` holds [0] input ids, [1] attention masks, [2] labels.
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)

      # Gradients accumulate across backward passes, so clear them first.
      model.zero_grad()

      # With `labels` supplied, the first element of the output is the loss.
      outputs = model(b_input_ids,
                      token_type_ids=None,
                      attention_mask=b_input_mask,
                      labels=b_labels)
      loss = outputs[0]

      total_loss += loss.item()

      loss.backward()

      # Clip the gradient norm to 1.0 to guard against exploding gradients.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      optimizer.step()

      # Update the learning rate.
      scheduler.step()

    # Average loss over the training batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode disables dropout.
    model.eval()

    eval_accuracy = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:

      b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

      # No gradients are needed for evaluation.
      with torch.no_grad():
        # Without `labels`, the first element of the output is the logits.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

      logits = outputs[0].detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()

      # The reported accuracy is the mean of the per-batch accuracies.
      eval_accuracy += flat_accuracy(logits, label_ids)
      nb_eval_steps += 1

    # `max(..., 1)` avoids a division by zero if the validation set is empty.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/max(nb_eval_steps, 1)))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

  print("")
  print("Training complete!")

  # ----------------------------------------------------------------------
  # Prediction on the test set.
  # ----------------------------------------------------------------------
  test_data = TensorDataset(x_test, test_masks, y_test)
  test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data),
                               batch_size=batch_size)

  print('Predicting labels for {:,} test sentences...'.format(len(x_test)))

  model.eval()

  # One logits array per batch, in test-set order.
  predictions = []

  for batch in test_dataloader:
    b_input_ids, b_input_mask, _ = tuple(t.to(device) for t in batch)

    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

    predictions.append(outputs[0].detach().cpu().numpy())

  print('DONE.')

  y_test = y_test.tolist()

  # Predicted class = index of the largest logit in each row.
  y_pred = np.argmax(np.concatenate(predictions, axis=0), axis=1).tolist()

  # Append (never overwrite) so results of several runs collect in one file.
  with open(text_path, "a") as text_file:
    print("Aclarc test Few shot ",file=text_file)
    print(str(seed),file=text_file)
    print(metrics.confusion_matrix(y_test, y_pred),file=text_file)
    print(metrics.classification_report(y_test, y_pred, digits=4),file=text_file)
    print("=======================================================================",file=text_file)


In [0]:
#@title Method for aclarc dataset (6 Classes) aclarcfull(seed,filepath,text_path) -- full Shot


def aclarcfull(seed,filepath,text_path):
  """
  Fine-tune `bert-base-uncased` on the ACL-ARC citation-intent data (6 classes)
  using the whole training portion, then append test metrics to a text report.

  Steps: read the ACL-ARC train and test files, drop records whose intent is
  'Error', tokenize, drop over-long samples, pad/truncate to MAX_LEN, split
  80/20 into train/test and the train part 80/20 again into train/validation,
  fine-tune for a fixed number of epochs, and evaluate on the test part.

  Args:
    seed: Integer used to seed `random`, NumPy and PyTorch, and passed as
      `random_state` to both data splits.
    filepath: Path of a saved checkpoint. Currently unused because the
      checkpoint load below is commented out; kept so the call signature
      matches the sibling experiment functions invoked by the driver cell.
    text_path: File to which the confusion matrix and classification report
      are appended.

  Returns:
    None. Results are written to `text_path` and progress is printed.

  Notes:
    Depends on names defined elsewhere in the notebook: `torch`, `tf`,
    `device`, `DATA_FILES` and `read_jsonl_data`. `model.cuda()` is called
    unconditionally, so a CUDA device is required.
  """
  import datetime
  import random
  import time

  import numpy as np
  import sklearn.metrics as metrics
  from keras.preprocessing.sequence import pad_sequences
  from sklearn.model_selection import train_test_split
  from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
  from transformers import AdamW, BertForSequenceClassification, BertTokenizer
  from transformers import get_linear_schedule_with_warmup

  # Samples whose encoding has more token IDs than this are dropped.
  # Presumably chosen to stay below BERT's 512-position limit - TODO confirm.
  MAX_TOKENS = 511
  # Every remaining sequence is padded or truncated to this many token IDs.
  MAX_LEN = 200
  batch_size = 16
  epochs = 2

  def format_time(elapsed):
    """Render a duration given in seconds as an h:mm:ss string."""
    return str(datetime.timedelta(seconds=int(round(elapsed))))

  funcs_index = {'CompareOrContrast': 0, 'Background': 1, 'Motivation': 2, 'Uses': 3, 'Future': 4,
                'Extends': 5}

  # ----------------------------------------------------------------------
  # Load and label the data
  # ----------------------------------------------------------------------
  datafiles = DATA_FILES['acl-arc']
  test = read_jsonl_data(datafiles['test'])
  train = read_jsonl_data(datafiles['train'])

  # The two files are pooled and re-split below.
  dataset = [d for d in test + train if d['intent'] != 'Error']
  texts = [d['text'] for d in dataset]
  ys = [funcs_index[d['intent']] for d in dataset]

  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  # ----------------------------------------------------------------------
  # Tokenize
  # ----------------------------------------------------------------------
  print('Loading BERT tokenizer...')
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  # `encode` tokenizes, adds [CLS]/[SEP] and maps the tokens to their IDs.
  input_ids = [tokenizer.encode(sample, add_special_tokens = True) for sample in texts]

  print('Original: ', texts[0])
  print('Token IDs:', input_ids[0])

  # Drop over-long samples. The positions to keep are collected first and the
  # three parallel lists are rebuilt afterwards; popping from the lists while
  # enumerating them would skip the element that follows every removed one.
  kept = []
  for i, sample in enumerate(input_ids):
    if len(sample) > MAX_TOKENS:
      print("removed Index: " +  str(i))
    else:
      kept.append(i)
  texts = [texts[i] for i in kept]
  input_ids = [input_ids[i] for i in kept]
  ys = [ys[i] for i in kept]

  print('Max sentence length: ', max([len(sen) for sen in input_ids]))

  # ----------------------------------------------------------------------
  # Pad / truncate and build attention masks
  # ----------------------------------------------------------------------
  print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

  print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

  # "post" pads and truncates at the end of each sequence; the pad value is 0.
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                            value=0, truncating="post", padding="post")

  print('\nDone.')

  # Mask is 1 for real tokens (ID > 0) and 0 for padding (ID 0).
  attention_masks = (input_ids > 0).astype(np.int64)

  # ----------------------------------------------------------------------
  # Split: 80% train / 20% test, then train into 80% train / 20% validation.
  # Inputs, masks and labels go through one call per split so the three
  # always receive the same row selection.
  # ----------------------------------------------------------------------
  (x_train, x_test,
   train_masks, test_masks,
   y_train, y_test) = train_test_split(input_ids, attention_masks, ys,
                                       random_state=seed, test_size=0.2)

  (x_train, x_val,
   train_masks, val_masks,
   y_train, y_val) = train_test_split(x_train, train_masks, y_train,
                                      random_state=seed, test_size=0.2)

  # Convert everything to torch tensors, the datatype the model consumes.
  x_train = torch.tensor(x_train)
  x_test = torch.tensor(x_test)
  x_val = torch.tensor(x_val)

  y_train = torch.tensor(y_train)
  y_test = torch.tensor(y_test)
  y_val = torch.tensor(y_val)

  train_masks = torch.tensor(train_masks)
  test_masks = torch.tensor(test_masks)
  val_masks = torch.tensor(val_masks)

  # Training batches are drawn in random order, validation batches in order.
  train_data = TensorDataset(x_train, train_masks, y_train)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  validation_data = TensorDataset(x_val, val_masks, y_val)
  validation_sampler = SequentialSampler(validation_data)
  validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  # ----------------------------------------------------------------------
  # Model, optimizer, scheduler
  # ----------------------------------------------------------------------
  model = BertForSequenceClassification.from_pretrained(
      "bert-base-uncased", # 12-layer BERT model with an uncased vocab.
      num_labels = 6,
      output_attentions = False,
      output_hidden_states = False,
  )

  # The 3-way head has the shape the (disabled) checkpoint load would need;
  # with loading commented out the model keeps its stock pretrained weights
  # and the head is replaced by a fresh 6-way layer right away.
  # NOTE(review): the temporary 3-way head is left in place because building
  # it presumably draws from the torch RNG, so removing it would likely change
  # seeded results - confirm before deleting.
  model.classifier = torch.nn.Linear(768,3)

  # model.load_state_dict(torch.load(filepath))

  model.classifier = torch.nn.Linear(768,6)

  model.cuda()

  optimizer = AdamW(model.parameters(),
                    lr = 2e-5,
                    eps = 1e-8
                  )

  # One scheduler step is taken per training batch.
  total_steps = len(train_dataloader) * epochs

  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps = 0,
                                              num_training_steps = total_steps)

  # ----------------------------------------------------------------------
  # Fine-tuning loop
  # ----------------------------------------------------------------------
  for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # NOTE(review): `tf` comes from the notebook globals and nothing in this
    # PyTorch loop appears to use a TensorFlow graph - confirm whether this
    # reset is still needed.
    tf.reset_default_graph()

    t0 = time.time()
    total_loss = 0

    # Switch dropout etc. to training behaviour.
    model.train()

    for step, batch in enumerate(train_dataloader):

      # Progress update every 40 batches.
      if step % 40 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

      # `batch` holds input ids, attention masks and labels, in that order.
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)

      # Gradients accumulate by default, so clear them before each backward pass.
      model.zero_grad()

      # With `labels` supplied, the first element of the output is the loss.
      outputs = model(b_input_ids, 
                      token_type_ids=None, 
                      attention_mask=b_input_mask, 
                      labels=b_labels)
      loss = outputs[0]

      total_loss += loss.item()

      loss.backward()

      # Clip the gradient norm to 1.0 to guard against exploding gradients.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      optimizer.step()
      scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    # Count correct predictions over all samples so that a short final batch
    # is not over-weighted, as it would be when averaging per-batch accuracy.
    n_correct, n_seen = 0, 0

    for batch in validation_dataloader:
      b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

      # No labels are passed, so the first output element holds the logits.
      with torch.no_grad():
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

      logits = outputs[0].detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()

      n_correct += int(np.sum(np.argmax(logits, axis=1).flatten() == label_ids.flatten()))
      n_seen += len(label_ids.flatten())

    print("  Accuracy: {0:.2f}".format(n_correct / n_seen))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

  print("")
  print("Training complete!")

  # ----------------------------------------------------------------------
  # Test-set prediction
  # ----------------------------------------------------------------------
  test_data = TensorDataset(x_test, test_masks, y_test)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

  print('Predicting labels for {:,} test sentences...'.format(len(x_test)))

  model.eval()

  predictions = []

  for batch in test_dataloader:
    # The sampler is sequential, so predictions line up with `y_test`.
    b_input_ids, b_input_mask, _ = tuple(t.to(device) for t in batch)

    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

    predictions.append(outputs[0].detach().cpu().numpy())

  print('DONE.')

  y_test = y_test.tolist()

  # Predicted class = index of the largest logit in each row.
  y_pred = np.argmax(np.concatenate(predictions, axis=0), axis=1).tolist()

  # Append, so the results of several runs collect in one file.
  with open(text_path, "a") as text_file:
    print("Aclarc test full ",file=text_file)
    print(str(seed),file=text_file)
    print(metrics.confusion_matrix(y_test, y_pred),file=text_file)
    print(metrics.classification_report(y_test, y_pred, digits=4),file=text_file)
    print("=======================================================================",file=text_file)


In [0]:

# Seed sweep: every seed gets its own checkpoint path and its own results
# file, and the three experiment functions are run in turn with that seed.
seeds = [663,883,544,201,356,648,898,88,997,788]
# seeds = [1,2]

for seed in seeds:
  # Both file names embed the seed.
  path = '/content/drive/My Drive/KY, FYP/Code/bert_models/bert_{}.pt'.format(str(seed))
  text_path = '/content/drive/My Drive/KY, FYP/Code/results/output_{}.txt'.format(str(seed))

  aclarc(seed,path,text_path)
  aclarcf(seed,path,text_path)
  aclarcfull(seed,path,text_path)


In [0]:
# One-off run of `aclarcf` with fixed checkpoint and report paths.
path = '/content/drive/My Drive/KY, FYP/Code/bert_models/bert_663.pt'
text_path = '/content/drive/My Drive/KY, FYP/Code/results/output_663.txt'

# NOTE(review): the run is seeded with 0 although both paths are named after
# seed 663 - confirm this is intended.
aclarcf(0,path,text_path)


In [0]:
# One-off run with seed 0. `path` and `text_path` are globals assigned in an
# earlier cell, so this cell only works after that cell has been executed.
aclarcfull(0,path,text_path)

In [0]:
#@title aclarc dataset (6 Classes) Initialization Transfer test


import numpy as np
import time
import datetime
import random

def flat_accuracy(preds, labels):
    """
    Fraction of rows in `preds` whose arg-max over axis 1 equals the label.

    `preds` is reduced with `np.argmax(..., axis=1)`, so it needs at least two
    axes; `labels` must be a NumPy array, since it is flattened before the
    element-wise comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)
    
def format_time(elapsed):
    '''
    Convert a duration in seconds into the string form of a
    `datetime.timedelta`, e.g. 3661.4 -> "1:01:01".

    The value is rounded to a whole number of seconds with the built-in
    `round` before formatting.
    '''
    whole_seconds = int(round(elapsed))
    return "{}".format(datetime.timedelta(seconds=whole_seconds))


directory = DATA_DIR
funcs_index = {'CompareOrContrast': 0, 'Background': 1, 'Motivation': 2, 'Uses': 3, 'Future': 4,
               'Extends': 5}

# Function dataset start
datafiles = DATA_FILES['acl-arc']
test = read_jsonl_data(datafiles['test'])
train = read_jsonl_data((datafiles['train']))


dataset = list(filter(lambda x: x['intent'] != 'Error', test + train))

texts = list(map(lambda d: d['text'], dataset))

ys = list(map(lambda d: funcs_index[d['intent']], dataset))

seed = 2
np.random.seed(seed)
# tf.python.control_flow_ops = tf
tf.compat.v1.set_random_seed(seed)

from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []

# For every sentence...
for sample in texts:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sample,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'

                        # This function also supports truncation and conversion
                        # to pytorch tensors, but we need to do padding, so we
                        # can't use these features :( .
                        #max_length = 128,          # Truncate all sentences.
                        #return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)

# Print sentence 0, now as a list of IDs.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

# Drop every sample whose encoding has more than 511 token IDs.
# The positions to keep are collected first and the three parallel lists are
# rebuilt afterwards: popping from a list while enumerating it skips the
# element that follows each removed one, so some samples were never checked.
keep_positions = []
for i, sample in enumerate(input_ids):
  if len(sample) > 511:
    print("removed Index: " +  str(i))
  else:
    keep_positions.append(i)

# Slice assignment keeps the same list objects, as the in-place pops did.
texts[:] = [texts[i] for i in keep_positions]
input_ids[:] = [input_ids[i] for i in keep_positions]
ys[:] = [ys[i] for i in keep_positions]

print('Max sentence length: ', max([len(sen) for sen in input_ids]))

# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences

# Set the maximum sequence length.
# 200 is a somewhat arbitrary cap: longer encodings are truncated to it and
# shorter ones are padded up to it below.
MAX_LEN = 200

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                          value=0, truncating="post", padding="post")

print('\nDone.')

# Create attention masks
attention_masks = []

# For each sentence...
for sent in input_ids:
    
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]
    
    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

# Use train_test_split to split our data into train and validation sets for
# training

from sklearn.model_selection import train_test_split

# Split into train(80%) and test(20%) sets
x_train, x_test , y_train, y_test = train_test_split(input_ids, ys, 
                                                random_state=seed, test_size=0.2)
# Do the same for the masks.
train_masks, test_masks, _, _ = train_test_split(attention_masks, ys,
                                             random_state=seed, test_size=0.2)

y_train_unique, indices = np.unique(y_train, return_index=True)

# Proportional Reduction
# ------------------------------------

new_x_train = []
new_y_train = []
new_train_mask = []
arr = {}
for index in range(len(funcs_index)):
    arr[index] = []
    for i, value in enumerate(y_train):
        if (value == index):
            arr[index].append(i)
    # print(index, ":", len(arr[index]))
    # sample_length = len(arr[index]) / 20  # 5% of data
    # sample_length = int(sample_length)
    sample_length = 5
    for j in range(sample_length):
        new_x_train.append(x_train[arr[index][j]])
        new_y_train.append(y_train[arr[index][j]])
        new_train_mask.append(train_masks[arr[index][j]])

new_x_train = np.asarray(new_x_train)
new_y_train = np.asarray(new_y_train)
new_train_mask = np.asarray(new_train_mask)

indices = np.arange(new_x_train.shape[0])
np.random.shuffle(indices)

new_x_train = new_x_train[indices]
new_y_train = new_y_train[indices]
new_train_mask = new_train_mask[indices]
x_train = new_x_train
y_train = new_y_train
train_masks = new_train_mask
#----------------------------------------------

# Further split train data into train(80%) and validation(20%) sets
train_masks, val_masks , _, _ = train_test_split(train_masks, x_train,
                                                   random_state=seed, test_size=0.2)

x_train, x_val , y_train, y_val = train_test_split(x_train, y_train,
                                                   random_state=seed, test_size=0.2)

# Convert all inputs and labels into torch tensors, the required datatype 
# for our model.
x_train = torch.tensor(x_train)
x_test = torch.tensor(x_test)
x_val = torch.tensor(x_val)

y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)
y_val = torch.tensor(y_val)

train_masks = torch.tensor(train_masks)
test_masks = torch.tensor(test_masks)
val_masks = torch.tensor(val_masks)


from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

batch_size = 16

# Create the DataLoader for our training set.
train_data = TensorDataset(x_train, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(x_val, val_masks, y_val)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

from transformers import (
    AdamW,
    BertConfig,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
)

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every RNG *before* the model is built. The 6-way classification head
# created below is randomly initialised, so seeding only after the model exists
# would leave its starting weights (and therefore the whole run) different on
# every execution.
seed_val = 16

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Load BertForSequenceClassification: the pretrained BERT encoder with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 6, # Number of output labels for the task trained in this cell.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Warm-start from a previously saved checkpoint. The head is temporarily swapped
# for a 3-output layer so that the checkpoint's state dict matches the model's
# parameter shapes -- presumably the checkpoint was fine-tuned on a 3-class
# task; confirm against the cell that saved it.
model.classifier = torch.nn.Linear(768,3)
path = '/content/drive/My Drive/KY, FYP/Code/bert_models/bert_663.pt'

# The model still lives on the CPU here, so load the checkpoint tensors onto the
# CPU as well instead of wherever they were saved from.
model.load_state_dict(torch.load(path, map_location='cpu'))

# Replace the 3-way head with a fresh, randomly initialised 6-way head; only the
# encoder weights carry over from the checkpoint.
model.classifier = torch.nn.Linear(768,6)

# Tell pytorch to run this model on the GPU.
model.cuda()

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Number of training epochs (authors recommend between 2 and 4)
epochs =  2

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Average training loss of each epoch, kept so the learning curve can be plotted.
loss_values = []

# For each epoch: one full training pass, then one validation pass.
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. `train` only changes the *mode* (dropout
    # etc. behave differently); it does not perform any training by itself.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Elapsed time since the start of this epoch, formatted by `format_time`.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack the batch and copy each tensor to `device`.
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step; PyTorch accumulates
        # them across backward passes unless told otherwise.
        model.zero_grad()

        # Forward pass. Because `labels` is supplied, the first element of the
        # returned tuple is the loss.
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

        # The call to `model` returns a tuple; pull the loss out of it.
        loss = outputs[0]

        # Accumulate the loss as a plain Python number so the epoch average can
        # be computed at the end.
        total_loss += loss.item()

        # Backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0 to help prevent the
        # "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters from the clipped gradients.
        optimizer.step()

        # Advance the learning-rate schedule by one step.
        scheduler.step()

    # Calculate the average loss over the training batches.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure performance on the validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables.
    eval_correct = 0.0          # sum over batches of (batch accuracy * batch size)
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Move every tensor of the batch to `device`.
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for validation; skipping them saves memory
        # and time.
        with torch.no_grad():

            # Forward pass without `labels`, so the first element of the
            # returned tuple is the logits rather than the loss.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # The "logits" are the output values prior to applying an activation
        # function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch's accuracy by its size so that a short final batch
        # does not count as much as a full one.
        # NOTE(review): assumes `flat_accuracy` returns the fraction of correct
        # predictions within the batch -- confirm against its definition.
        batch_examples = len(label_ids)
        eval_correct += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

        # Track the number of batches
        nb_eval_steps += 1

    # Example-weighted accuracy for this validation run; NaN when the
    # validation loader yielded no examples (avoids a ZeroDivisionError).
    eval_accuracy = eval_correct / nb_eval_examples if nb_eval_examples else float('nan')
    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

import pandas as pd
import sklearn.metrics as metrics

# Create the DataLoader for our test set.
# `torch.as_tensor` is a no-op when `y_test` is already a tensor, and turns it
# back into one when this code is re-run after `y_test` has been converted to a
# list at the bottom of this block.
test_labels = torch.as_tensor(y_test)
test_data = TensorDataset(x_test, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(x_test)))

# Put model in evaluation mode
model.eval()

# Per-batch logits and gold labels, in dataloader (i.e. dataset) order.
predictions , true_labels = [], []

# Predict
for batch in test_dataloader:
    # Move every tensor of the batch to `device`.
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        # Forward pass without labels: the first output is the logits.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

print('DONE.')

# Gold labels as a plain Python list (same end state for `y_test` as before).
y_test = test_labels.tolist()

# Predicted class = index of the largest logit in each row. The sampler is
# sequential, so the flattened predictions line up with `y_test`.
y_pred = [
    class_idx
    for batch_logits in predictions
    for class_idx in np.argmax(batch_logits, axis=1).tolist()
]

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, digits=4))

In [0]:
# Scratch cell: display `y_train` (presumably the training labels) as the cell
# output. NOTE(review): this prints the whole object; consider removing the
# cell or showing only a slice.
y_train

In [0]:
# Scratch cell: show how many items `new_x_train` holds. `new_x_train` is not
# defined in this part of the notebook, so the cell depends on an earlier cell
# having been run.
len(new_x_train)