In [1]:
import torch

# Confirm that the GPU is detected. The message tells Colab users how to fix
# the problem instead of leaving them with a bare AssertionError.

assert torch.cuda.is_available(), (
    "No CUDA device detected. In Colab choose Runtime > Change runtime type > GPU and re-run."
)

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

Found device: Tesla P100-PCIE-16GB, n_gpu: 1


In [2]:
!pip install transformers
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

import os
import zipfile

data_file = drive.CreateFile({'id': '1zeo8FcaNUnhN660mGMNEAPvxOE4DPOnE'})
data_file.GetContentFile('hw1.zip')

# Extract data from the zipfile and put it into the current directory
with zipfile.ZipFile('hw1.zip', 'r') as zip_file:
    zip_file.extractall('./')
os.remove('hw1.zip')
# We will use hw1 as our working directory
os.chdir('hw1')
print("Data and supporting code downloaded!")

pretrained_models_dir = './pretrained_models_dir'
if not os.path.isdir(pretrained_models_dir):
  os.mkdir(pretrained_models_dir)   # directory to save pretrained models
print('model directory created')

!pip install -r requirements.txt
print('everything set up!')

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 7.8MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 28.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 45.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=a931a5411723480d8fc

In [3]:
# coding=utf-8

import dataclasses
import logging
import math
import os
import timeit
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Tuple, Optional

import numpy as np
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm

from transformers import (
    AutoConfig,
    AutoModelWithLMHead,
    AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering,
    AutoModelForTokenClassification,
    AutoTokenizer,
    PreTrainedTokenizer,
    EvalPrediction
)
from transformers import (
    GlueDataset,
    SquadDataset,
    LineByLineTextDataset,
    TextDataset,
    DataCollatorForLanguageModeling,
)
from transformers import GlueDataTrainingArguments, SquadDataTrainingArguments
from transformers import (
    Trainer,
    TrainingArguments,
    glue_compute_metrics,
    glue_output_modes,
    glue_tasks_num_labels,
    set_seed,
)
from transformers.data.processors.squad import SquadResult
from transformers.data.metrics.squad_metrics import (
    compute_predictions_logits,
    squad_evaluate,
)
from tasks import NER
from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask

from transformers import glue_processors
from transformers.data.processors.utils import InputExample
from langdetect import detect

logger = logging.getLogger(__name__)


@dataclass
class ModelArguments:
    """
    Selects the model, config and tokenizer to fine-tune from, plus the cache locations.

    Every field has a default, so the class can be built with no arguments and
    individual attributes overridden afterwards.
    """
    # Architecture family name, e.g. "bert".
    model_type: str = field(
        metadata=dict(help="Model type, e.g., bert."),
        default="bert",
    )
    # Local path or huggingface.co model identifier.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models."),
        default="bert",
    )
    # Whether the tokenizer lower-cases its input.
    do_lower_case: Optional[bool] = field(
        metadata=dict(help="Whether you want to do lower case on input before tokenization."),
        default=False,
    )
    # Optional directory for downloaded pretrained weights.
    model_cache_dir: Optional[str] = field(
        metadata=dict(help="Where you want to store the pretrained models downloaded from s3."),
        default=None,
    )
    # Optional directory for cached task features.
    data_cache_dir: Optional[str] = field(
        metadata=dict(help="Where you want to store the cached features for the task."),
        default=None,
    )


@dataclass
class NerDataTrainingArguments:
    """
    Describes the data fed to a sequence-labeling model for training and evaluation.

    `data_dir` is the only required field; the rest have defaults.
    """

    # Required: directory holding the task's data files.
    data_dir: str = field(
        metadata=dict(help="The input data dir. Should contain data files for the task.")
    )
    # Optional path to a file listing every label.
    labels: Optional[str] = field(
        metadata=dict(help="Path to a file containing all labels for the task."),
        default=None,
    )
    # Tokenized sequences are truncated/padded to this length.
    max_seq_length: int = field(
        metadata=dict(
            help="The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        ),
        default=128,
    )
    # When True, cached training/evaluation sets are rebuilt.
    overwrite_cache: bool = field(
        metadata=dict(help="Overwrite the cached training and evaluation sets."),
        default=False,
    )


@dataclass
class LMDataTrainingArguments:
    """
    Describes the text data and masking options used for language-model training and evaluation.

    All fields are optional and carry defaults.
    """

    # Plain-text training file.
    train_data_file: Optional[str] = field(
        metadata=dict(help="The input training data file (a text file)."),
        default=None,
    )
    # Plain-text evaluation file.
    eval_data_file: Optional[str] = field(
        metadata=dict(
            help="An optional input evaluation data file to evaluate the perplexity on (a text file)."
        ),
        default=None,
    )
    # Treat each line of the file as its own sequence.
    line_by_line: bool = field(
        metadata=dict(
            help="Whether distinct lines of text in the dataset are to be handled as distinct sequences."
        ),
        default=False,
    )

    # Masked-LM objective switch and masking ratio.
    mlm: bool = field(
        metadata=dict(help="Train with masked-language modeling loss instead of language modeling."),
        default=False,
    )
    mlm_probability: float = field(
        metadata=dict(help="Ratio of tokens to mask for masked language modeling loss"),
        default=0.15,
    )
    # Sequence length after tokenization; -1 is the default sentinel.
    block_size: int = field(
        metadata=dict(
            help="Optional input sequence length after tokenization."
            "The training dataset will be truncated in block of this size for training."
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        ),
        default=-1,
    )
    # When True, cached training/evaluation sets are rebuilt.
    overwrite_cache: bool = field(
        metadata=dict(help="Overwrite the cached training and evaluation sets"),
        default=False,
    )


def get_dataset(
    args: LMDataTrainingArguments,
    tokenizer: PreTrainedTokenizer,
    evaluate: bool = False,
    cache_dir: Optional[str] = None,
):
    """Build the language-modeling dataset for either the train or the eval file.

    Params:
        **args**: data arguments; supplies the file paths, `line_by_line`,
            `block_size` and `overwrite_cache`.
        **tokenizer**: tokenizer handed to the dataset class.
        **evaluate**: when True read `args.eval_data_file`, otherwise `args.train_data_file`.
        **cache_dir**: forwarded to `TextDataset` only; unused in line-by-line mode.
    Returns:
        A `LineByLineTextDataset` when `args.line_by_line` is set, else a `TextDataset`.
    """
    if evaluate:
        source_path = args.eval_data_file
    else:
        source_path = args.train_data_file

    if not args.line_by_line:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=source_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
            cache_dir=cache_dir,
        )
    return LineByLineTextDataset(tokenizer=tokenizer, file_path=source_path, block_size=args.block_size)


# Lookup tables keyed by task type. The three tables share the same keys
# ("text_classification", "question_answering", "sequence_labeling").

# Task type -> dataclass holding that task's data arguments.
DATA_TRAINING_ARGUMENTS = {
    "text_classification": GlueDataTrainingArguments,
    "question_answering": SquadDataTrainingArguments,
    "sequence_labeling": NerDataTrainingArguments,
}


# Task type -> transformers Auto* model class used to load the model.
AUTO_MODEL = {
    "text_classification": AutoModelForSequenceClassification,
    "question_answering": AutoModelForQuestionAnswering,
    "sequence_labeling": AutoModelForTokenClassification,
}


# Task type -> dataset class used to build the train/eval datasets.
DATASET = {
    "text_classification": GlueDataset,
    "question_answering": SquadDataset,
    "sequence_labeling": TokenClassificationDataset,
}


# some functions for fine-tuning BERT on a downstream target task
def do_target_task_finetuning(model_name_or_path, task_type, output_dir, **kwargs):
    r""" Fine-tuning BERT on a downstream target task.
    Params:
        **model_name_or_path**: either:
            - a string with the `shortcut name` of a pre-trained model configuration to load from cache
                or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
            - a path to a `directory` containing a configuration file saved
                using the `save_pretrained(save_directory)` method.
            - a path or url to a saved configuration `file`.
        **task_type**: string:
            The class of the task to train, selected in
            ["text_classification", "question_answering", "sequence_labeling"].
        **output_dir**: string:
            The output directory where the model predictions and checkpoints will be written.
        **kwargs**: (`optional`) dict:
            Dictionary of key/value pairs with which to update the configuration object after loading.
            - The values in kwargs of any keys which are configuration attributes will be used
            to override the loaded values. Each key is applied to every one of the model, data
            and training argument objects that has an attribute of that name.
            - "task_name" and "data_dir" are additionally passed to the data-arguments constructor.
            - "model_load_mode" == "base_model_only" loads only the weights whose names start with
            the model type (e.g. "bert") from `<model_name_or_path>/pytorch_model.bin`.
    Returns:
        dict with the evaluation metrics (empty when evaluation is not requested).
    Raises:
        AssertionError: if `task_type` is not a supported task type.
        ValueError: if the output directory is non-empty and would be overwritten by training,
            or if the GLUE task name is unknown.
    """
    # See all possible arguments in src/transformers/training_args.py

    assert task_type in DATA_TRAINING_ARGUMENTS
    model_args = ModelArguments(model_name_or_path=model_name_or_path)
    data_args_params = {}
    for param in ["task_name", "data_dir"]:
        if param in kwargs:
            data_args_params.update({param: kwargs[param]})

    data_args = DATA_TRAINING_ARGUMENTS[task_type](**data_args_params)
    training_args = TrainingArguments(output_dir=output_dir)

    # override the loaded configs
    configs = (model_args, data_args, training_args)
    for config in configs:
        for key, value in kwargs.items():
            if hasattr(config, key):
                setattr(config, key, value)

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            f"Use --overwrite_output_dir to overcome."
        )

    # The cache directories are optional and default to None; os.path.exists(None)
    # raises TypeError, so only create the directories that were actually given.
    for p in [model_args.model_cache_dir, model_args.data_cache_dir, training_args.output_dir]:
        if p is not None and not os.path.exists(p):
            os.makedirs(p)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    logger.info("Process device: %s, n_gpu: %s", training_args.device, training_args.n_gpu)
    logger.info("Training/evaluation parameters %s", training_args)


    # Set seed
    set_seed(training_args.seed)

    if task_type == "text_classification":
        try:
            data_args.task_name = data_args.task_name.lower()
            num_labels = glue_tasks_num_labels[data_args.task_name]
            output_mode = glue_output_modes[data_args.task_name]
        except KeyError:
            raise ValueError("Task not found: %s" % (data_args.task_name))
    elif task_type == "sequence_labeling":
        token_classification_task = NER() # You might want to change this to Chunk() or POS()
        # if you are working with a Chunk or POS task, respectively
        labels = token_classification_task.get_labels(data_args.labels)
        label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
        num_labels = len(labels)

    # Load pretrained model and tokenizer

    AutoModel = AUTO_MODEL[task_type]
    auto_config_params = {
        'pretrained_model_name_or_path': model_args.model_name_or_path,
        'cache_dir': model_args.model_cache_dir,
    }

    if task_type == "text_classification":
        auto_config_params.update({
            "num_labels": num_labels,
            "finetuning_task": data_args.task_name,
        })
    elif task_type == "sequence_labeling":
        auto_config_params.update({
            "num_labels": num_labels,
            "id2label": label_map,
            "label2id": {label: i for i, label in enumerate(labels)},
        })

    config = AutoConfig.from_pretrained(**auto_config_params)

    auto_tokenizer_params = {
        "pretrained_model_name_or_path": model_args.model_name_or_path,
        "cache_dir": model_args.model_cache_dir,
        "do_lower_case": model_args.do_lower_case,
    }
    tokenizer = AutoTokenizer.from_pretrained(**auto_tokenizer_params)

    auto_model_params = {
        "pretrained_model_name_or_path": model_args.model_name_or_path,
        "from_tf": False,
        "config": config,
        "cache_dir": model_args.model_cache_dir,
    }

    if "model_load_mode" in kwargs and kwargs["model_load_mode"] == "base_model_only":
        WEIGHTS_NAME = "pytorch_model.bin"
        archive_file = os.path.join(model_args.model_name_or_path, WEIGHTS_NAME)
        # Use torch.load with map_location=torch.device() to map the pretrained model to our device.
        model_state_dict = torch.load(archive_file, map_location=torch.device(training_args.device))

        # Keep only the weights whose names start with the model type (e.g. "bert"),
        # dropping every other entry of the checkpoint.
        state_dict_with_prefix = {
            key: value
            for key, value in model_state_dict.items()
            if key.startswith(model_args.model_type)
        }

        auto_model_params.update({"state_dict": state_dict_with_prefix})

    model = AutoModel.from_pretrained(**auto_model_params)

    # Get datasets
    Dataset = DATASET[task_type]
    dataset_params = {
        "tokenizer": tokenizer,
    }
    if task_type == "sequence_labeling":
        dataset_params.update({
            "token_classification_task": token_classification_task,
            "data_dir": data_args.data_dir,
            "labels": labels,
            "model_type": model_args.model_type,
            "max_seq_length": data_args.max_seq_length
        })

    else:
        dataset_params.update({
            "args": data_args,
            "cache_dir": model_args.data_cache_dir,
        })

    # No "mode" is passed for the training set, so the dataset class's own default is used.
    train_dataset = (Dataset(**dataset_params) if training_args.do_train else None)

    dataset_params.update({"mode": Split.dev if task_type == "sequence_labeling" else "dev"})
    eval_dataset = (Dataset(**dataset_params) if training_args.do_eval else None)

    # Initialize our Trainer
    trainer_params = {
        "model": model,
        "args": training_args,
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset,
    }
    trainer = Trainer(**trainer_params)

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        if task_type == "text_classification":
            def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
                def compute_metrics_fn(p: EvalPrediction):
                    if output_mode == "classification":
                        preds = np.argmax(p.predictions, axis=1)
                    elif output_mode == "regression":
                        preds = np.squeeze(p.predictions)
                    return glue_compute_metrics(task_name, preds, p.label_ids)
                return compute_metrics_fn

            logger.info("*** Evaluate ***")
            # Loop to handle MNLI double evaluation (matched, mis-matched)
            eval_datasets = [eval_dataset]
            if data_args.task_name == "mnli":
                mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
                eval_datasets.append(
                    Dataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.data_cache_dir)
                )

            # A distinct loop variable keeps the outer `eval_dataset` from being rebound.
            for current_eval_dataset in eval_datasets:
                trainer.compute_metrics = build_compute_metrics_fn(current_eval_dataset.args.task_name)
                eval_result = trainer.evaluate(eval_dataset=current_eval_dataset)

                # NOTE(review): the file name is the same for every dataset and is opened
                # with "w", so for MNLI the mis-matched results overwrite the matched ones
                # on disk (both are still merged into the returned dict).
                output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

                eval_results.update(eval_result)

        elif task_type == "question_answering":
            # We don't use trainer.evaluate here since it currently does not support question answering tasks
            # (https://github.com/huggingface/transformers/issues/7032)
            # NOTE(review): the model and tokenizer are reloaded from model_cache_dir, not from
            # output_dir or the in-memory model; confirm this is the intended checkpoint location.
            model = AutoModel.from_pretrained(model_args.model_cache_dir)
            tokenizer = AutoTokenizer.from_pretrained(model_args.model_cache_dir, do_lower_case=model_args.do_lower_case)
            model.to(training_args.device)
            # Inference mode only needs to be set once, not once per batch.
            model.eval()


            dataset = eval_dataset.dataset
            examples = eval_dataset.examples
            features = eval_dataset.features
            eval_batch_size = training_args.per_gpu_eval_batch_size * max(1, training_args.n_gpu)

            eval_sampler = SequentialSampler(dataset)
            eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

            logger.info("*** Evaluate ***")
            description = "Evaluation"
            logger.info("***** Running %s *****", description)
            logger.info("  Num examples = %d", len(dataset))
            logger.info("  Batch size = %d", eval_batch_size)

            all_results = []
            start_time = timeit.default_timer()

            for batch in tqdm(eval_dataloader, desc=description):
                batch = tuple(t.to(training_args.device) for t in batch)

                with torch.no_grad():
                    inputs = {
                        "input_ids": batch[0],
                        "attention_mask": batch[1],
                        "token_type_ids": batch[2],
                    }
                    feature_indices = batch[3]
                    outputs = model(**inputs)

                # transformers >= 4 returns a dict-like ModelOutput by default; iterating it
                # yields string keys, so convert it to a plain tuple of tensors first.
                if hasattr(outputs, "to_tuple"):
                    outputs = outputs.to_tuple()

                for i, feature_index in enumerate(feature_indices):
                    eval_feature = features[feature_index.item()]
                    unique_id = int(eval_feature.unique_id)
                    output = [output[i].detach().cpu().tolist() for output in outputs]
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)
                    all_results.append(result)

            evalTime = timeit.default_timer() - start_time
            logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

            # Compute predictions
            output_prediction_file = os.path.join(training_args.output_dir, "predictions.json")
            output_nbest_file = os.path.join(training_args.output_dir, "nbest_predictions.json")

            output_null_log_odds_file = os.path.join(training_args.output_dir, "null_odds.json") \
                if data_args.version_2_with_negative else None

            predictions = compute_predictions_logits(
                all_examples=examples,
                all_features=features,
                all_results=all_results,
                n_best_size=data_args.n_best_size,
                max_answer_length=data_args.max_answer_length,
                do_lower_case=model_args.do_lower_case,
                output_prediction_file=output_prediction_file,
                output_nbest_file=output_nbest_file,
                output_null_log_odds_file=output_null_log_odds_file,
                verbose_logging=False,
                version_2_with_negative=data_args.version_2_with_negative,
                null_score_diff_threshold=data_args.null_score_diff_threshold,
                tokenizer=tokenizer,
            )

            # Compute the F1 and exact scores.
            eval_result = squad_evaluate(examples, predictions)

            output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in eval_result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)


        elif task_type == "sequence_labeling":
            # Label id that marks positions to skip; looked up once instead of
            # constructing a loss module for every token.
            ignore_index = torch.nn.CrossEntropyLoss().ignore_index

            def align_predictions(
                predictions: np.ndarray, label_ids: np.ndarray
            ) -> Tuple[List[List[str]], List[List[str]]]:
                """Turn per-token scores and label ids into per-sequence label-name lists,
                skipping positions whose label id equals `ignore_index`."""
                preds = np.argmax(predictions, axis=2)
                batch_size, seq_len = preds.shape
                label_list = [[] for _ in range(batch_size)]
                pred_list = [[] for _ in range(batch_size)]

                for i in range(batch_size):
                    for j in range(seq_len):
                        if label_ids[i, j] != ignore_index:
                            label_list[i].append(label_map[label_ids[i][j]])
                            pred_list[i].append(label_map[preds[i][j]])
                return pred_list, label_list

            def compute_metrics_fn(p: EvalPrediction) -> Dict:
                pred_list, label_list = align_predictions(p.predictions, p.label_ids)
                return {
                    "accuracy_score": accuracy_score(label_list, pred_list),
                    "precision": precision_score(label_list, pred_list),
                    "recall": recall_score(label_list, pred_list),
                    "f1": f1_score(label_list, pred_list),
                }

            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in eval_result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

        else:
            raise ValueError("Invalid task type.")
    return eval_results


print('setup complete')

setup complete


In [4]:
# Fetch the zip archive from Google Drive by file id, using the `drive` client
# created in the setup cell, and save it under a local file name.
data_file = drive.CreateFile({'id': '19cnGSN88KlRJRcIqwxw3C4ylJftdkZ2W'})
data_file.GetContentFile('bert-base-cased-finetuned-squad.zip')

# Extract the data from the zipfile and put it into pretrained_models_dir
with zipfile.ZipFile('bert-base-cased-finetuned-squad.zip', 'r') as zip_file:
    zip_file.extractall(pretrained_models_dir)
# The archive is no longer needed once extracted.
os.remove('bert-base-cased-finetuned-squad.zip')
print("bert-base-cased-finetuned-squad downloaded!")

bert-base-cased-finetuned-squad downloaded!


In [5]:
import torch.nn as nn
from torch import sigmoid
import torch.nn.functional as F
import numpy as np

from sklearn.metrics import accuracy_score

def train(model, x, y, optimizer, criterion):
    """Run one optimization step on a single batch.

    Clears the model's gradients, does a forward pass on `x`, scores it against
    `y` with `criterion`, back-propagates and applies `optimizer.step()`.

    Returns:
        (loss, output): the loss tensor and the model output for `x`.
    """
    model.zero_grad()

    prediction = model(x)
    batch_loss = criterion(prediction, y)

    batch_loss.backward()
    optimizer.step()

    return batch_loss, prediction

def eval(model, x, y=None, criterion=None):
    """Run `model` on `x` for inference and optionally compute a loss.

    The forward pass runs under `torch.no_grad()` so no autograd graph is built.
    When `model` is an `nn.Module` it is put in evaluation mode for the call
    (so dropout is inactive) and each submodule's previous mode is restored
    afterwards.

    Note: this function shadows the `eval` builtin; the name is kept because
    other cells call it.

    Params:
        model: callable (normally an `nn.Module`) applied to `x`.
        x: model input.
        y: optional target; when given, `criterion(output, y)` is computed.
        criterion: loss callable; required when `y` is given.
    Returns:
        (loss, output): `loss` is the integer 0 when `y` is None, otherwise the
        value returned by `criterion`.
    Raises:
        ValueError: if `y` is given without a `criterion`.
    """
    if y is not None and criterion is None:
        raise ValueError("criterion must be provided when y is given")

    # Remember every submodule's mode so the call leaves the model as it found it.
    previous_modes = []
    if isinstance(model, torch.nn.Module):
        previous_modes = [(module, module.training) for module in model.modules()]
        model.eval()

    try:
        with torch.no_grad():
            output = model(x)
            loss = 0
            if y is not None:
                loss = criterion(output, y)
    finally:
        for module, was_training in previous_modes:
            module.training = was_training
    return loss, output

class Network2(nn.Module):
    """Single-layer binary classifier over 768-dimensional vectors.

    Applies dropout (p=0.1), a 768 -> 1 linear layer and a sigmoid, so the
    output is a value in (0, 1) per input vector. Attribute names are kept
    unchanged so previously saved instances still load.
    """

    def __init__(self):
        super().__init__()
        self.dropout_layer = nn.Dropout(p=0.1)
        self.fully_connected = nn.Linear(768, 1)

    def forward(self,x):
        """Return sigmoid(linear(dropout(x))); the last dimension of `x` must be 768."""
        x = self.dropout_layer(x)
        x = self.fully_connected(x)
        return torch.sigmoid(x)

    def predict_label(self,pred):
        """Round sigmoid outputs to hard 0/1 labels.

        Params:
            pred: tensor of scores, e.g. the output of `forward`.
        Returns:
            A CPU tensor detached from the autograd graph, with the same shape
            as `pred` and every element rounded to the nearest integer (ties go
            to the even value, so 0.5 becomes 0).
        """
        # Vectorized torch.round replaces a per-element np.round loop on torch
        # tensors, which was fragile and rebuilt the tensor from a Python list.
        return torch.round(pred.detach()).cpu()

In [6]:
from torch.optim import Adam

# Build a fresh classifier together with its loss and optimizer.
net_model = Network2()
criterion = nn.BCELoss()
optm = Adam(net_model.parameters(), lr = 5e-4)

# NOTE(review): this rebinds net_model to the object loaded from disk, so the
# Network2() instance created above is discarded and `optm` still references
# the discarded instance's parameters.
# torch.load unpickles the file; only load checkpoints from a trusted source.
net_model = torch.load("/content/classifier_BERT.pt")

In [7]:
from transformers import BertTokenizerFast, AutoModel

# Load the tokenizer and the bare encoder used to embed words for the classifier.
model_name_or_path = "bert-base-uncased"
cache_dir = os.path.join(pretrained_models_dir, model_name_or_path)
# NOTE(review): cache_dir is passed to the model below but not to the tokenizer.
tokenizer = BertTokenizerFast.from_pretrained(model_name_or_path, add_special_tokens=False)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_Bert = AutoModel.from_pretrained(model_name_or_path, cache_dir=cache_dir)
model_Bert = model_Bert.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [8]:
def convert_into_words (sent) :
  """Split `sent` on single space characters, keeping character offsets.

  Only ' ' acts as a separator; runs of spaces produce no empty words.

  Returns:
      A list of [start, end, word] lists, where `start` is the index of the
      word's first character and `end` is one past its last character.
  """
  spans = []
  word_start = None   # index where the current word began, or None between words

  for pos, ch in enumerate(sent):
      if ch == ' ':
          if word_start is not None:
            spans.append([word_start, pos, sent[word_start:pos]])
            word_start = None
      elif word_start is None:
          word_start = pos

  # a trailing word has no closing space, so flush it here
  if word_start is not None:
    spans.append([word_start, len(sent), sent[word_start:]])

  return spans


def getEmbeddings (sent) :
  """Return one encoder vector per space-separated word of `sent`.

  The sentence is tokenized once (without adding special tokens), passed
  through `model_Bert`, and each word produced by `convert_into_words` is
  represented by the vector of the first token whose start offset equals the
  word's start offset.

  Params:
      sent: the input text.
  Returns:
      (word_embeddings, too_long):
        - `([[word, vector], ...], False)` on success, where `vector` is taken
          from the first element of the model output at the matched position;
        - `(None, True)` when the sentence has 512 or more tokens.
  Raises:
      ValueError: if a word's start offset matches no remaining token offset.
  """
  # Tokenize once: the same call provides the ids and the character offsets.
  encoding = tokenizer.encode_plus(
      sent, return_offsets_mapping=True, add_special_tokens=False, return_tensors="pt"
  )
  input_ids = encoding['input_ids'].to(device)

  num_tokens = input_ids.shape[1]

  if num_tokens >= 512 :
    return None, True

  # Start offset of every token, as plain ints so the matching loop below
  # does not compare against tensor elements.
  token_starts = [int(span[0]) for span in encoding['offset_mapping'][0].tolist()]

  # First element of the model output, indexed below as [batch][token].
  # NOTE(review): model_Bert is loaded with AutoModel, so these are presumably
  # hidden states rather than masked-LM logits; confirm.
  # Inference only, so skip building the autograd graph.
  with torch.no_grad():
    token_vectors = model_Bert(input_ids)[0]

  word_with_offset = convert_into_words(sent)

  # Walk tokens and words in lock-step; a word maps to the first not-yet-consumed
  # token that starts at the same character offset.
  tind = 0
  ret_word_embedding = []
  for word_start, _word_end, cur_word in word_with_offset :
    while tind < num_tokens and token_starts[tind] != word_start :
      tind += 1
    if tind >= num_tokens :
      raise ValueError(
          "could not align word %r at offset %d with any token" % (cur_word, word_start)
      )
    ret_word_embedding.append([cur_word, token_vectors[0][tind]])
    tind += 1

  return ret_word_embedding , False

In [9]:
# Demo: embed one example and print the classifier's 0/1 label for every word.
cur_sentence = "[CLS] what is throat cancer ? [SEP] is it treatable ?"
em = getEmbeddings(cur_sentence)
# getEmbeddings returns (word_embeddings, too_long); keep only the embeddings.
em = em[0]
for e in em:
    # Switch the classifier to evaluation mode before scoring.
    net_model = net_model.eval()        
    # e is a [word, vector] pair; no target is passed, so loss is 0.
    loss, output = eval(net_model, e[1])
    y_pred = net_model.predict_label(output.cpu())
    print(e[0],y_pred)

[CLS] tensor([0.])
what tensor([0.])
is tensor([0.])
throat tensor([0.])
cancer tensor([1.])
? tensor([0.])
[SEP] tensor([0.])
is tensor([0.])
it tensor([0.])
treatable tensor([0.])
? tensor([0.])


In [10]:
def getRelevantWords(full=None, history = None, cur=None): # either give full sentence, or give cur question + history
    """Collect the words that net_model labels as relevant (label 1).

    Either pass `full` (text that already contains "[SEP]"), or pass
    `history` and `cur`, which are combined as
    "[CLS] <history> [SEP] <cur>". "[CLS] " is always prepended.

    Words are examined in order starting after the first entry returned by
    `getEmbeddings` (presumably the "[CLS]" word) and the scan stops once the
    "[SEP]" word has been processed, so only the part before "[SEP]" can
    contribute.

    Args:
        full: complete text to analyse, or None to build it from the parts.
        history: text placed before "[SEP]" (used only when `full` is None).
        cur: text placed after "[SEP]" (used only when `full` is None).

    Returns:
        `(rel_words, rel_phrase)`: the list of relevant words, and the same
        words concatenated with a single leading space before each one.
        When `getEmbeddings` reports the input as too long, `([], "")` is
        returned instead of failing on the missing embeddings.
    """
    if full is None:
        full = "[CLS] " + history + " [SEP] " + cur
    else:
        full = "[CLS] " + full

    em, too_long = getEmbeddings(full)
    if too_long:
        # getEmbeddings returns None for over-long inputs; treat that as
        # "nothing relevant" rather than slicing None.
        return [], ""

    # Make the classifier's mode explicit so the result does not depend on an
    # earlier cell having switched it to evaluation mode.
    net_model.eval()

    rel_words = []
    for word, embedding in em[1:]:
        # `eval` is the notebook's own helper (model, tensor) -> (loss, output),
        # not the Python builtin.
        _loss, output = eval(net_model, embedding)
        y_pred = net_model.predict_label(output.cpu())
        if y_pred == 1:
            rel_words.append(word)
        if word == "[SEP]":
            break

    rel_phrase = "".join(" " + word for word in rel_words)
    return rel_words, rel_phrase

In [11]:
# Demo of getRelevantWords on a pre-assembled "<history> [SEP] <question>" text.
cur_sentence = "what is throat cancer ? [SEP] is it treatable ?"
temp, temp_phrase = getRelevantWords(full=cur_sentence)
print(temp, temp_phrase)

['cancer']  cancer


In [17]:
# Load the candidate passages: each line is "<question id>\t<passage text...>".
with open("/content/coref_answers_10.txt") as f :
  passages_list = [raw_line.strip() for raw_line in f.readlines()]

# Group passages by question id, keeping file order within each group.
# Any further tab-separated fields are glued back together with spaces.
question_passage_map = {}
for entry in passages_list:
    qid, *passage_fields = entry.split('\t')
    question_passage_map.setdefault(qid, []).append(' '.join(passage_fields))

print(question_passage_map['76_2'])

['Traditional Differences Between Genre and Literary Fiction. The traditional differences between genre and literary fiction are outlined in the table below. Keep in mind these are broad-brush differences that in some cases focus on the extremes of the categories and that disagreements abound. Many genre books have elements of literary fiction and vice versa and there is increasingly a third category of crossover fiction that will be discussed below.', 'Literary Devices, Elements, Techniques, and Terms. A literary devise is any tool used in literature to help the reader understand the story and its character(s). There are two types of literary devises used by authors, literary elements and literary techniques. Literary elements. provide structure to the literature. All literary elements are evident in all literature that creates a story. Literary techniques. are specific to each author.', 'Bourdieu\'s principle of habitus is interwoven with the concept of structuralism in literary theo

In [18]:
# Read the resolved questions: column 0 is the question id, column 1 the text.
with open("/content/evaluation_topics_annotated_resolved_v1.0.tsv") as f :
  temp = f.readlines()

coref_questions = []
question_ids = []
for row in temp :
  fields = row.split('\t')
  question_text = fields[1].strip()
  question_id = fields[0].strip()
  coref_questions.append(question_text)
  question_ids.append(question_id)

print(len(coref_questions))
print(coref_questions)

print(len(question_ids))
print(question_ids)

479
['What is throat cancer?', 'Is throat cancer treatable?', 'Tell me about lung cancer.', "What are lung cancer's symptoms?", 'Can lung cancer spread to the throat?', 'What causes throat cancer?', 'What is the first sign of throat cancer?', 'Is throat cancer the same as esophageal cancer?', "What's the difference in throat cancer and esophageal cancer's symptoms?", 'What are the different types of sharks?', 'Are sharks endangered? If so, which species?', 'Tell me more about tiger sharks.', 'What is the largest shark ever to have lived on Earth?', "What's the biggest shark ever caught?", 'What about for great whites?', 'Tell me about Mako sharks.', 'What are Mako shark adaptations?', 'Where do Mako sharks live?', 'What do Mako sharks eat?', 'How do Mako sharks compare with Tiger sharks for being dangerous?', 'Tell me about the Neverending Story film.', 'What is the Neverending Story film about?', 'How was the Neverending Story film received?', 'Did the Neverending Story film win any a

In [23]:
# Extract one answer span per question with a SQuAD-finetuned QA model.
# For each question the candidate passages are tried in order and the first
# non-empty span that is not just "[CLS]" is accepted; '' is recorded when no
# passage yields one.
task_name = "SQuAD"
model_name_or_path = "bert-base-cased-finetuned-squad"
pretrained_weights = os.path.join(pretrained_models_dir, model_name_or_path)
task_type = "question_answering"
model = AUTO_MODEL[task_type].from_pretrained(pretrained_weights)
# NOTE(review): this rebinds the global `tokenizer`, which getEmbeddings also
# reads - any later call to getEmbeddings/getRelevantWords will use this
# tokenizer. Confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(pretrained_weights)

accepted_answers = []

for question_id, question in zip(question_ids, coref_questions):
    context = question_passage_map[question_id]
    print(len(context))

    answer = ''  # stays empty when no passage produces a usable span
    for passage in context:
        inputs = tokenizer.encode_plus(question, passage, add_special_tokens=True, return_tensors="pt", max_length = 512, truncation=True)
        input_ids = inputs["input_ids"].tolist()[0]

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            output = model(**inputs)

        start_logits = output['start_logits']
        end_logits = output['end_logits']

        # Highest-scoring start/end positions, as plain ints for slicing.
        start_ind = int(torch.topk(start_logits, 1)[1])
        end_ind = int(torch.topk(end_logits, 1)[1])

        # An end position before the start position gives an empty slice,
        # which is rejected below as ''.
        output_ids = input_ids[start_ind:end_ind + 1]
        output_tokens = tokenizer.convert_ids_to_tokens(output_ids)
        output_string = tokenizer.convert_tokens_to_string(output_tokens)

        print(question)
        print(output_string)
        print()

        if output_string != '[CLS]' and output_string != '':
            answer = output_string
            break

    accepted_answers.append(answer)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
What is a DC half smoke?
[CLS]

What is a DC half smoke?
[CLS]

10
Tell me about DC half smoke's history.
[CLS]

Tell me about DC half smoke's history.
[CLS]

Tell me about DC half smoke's history.
[CLS]

Tell me about DC half smoke's history.
[CLS]

Tell me about DC half smoke's history.
[CLS]

Tell me about DC half smoke's history.
[CLS]

Tell me about DC half smoke's history.
[CLS]

Tell me about DC half smoke's history.
[CLS]

Tell me about DC half smoke's history.
[CLS]

Tell me about DC half smoke's history.
[CLS]

10
Describe the oceanic crust.
[CLS]

Describe the oceanic crust.
[CLS]

Describe the oceanic crust.
[CLS]

Describe the oceanic crust.
[CLS]

Describe the oceanic crust.
[CLS]

Describe the oceanic crust.
[CLS]

Describe the oceanic crust.
[CLS]

Describe the oceanic crust.
[CLS]

Describe the oceanic crust.
[CLS]

Describe the oceanic crust.
[CLS]

10
What are the main layers in the oceanic crust?
crust

In [24]:
# Show the question ids collected from column 0 of the resolved-questions TSV.
print(question_ids)

['31_1', '31_2', '31_3', '31_4', '31_5', '31_6', '31_7', '31_8', '31_9', '32_1', '32_2', '32_3', '32_4', '32_5', '32_6', '32_7', '32_8', '32_9', '32_10', '32_11', '33_1', '33_2', '33_3', '33_4', '33_5', '33_6', '33_7', '33_8', '33_9', '33_10', '34_1', '34_2', '34_3', '34_4', '34_5', '34_6', '34_7', '34_8', '34_9', '35_1', '35_2', '35_3', '35_4', '35_5', '35_6', '35_7', '35_8', '35_9', '36_1', '36_2', '36_3', '36_4', '36_5', '36_6', '36_7', '36_8', '36_9', '36_10', '36_11', '37_1', '37_2', '37_3', '37_4', '37_5', '37_6', '37_7', '37_8', '37_9', '37_10', '37_11', '37_12', '38_1', '38_2', '38_3', '38_4', '38_5', '38_6', '38_7', '38_8', '39_1', '39_2', '39_3', '39_4', '39_5', '39_6', '39_7', '39_8', '39_9', '40_1', '40_2', '40_3', '40_4', '40_5', '40_6', '40_7', '40_8', '40_9', '40_10', '41_1', '41_2', '41_3', '41_4', '41_5', '41_6', '41_7', '41_8', '41_9', '42_1', '42_2', '42_3', '42_4', '42_5', '42_6', '42_7', '42_8', '43_1', '43_2', '43_3', '43_4', '43_5', '43_6', '43_7', '43_8', '44_1'

In [25]:
# Sanity check: the extraction loop appends exactly one entry (possibly '') per question.
print(len(accepted_answers))

479


In [26]:
# Save the extracted answers, one per line (an empty line means no answer was accepted).
with open('/content/accepted_answers.txt', 'w') as f:
    f.writelines(answer + '\n' for answer in accepted_answers)

In [27]:
# Read the filtered answers back, one per line (newlines are kept here).
# NOTE(review): no cell visible here writes this file - presumably it is an
# externally filtered copy of accepted_answers.txt; confirm its provenance.
with open('/content/accepted_answers_filtered.txt') as f:
    accepted_answers_filtered = list(f)

In [28]:
# Generate QA List
# Each filtered answer is stripped and cut down to at most its first 10
# space-separated words.
QA_answer_list = [
    " ".join(answer_line.strip().split(" ")[:10])
    for answer_line in accepted_answers_filtered
]

In [29]:
# Inspect the truncated answers: how many there are, then the full list.
print(len(QA_answer_list))
print(QA_answer_list)

479
['any cancer that forms in the throat', '', '', '', '', '', 'severe throat pain', 'a subset', 'dysphagia', '', '', '', 'blue whale', 'carcharodon megalodon', '', '', 'they are agile hunters ; built for the speed and', '', 'large marine fish', 'galeocerdo', '', '', 'it was released in 1984', '', '', "michael ende's novel the neverending story ( 1979", 'synthpop', 'bastian bux', '', 'died later that year.', '', 'placenames and personal names', '', 'europoids of the afanasevo culture', 'the myth of the ages follows. in the hesiodic scheme', 'wanda and horowitz separated in 1948 while the pianist was', 'environmental causes', 'sumerian, akkadian, babylonian and assyrian', 'the bronze age', '', 'french toile', '', '', '', 'late 19th century', '', 'in her review of the concert at the plymouth pavilions,', '', '', 'the people in each state vote for the president', 'us electoral college', 'according to a legal expert, one example of why the', 'changed the method that determines how the us 

In [31]:
import json

# Load the evaluation topics; the `with` block closes the file even if
# json.load raises.
with open('/content/evaluation_topics_v1.0.json', 'r') as jsonFile:
    values = json.load(jsonFile)

# One inner list per topic, holding the raw utterance of each turn in order.
sentences = [[t['raw_utterance'] for t in v['turn']] for v in values]
print(sentences)

[['What is throat cancer?', 'Is it treatable?', 'Tell me about lung cancer.', 'What are its symptoms? ', 'Can it spread to the throat?', 'What causes throat cancer?', 'What is the first sign of it?', 'Is it the same as esophageal cancer?', "What's the difference in their symptoms?"], ['What are the different types of sharks?', 'Are sharks endangered?  If so, which species?', 'Tell me more about tiger sharks.', 'What is the largest ever to have lived on Earth?', "What's the biggest ever caught?", 'What about for great whites?', 'Tell me about makos.', 'What are their adaptations?', 'Where do they live?', 'What do they eat?', 'How do they compare with tigers for being dangerous?'], ['Tell me about the Neverending Story film.', 'What is it about?', 'How was it received?', 'Did it win any awards?', 'Was it a book first?', 'Who was the author and when what it published?', 'What are the main themes?', 'Who are the main characters?', 'What are the differences between the book and movies?', 'D

In [32]:
# Expand every raw question with the words getRelevantWords picks from the
# conversation history (earlier questions of the topic plus their QA answers).
QA_updated_questions = []
index = 0  # running position in QA_answer_list, shared across all topics

for topic_turns in sentences:
  history = ""
  expanded_turns = []
  for utterance in topic_turns :
    rel_list, rel_phrase = getRelevantWords(full=None, history=history, cur=utterance)
    # The history is extended only after the lookup, so the current turn
    # never sees its own question or answer.
    history = history + utterance + " " + QA_answer_list[index] + " "
    index = index + 1
    expanded_question = utterance + rel_phrase
    print(expanded_question)
    expanded_turns.append(expanded_question)
  QA_updated_questions.append(expanded_turns)

  print("------------------------------------")

What is throat cancer?
Is it treatable?
Tell me about lung cancer.
What are its symptoms? 
Can it spread to the throat?
What causes throat cancer?
What is the first sign of it?
Is it the same as esophageal cancer?
What's the difference in their symptoms?
------------------------------------
What are the different types of sharks?
Are sharks endangered?  If so, which species? sharks?
Tell me more about tiger sharks.
What is the largest ever to have lived on Earth?
What's the biggest ever caught?
What about for great whites?
Tell me about makos.
What are their adaptations?
Where do they live?
What do they eat?
How do they compare with tigers for being dangerous?
------------------------------------
Tell me about the Neverending Story film.
What is it about?
How was it received?
Did it win any awards?
Was it a book first?
Who was the author and when what it published?
What are the main themes?
Who are the main characters?
What are the differences between the book and movies?
Did the horse

In [33]:
# Write one query entry per turn, in the form
#   {"number" : "<topic>_<turn>", "text" : "#combine(<expanded question>)"},
# Topics are numbered consecutively from 31 and turns from 1 within a topic.
# The values go through json.dumps so that quotes or backslashes inside a
# question are escaped and each entry stays valid JSON; ensure_ascii=False
# keeps non-ASCII characters as they are.
# NOTE(review): the file is opened in append mode, so re-running this cell
# adds the same entries again - confirm that is intended.
with open('/content/qa_updated.txt', 'a') as f:
    for n, cur in enumerate(QA_updated_questions, start=31):
        for i, expanded_question in enumerate(cur, start=1):
            number = json.dumps(str(n) + '_' + str(i))
            text = json.dumps('#combine(' + expanded_question + ')', ensure_ascii=False)
            f.write('{"number" : ' + number + ', "text" : ' + text + '},\n')