In [3]:
import torch

# Confirm that the GPU is detected

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

Found device: Tesla T4, n_gpu: 1


In [1]:
!pip install transformers
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

import os
import zipfile

data_file = drive.CreateFile({'id': '1zeo8FcaNUnhN660mGMNEAPvxOE4DPOnE'})
data_file.GetContentFile('hw1.zip')

# Extract data from the zipfile and put it into the current directory
with zipfile.ZipFile('hw1.zip', 'r') as zip_file:
    zip_file.extractall('./')
os.remove('hw1.zip')
# We will use hw1 as our working directory
os.chdir('hw1')
print("Data and supporting code downloaded!")

pretrained_models_dir = './pretrained_models_dir'
if not os.path.isdir(pretrained_models_dir):
  os.mkdir(pretrained_models_dir)   # directory to save pretrained models
print('model directory created')

!pip install -r requirements.txt
print('everything set up!')

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 8.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 25.1MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 43.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=1ded6933414c5

In [4]:
from transformers import BertTokenizerFast, AutoModel

model_name_or_path = "bert-base-uncased"
cache_dir = os.path.join(pretrained_models_dir, model_name_or_path)
tokenizer = BertTokenizerFast.from_pretrained(model_name_or_path, add_special_tokens=False)
model = AutoModel.from_pretrained(model_name_or_path, cache_dir=cache_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print('success!')

success!


In [6]:
'''
The data is loaded from sentences.txt and labels.txt
eg.
sentences.txt (1 line) : 
where is malayali located ? 30,803,747 speakers of malayalam in kerala , making up 93.2 % of the total number of malayalam speakers in india . [SEP] what other languages are spoken there ?
labels.txt (1 line) :
O O REL O O O O O REL O O O O O O O O O O O O REL O O O O [SEP] O O O O O O O
'''

with open("/content/sentences.txt") as f:
    content = f.readlines()
content = ["[CLS] " + x.strip() for  x in content] 

with open("/content/labels.txt") as f:
    labels_text = f.readlines()

# Stores a list of relevance labels for all sentences in the training set
labels = []

for temp in labels_text :
    rel_vals = temp.split(' ') 
    label_temp = []
    for token_val in rel_vals :
      if(token_val=="O") :
        label_temp.append(0)
      elif(token_val=="REL") :
        label_temp.append(1)
      elif(token_val=="[SEP]") :
        break
      else :
        label_temp.append(-1)
        print("Error")

    labels.append(label_temp)

In [7]:
def convert_into_words (sent) :  
  word_with_offset = []
  start = end = 0
  w = ''
  for i in range(len(sent)):
      if sent[i] == ' ':
          if(w != '') :
            word_with_offset.append([start,end,w])
          start = i + 1
          w = ''
      else:
          w = w + sent[i]
      end = end + 1

  # last word always ends without space
  if(w != '') :
    word_with_offset.append([start,end,w])

  return word_with_offset


def getEmbeddings (sent) :
  # Encode the input sentence and get the model's output
  input = tokenizer.encode(sent, return_tensors="pt",add_special_tokens=False).to(device)

  num_tokens = len(input[0])
  
  if num_tokens >= 512 :
    return None, True

  temp = tokenizer.encode_plus(sent,return_offsets_mapping=True, add_special_tokens=False, return_tensors="pt").to(device)
  input_offset = temp['offset_mapping'][0]
  input_tokens = tokenizer.tokenize(sent,return_tensors="pt")

  # The model outputs the masked language modeling logits of shape 
  # [batch_size, sequence_length, vocab_size] 
  token_logits = model(input)[0]

  token_with_offset = []  
  for i in range(len(input_offset)) :
    token_with_offset.append( [input_offset[i][0], input_offset[i][1], input_tokens[i]] )

  tind = 0
  output_emb = []

  word_with_offset = convert_into_words(sent)

  for v in word_with_offset :
    cur_word = v[2]
    flag = True
    while(flag) :
      if(v[0] == token_with_offset[tind][0]) :
        #print(cur_word,token_with_offset[tind])
        output_emb.append([cur_word,tind])
        flag = False
      tind += 1

  ret_word_embedding = []

  for token_pair in output_emb :
    word = token_pair[0]
    word_embedding = token_logits[0][token_pair[1]]
    # word_embedding = word_embedding.reshape([1, word_embedding.shape[0]])
    ret_word_embedding.append([word, word_embedding])

  return ret_word_embedding , False

In [8]:
'''
This cell is used to create the dataset as form of (BERT encoding , label) pairs
that will be used to train the classifier
'''

index = 0
train_embed = []
train_labels = []

for sentence in content:
  with torch.no_grad():
      embed, is_exceed = getEmbeddings(sentence)
      
      if(not is_exceed) :
        for temp in embed :
          word = temp[0]
          word_emb = temp[1]
          if(word == "[CLS]"):
            continue
          elif(word == "[SEP]"):
            break
          else :
            train_embed.append(word_emb)
        train_labels.extend(labels[index])

      if(is_exceed) :
        print(index , embed)

      index += 1
      if(index % 1000==0):
        print(index)

      torch.cuda.empty_cache()

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
16799 None
17000
18000
19000
20000


In [9]:
train_embed = torch.stack(train_embed, dim=0)
train_embed = train_embed.to(device)
train_labels = torch.tensor(train_labels)
print(train_embed.shape)
print(train_labels.shape)
#print(train_embed[0])

torch.Size([1805293, 768])
torch.Size([1805293])


In [10]:
train_frac = 0.7
valid_frac = 0.2
test_frac = 1 - train_frac - valid_frac

train_split = int(train_frac * len(train_labels))
valid_split = int(valid_frac * len(train_labels))
test_split = int(test_frac * len(train_labels))

index = 0
train_start = index
index += train_split
train_end = index

valid_start = index 
index += valid_split
valid_end = index

test_start = index
index += test_split
test_end = index

print(train_start, train_end)
print(valid_start, valid_end)
print(test_start, test_end)

0 1263705
1263705 1624763
1624763 1805292


In [11]:
x_train = train_embed[train_start : train_end]
y_train = train_labels[train_start : train_end]

x_valid = train_embed[valid_start : valid_end]
y_valid = train_labels[valid_start : valid_end]

x_test = train_embed[test_start : test_end]
y_test = train_labels[test_start : test_end]

x_train = x_train.to(device)
y_train = y_train.to(device)

x_valid = x_valid.to(device)
y_valid = y_valid.to(device)

x_test = x_test.to(device)
y_test = y_test.to(device)

print(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape, x_test.shape, y_test.shape)

torch.Size([1263705, 768]) torch.Size([1263705]) torch.Size([361058, 768]) torch.Size([361058]) torch.Size([180529, 768]) torch.Size([180529])


In [12]:
del train_embed
torch.cuda.empty_cache()

In [13]:
from torch.utils.data import Dataset

class WordDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, data_tensor, label_tensor):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data = data_tensor
        self.labels = label_tensor

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        item = {'data': self.data[idx], 'label': self.labels[idx]}
        return item

In [14]:
train_ds = WordDataset(x_train, y_train)
valid_ds = WordDataset(x_valid, y_valid)
test_ds = WordDataset(x_test, y_test)

In [15]:
BATCH_SIZE = 3072

train_dl = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE)
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE)

In [16]:
import torch.nn as nn
from torch import sigmoid
import torch.nn.functional as F
import numpy as np

from sklearn.metrics import accuracy_score

def train(model, x, y, optimizer, criterion): 
    model.zero_grad()
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    return loss, output

def eval(model, x, y=None, criterion=None):
    output = model(x)
    loss = 0
    if y is not None:
        loss = criterion(output, y)
    return loss, output

class Network2(nn.Module):
    def __init__(self):
        super().__init__()
        self.dropout_layer = nn.Dropout(p=0.1) 
        self.fully_connected = nn.Linear(768, 1)      

    def forward(self,x):
        x = self.dropout_layer(x)
        x = self.fully_connected(x)
        x = sigmoid(x) 
        return x

    def predict_label(self,pred):
        ans = []
        for t in pred:
            ans.append(np.round(t.detach()))
        return torch.tensor(ans)

In [17]:
from torch.optim import Adam

net2 = Network2()
criterion = nn.BCELoss()
optm = Adam(net2.parameters(), lr = 5e-4)
net2 = net2.to(device)

In [18]:
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

net2 = net2.to(device)

epochs = 5
for epoch in range(epochs):
    net2 = net2.train()

    for bidx, batch in enumerate(train_dl):
        x, y = batch['data'], batch['label']
        y = y.float()
        y = y.reshape((y.shape[0], 1))
        loss, output = train(net2, x, y, optm, criterion)
        y_pred = net2.predict_label(output.cpu())
        y_true = y.cpu()
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        if bidx % 200 == 0:
            print("TRAIN : Iteration: " + str(bidx) + \
                  ", F1-Score: " + str(f1) + \
                  ", Accuracy: " + str(acc) + \
                  ", Loss: " + str(loss.cpu().item()))

    net2 = net2.eval()
        
    y_valid = y_valid.float()
    y_valid = y_valid.reshape((y_valid.shape[0], 1))

    loss, output = eval(net2, x_valid, y_valid, criterion)

    y_true = y_valid.cpu()
    y_pred = net2.predict_label(output.cpu())

    acc_total = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, normalize='pred')

    print("\nVALID : Epoch: " + str(epoch) + \
          ", Accuracy_0: " + str(cm[0][0]) + \
          ", Accuracy_1: " + str(cm[1][1]) + \
            ", F1-Score: " + str(f1) + \
            ", Accuracy: " + str(acc) + \
            ", Loss: " + str(loss.cpu().item()))
    print("======================================================================")

TRAIN : Iteration: 0, F1-Score: 0.11595221363316936, Accuracy: 0.18098958333333334, Loss: 0.8398141860961914
TRAIN : Iteration: 200, F1-Score: 0.0, Accuracy: 0.9599609375, Loss: 0.15133829414844513
TRAIN : Iteration: 400, F1-Score: 0.046875, Accuracy: 0.9602864583333334, Loss: 0.12214729189872742

VALID : Epoch: 0, Accuracy_0: 0.9569074423177656, Accuracy_1: 0.7275190933727519, F1-Score: 0.2637078049651724, Accuracy: 0.945193171608266, Loss: 0.12942616641521454
TRAIN : Iteration: 0, F1-Score: 0.12195121951219513, Accuracy: 0.9296875, Loss: 0.18225789070129395
TRAIN : Iteration: 200, F1-Score: 0.25477707006369427, Accuracy: 0.9619140625, Loss: 0.1186826229095459
TRAIN : Iteration: 400, F1-Score: 0.2709677419354839, Accuracy: 0.9632161458333334, Loss: 0.11184024065732956

VALID : Epoch: 1, Accuracy_0: 0.9649516026658204, Accuracy_1: 0.7326295114166462, F1-Score: 0.4507042253521127, Accuracy: 0.9389038634321654, Loss: 0.11706885695457458
TRAIN : Iteration: 0, F1-Score: 0.1804511278195489,

In [19]:
net2 = net2.eval()
        
y_test = y_test.float()
y_test = y_test.reshape((y_test.shape[0], 1))

loss, output = eval(net2, x_test, y_test, criterion)

y_true = y_test.cpu()
y_pred = net2.predict_label(output.cpu())

acc_total = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred, normalize='pred')

print("\nVALID : Epoch: " + str(epoch) + \
        ", Accuracy_0: " + str(cm[0][0]) + \
        ", Accuracy_1: " + str(cm[1][1]) + \
        ", F1-Score: " + str(f1) + \
        ", Accuracy: " + str(acc) + \
        ", Loss: " + str(loss.cpu().item()))


VALID : Epoch: 4, Accuracy_0: 0.9656686241324868, Accuracy_1: 0.7499438328465513, F1-Score: 0.48257915281191266, Accuracy: 0.9407008086253369, Loss: 0.11536696553230286


In [20]:
torch.save(net2, "/content/classifier_BERT.pt")

In [21]:
from torch.optim import Adam

net_model = Network2()
criterion = nn.BCELoss()
optm = Adam(net_model.parameters(), lr = 5e-4)

net_model = torch.load("/content/classifier_BERT.pt")

In [22]:
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

net_model = net_model.eval()
        
y_test = y_test.float()
y_test = y_test.reshape((y_test.shape[0], 1))

loss, output = eval(net_model, x_test, y_test, criterion)

y_true = y_test.cpu()
y_pred = net_model.predict_label(output.cpu())

acc_total = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred, normalize='pred')

print( " Accuracy_0: " + str(cm[0][0]) + \
        ", Accuracy_1: " + str(cm[1][1]) + \
        ", F1-Score: " + str(f1) + \
        ", Accuracy: " + str(acc_total) + \
        ", Loss: " + str(loss.cpu().item()))

 Accuracy_0: 0.9656686241324868, Accuracy_1: 0.7499438328465513, F1-Score: 0.48257915281191266, Accuracy: 0.9603498606872026, Loss: 0.11536696553230286


In [23]:
cur_sentence = "[CLS] what is throat cancer ? [SEP] is it treatable ?"
em = getEmbeddings(cur_sentence)
em = em[0]
for e in em:
    net_model = net_model.eval()        
    loss, output = eval(net_model, e[1])
    y_pred = net_model.predict_label(output.cpu())
    print(e[0],y_pred)

[CLS] tensor([0.])
what tensor([0.])
is tensor([0.])
throat tensor([0.])
cancer tensor([1.])
? tensor([0.])
[SEP] tensor([0.])
is tensor([0.])
it tensor([0.])
treatable tensor([0.])
? tensor([0.])


In [24]:
def getRelevantWords(full=None, history = None, cur=None): # either give full sentence, or give cur question + history
    if full is None:
        full = "[CLS] " + history + " [SEP] " + cur
    else:
        full = "[CLS] " + full

    all_words = full.split(' ')
    em = getEmbeddings(full)
    em = em[0]
    em = em[1:]
    
    rel_words = []
    rel_phrase = ""
    for e in em:     
      loss, output = eval(net_model, e[1])
      y_pred = net_model.predict_label(output.cpu())      
      if(y_pred == 1) :
        rel_words.append(e[0])
        rel_phrase += " "+e[0]
      if(e[0] == "[SEP]") :
        break  

    return rel_words,rel_phrase
   

In [25]:
cur_sentence = "what is throat cancer ? [SEP] is it treatable ?"
temp,temp_phrase = getRelevantWords(cur_sentence)
print(temp,temp_phrase)

['cancer']  cancer


In [26]:
with open("/content/sentences.txt") as f:
    test_sents = f.readlines()
test_sents = [x.strip() for  x in test_sents]

print(test_sents[0])

where is malayali located ? 30,803,747 speakers of malayalam in kerala , making up 93.2 % of the total number of malayalam speakers in india . [SEP] what other languages are spoken there ?


In [27]:
all_sentences = test_sents[0:10]
for s in all_sentences:
    rel_words, rel_phrase = getRelevantWords(s)
    print(s)
    print(rel_words)

where is malayali located ? 30,803,747 speakers of malayalam in kerala , making up 93.2 % of the total number of malayalam speakers in india . [SEP] what other languages are spoken there ?
['malayalam', 'malayalam']
where is malayali located ? 30,803,747 speakers of malayalam in kerala , making up 93.2 % of the total number of malayalam speakers in india . what other languages are spoken there ? 33,015,420 spoke the standard dialects , 19,643 spoke the yerava dialect and 31,329 spoke non - standard regional variations like eranadan . [SEP] what else is this place known for ?
['malayali', 'malayalam', 'malayalam']
where is malayali located ? 30,803,747 speakers of malayalam in kerala , making up 93.2 % of the total number of malayalam speakers in india . what other languages are spoken there ? 33,015,420 spoke the standard dialects , 19,643 spoke the yerava dialect and 31,329 spoke non - standard regional variations like eranadan . what else is this place known for ? world malayalee cou

In [30]:
import json
jsonFile = open('/content/evaluation_topics_v1.0.json', 'r')
values = json.load(jsonFile)
sentences = []
for v in values:
    turns = v['turn']
    cur = []
    for t in turns:
        s = t['raw_utterance']
        cur.append(s)
    sentences.append(cur)
jsonFile.close()
print(sentences)

[['What is throat cancer?', 'Is it treatable?', 'Tell me about lung cancer.', 'What are its symptoms? ', 'Can it spread to the throat?', 'What causes throat cancer?', 'What is the first sign of it?', 'Is it the same as esophageal cancer?', "What's the difference in their symptoms?"], ['What are the different types of sharks?', 'Are sharks endangered?  If so, which species?', 'Tell me more about tiger sharks.', 'What is the largest ever to have lived on Earth?', "What's the biggest ever caught?", 'What about for great whites?', 'Tell me about makos.', 'What are their adaptations?', 'Where do they live?', 'What do they eat?', 'How do they compare with tigers for being dangerous?'], ['Tell me about the Neverending Story film.', 'What is it about?', 'How was it received?', 'Did it win any awards?', 'Was it a book first?', 'Who was the author and when what it published?', 'What are the main themes?', 'Who are the main characters?', 'What are the differences between the book and movies?', 'D

In [31]:
updated_questions = []

for question_set in sentences :
  history_questions = ""
  cur_set = []
  for question in question_set :
    rel_list,rel_phrase = getRelevantWords(None,history_questions,question)
    history_questions += question + " "
    print(question)
    print(question + rel_phrase)
    cur_set.append(question + rel_phrase)
  updated_questions.append(cur_set)

  print("------------------------------------")




What is throat cancer?
What is throat cancer?
Is it treatable?
Is it treatable? cancer?
Tell me about lung cancer.
Tell me about lung cancer. cancer?
What are its symptoms? 
What are its symptoms?  cancer? cancer.
Can it spread to the throat?
Can it spread to the throat? cancer? cancer.
What causes throat cancer?
What causes throat cancer? cancer? cancer.
What is the first sign of it?
What is the first sign of it? cancer? cancer.
Is it the same as esophageal cancer?
Is it the same as esophageal cancer? cancer? cancer.
What's the difference in their symptoms?
What's the difference in their symptoms? cancer? cancer.
------------------------------------
What are the different types of sharks?
What are the different types of sharks?
Are sharks endangered?  If so, which species?
Are sharks endangered?  If so, which species?
Tell me more about tiger sharks.
Tell me more about tiger sharks.
What is the largest ever to have lived on Earth?
What is the largest ever to have lived on Earth? tiger

In [32]:
print(updated_questions)

[['What is throat cancer?', 'Is it treatable? cancer?', 'Tell me about lung cancer. cancer?', 'What are its symptoms?  cancer? cancer.', 'Can it spread to the throat? cancer? cancer.', 'What causes throat cancer? cancer? cancer.', 'What is the first sign of it? cancer? cancer.', 'Is it the same as esophageal cancer? cancer? cancer.', "What's the difference in their symptoms? cancer? cancer."], ['What are the different types of sharks?', 'Are sharks endangered?  If so, which species?', 'Tell me more about tiger sharks.', 'What is the largest ever to have lived on Earth? tiger', "What's the biggest ever caught?", 'What about for great whites?', 'Tell me about makos.', 'What are their adaptations?', 'Where do they live?', 'What do they eat?', 'How do they compare with tigers for being dangerous?'], ['Tell me about the Neverending Story film.', 'What is it about?', 'How was it received?', 'Did it win any awards?', 'Was it a book first?', 'Who was the author and when what it published?', 'W

In [33]:
n = 31
i = 1
with open('/content/question.txt', 'a') as f:
    for cur in sentences:
        i = 1
        for l in cur:
            line = '{"number" : "' + str(n) + '_' + str(i) + '", "text" : "#combine(' + l + ')"},\n'
            f.write(line)
            i = i + 1
        n = n + 1