In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import torch
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple
import gensim.downloader
from torch.optim import Adam
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import pickle
from torch import nn

In [2]:
avilable_languages = ['eng','fin','jap']
current_language = 'eng'

size_train = 2000
size_valid = 1000

seed = 5555

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd
%cd /content/drive/MyDrive/Colab Notebooks/nlp2/nlptasks/notebooks/hroi-temp

/root
/content/drive/MyDrive/Colab Notebooks/nlp2/nlptasks/notebooks/hroi-temp


In [5]:
!pip install bpemb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bpemb
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 59.5 MB/s 
Installing collected packages: sentencepiece, bpemb
Successfully installed bpemb-0.3.4 sentencepiece-0.1.97


In [6]:
from bpemb import BPEmb

In [7]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [8]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently 
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

In [9]:
enforce_reproducibility()

In [10]:
device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

In [41]:
def accuracy(logits, labels):
  logits = np.asarray(logits).reshape(-1, len(logits[0]))
  labels = np.asarray(labels).reshape(-1)
  return np.sum(np.argmax(logits, axis=-1) == labels).astype(np.float32) / float(labels.shape[0])

In [151]:
def evaluate(model: nn.Module, test_dl: DataLoader):
  """
  Evaluates the model on the given dataset
  :param model: The model under evaluation
  :param valid_dl: A `DataLoader` reading validation data
  :return: The accuracy of the model on the dataset
  """
  # VERY IMPORTANT: Put your model in "eval" mode -- this disables things like 
  # layer normalization and dropout
  model.eval()
  labels_all = []
  logits_all = []

  # ALSO IMPORTANT: Don't accumulate gradients during this process
  with torch.no_grad():
    for batch in tqdm(test_dl, desc='Evaluation'):
      batch = tuple(t.to(device) for t in batch)
      input_ids = batch[0]
      seq_lens = batch[1]
      labels = batch[2]
      meta_inputs = batch[3]

      _, logits = model(input_ids, seq_lens, labels,meta_inputs)
      labels_all.extend(list(labels.detach().cpu().numpy()))
      logits_all.extend(list(logits.detach().cpu().numpy()))
    acc = accuracy(logits_all, labels_all)

    return acc,labels_all,logits_all

In [13]:
def train(
    model: nn.Module, 
    train_dl: DataLoader, 
    valid_dl: DataLoader, 
    optimizer: torch.optim.Optimizer, 
    n_epochs: int, 
    device: torch.device,
    patience: int = 10
):
  """
  The main training loop which will optimize a given model on a given dataset
  :param model: The model being optimized
  :param train_dl: The training dataset
  :param valid_dl: A validation dataset
  :param optimizer: The optimizer used to update the model parameters
  :param n_epochs: Number of epochs to train for
  :param device: The device to train on
  :return: (model, losses) The best model and the losses per iteration
  """

  # Keep track of the loss and best accuracy
  losses = []
  best_acc = 0.0
  pcounter = 0

  # Iterate through epochs
  for ep in range(n_epochs):

    loss_epoch = []

    #Iterate through each batch in the dataloader
    for batch in tqdm(train_dl):
      # VERY IMPORTANT: Make sure the model is in training mode, which turns on 
      # things like dropout and layer normalization
      model.train()

      # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
      # keeps track of these dynamically in its computation graph so you need to explicitly
      # zero them out
      optimizer.zero_grad()

      # Place each tensor on the GPU
      batch = tuple(t.to(device) for t in batch)
      input_ids = batch[0]
      seq_lens = batch[1]
      labels = batch[2]
      meta_inputs = batch[3]

      # Pass the inputs through the model, get the current loss and logits
      loss, logits = model(input_ids, seq_lens, labels, meta_inputs)
      losses.append(loss.item())
      loss_epoch.append(loss.item())
      
      # Calculate all of the gradients and weight updates for the model
      loss.backward()

      # Optional: clip gradients
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Finally, update the weights of the model
      optimizer.step()
      #gc.collect()

    # Perform inline evaluation at the end of the epoch
    acc,_,_ = evaluate(model, valid_dl)
    print(f'Validation accuracy: {acc}, train loss: {sum(loss_epoch) / len(loss_epoch)}')

    # Keep track of the best model based on the accuracy
    if acc > best_acc:
      torch.save(model.state_dict(), 'best_model')
      best_acc = acc
      pcounter = 0
    else:
      pcounter += 1
      if pcounter == patience:
        break
        #gc.collect()

  model.load_state_dict(torch.load('best_model'))
  return model, losses

In [14]:
# use eval to change string list to list get the length of each row
def create_batch(df, meta):
  combined = [eval(que) + eval(doc) for que, doc in zip(df["question_text_tokenized"],df["document_plaintext_tokenized"])]
  flat_combined = np.array([item for sublist in combined for item in sublist])

  dictionary = Counter(flat_combined) # create dict
  dictionary_sorted = sorted(dictionary, key=dictionary.get, reverse=True) # sorted dict

  text_id = [[dictionary_sorted.index(word) for word in text] for text in combined] # indexes of doc location

  text_lens = [len(i) for i in combined] # length of all doc lists
  max_length = max(text_lens) # max length of doc
  dictionary_sorted.append('[PAD]') # add padding token to dictionary
  pad_index = dictionary_sorted.index('[PAD]') # get pad index
  print('pad_index: ',pad_index)

  padded_text_id = [(i + [pad_index] * (max_length - len(i))) for i in text_id] # ad padding to all tex id

  return padded_text_id, text_lens, df.label.values, meta
  

In [15]:
# Define a default lstm_dim
lstm_dim = 100

# Define some hyperparameters
batch_size = 2
lr = 3e-6
n_epochs = 100

In [16]:
# import training data
dft = pd.read_csv('../../data/dft_'+current_language+'.csv')

# import validation data
dfv = pd.read_csv('../../data/dfv_'+current_language+'.csv')

In [17]:
arange_dft = np.arange(len(dft))
np.random.seed(seed)
np.random.shuffle(arange_dft)

dft = dft.iloc[arange_dft]

train_df = dft.iloc[:size_train]
valid_df = dft.iloc[size_train:size_train+size_valid]
test_df = dfv

train_meta = train_df[['word_frequency_score','logres_pred']].values
valid_meta = valid_df[['word_frequency_score','logres_pred']].values
test_meta = test_df[['word_frequency_score','logres_pred']].values

In [18]:
train_dataset = create_batch(train_df,train_meta) # 500 for training
valid_dataset = create_batch(valid_df,valid_meta) # 500 for validation
test_dataset = create_batch(test_df,test_meta) # rest of training for testing

pad_index:  31371
pad_index:  20202
pad_index:  19204


In [19]:
#collate_fn=collate_batch_bilstm
def collate_batch_bilstm_t(data):
    #return torch.tensor(train_dataset[0]), torch.tensor(train_dataset[1]), torch.tensor(train_dataset[2])
    return torch.tensor([f for f in train_dataset[0]]), torch.tensor([f for f in train_dataset[1]]), torch.tensor([f for f in train_dataset[2]]), torch.tensor([f for f in train_dataset[3]]).float()

def collate_batch_bilstm_v(data):
    #return torch.tensor(train_dataset[0]), torch.tensor(train_dataset[1]), torch.tensor(train_dataset[2])
    return torch.tensor([f for f in valid_dataset[0]]), torch.tensor([f for f in valid_dataset[1]]), torch.tensor([f for f in valid_dataset[2]]), torch.tensor([f for f in valid_dataset[3]]).float()

def collate_batch_bilstm_test(data):
    #return torch.tensor(train_dataset[0]), torch.tensor(train_dataset[1]), torch.tensor(train_dataset[2])
    return torch.tensor([f for f in test_dataset[0]]), torch.tensor([f for f in test_dataset[1]]), torch.tensor([f for f in test_dataset[2]]), torch.tensor([f for f in test_dataset[3]]).float()


In [20]:
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,collate_fn=collate_batch_bilstm_t, num_workers=2)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,collate_fn=collate_batch_bilstm_v, num_workers=2)
test_dl = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_batch_bilstm_test, num_workers=2)

In [21]:
bpemb_en = BPEmb(lang='en', dim=100, vs=65320)
# Extract the embeddings and add a randomly initialized embedding for our extra [PAD] token
pretrained_embeddings = np.concatenate([bpemb_en.emb.vectors, np.zeros(shape=(1,100))], axis=0)

BPEmb fallback: en from vocab size 65320 to 200000
downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs200000.model


100%|██████████| 3776868/3776868 [00:01<00:00, 2440614.41B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs200000.d100.w2v.bin.tar.gz


100%|██████████| 75946121/75946121 [00:08<00:00, 9104984.62B/s] 


In [26]:
# Define the model
class BiLSTMNetwork(nn.Module):
    """
    Basic BiLSTM network
    """
    def __init__(
            self,
            #meta_data,
            pretrained_embeddings: torch.tensor,
            lstm_dim: int,
            dropout_prob: float = 0.1,
            n_classes: int = 2
    ):
        """
        Initializer for basic BiLSTM network
        :param pretrained_embeddings: A tensor containing the pretrained BPE embeddings
        :param lstm_dim: The dimensionality of the BiLSTM network
        :param dropout_prob: Dropout probability
        :param n_classes: The number of output classes
        """

        # First thing is to call the superclass initializer
        super(BiLSTMNetwork, self).__init__()

        # We'll define the network in a ModuleDict, which makes organizing the model a bit nicer
        # The components are an embedding layer, a 2 layer BiLSTM, and a feed-forward output layer
        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=pretrained_embeddings.shape[0] - 1),
            'bilstm': nn.LSTM(
                pretrained_embeddings.shape[1],
                lstm_dim,
                1,
                batch_first=True,
                dropout=dropout_prob,
                bidirectional=True),
            'cls': nn.Linear(2*lstm_dim + 2, n_classes)
        })
        self.n_classes = n_classes
        self.dropout = nn.Dropout(p=dropout_prob)

        # Initialize the weights of the model
        self._init_weights()

        print(self.model['embeddings'])

    def _init_weights(self):
        all_params = list(self.model['bilstm'].named_parameters()) + \
                     list(self.model['cls'].named_parameters())
        for n,p in all_params:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

    def forward(self, inputs, input_lens, labels, meta_input):
        """
        Defines how tensors flow through the model
        :param inputs: (b x sl) The IDs into the vocabulary of the input samples
        :param input_lens: (b) The length of each input sequence
        :param labels: (b) The label of each sample
        :return: (loss, logits) if `labels` is not None, otherwise just (logits,)
        """

        # Get embeddings (b x sl x edim)
        print(self.model['embeddings'])
        embeds = self.model['embeddings'](inputs)

        # Pack padded: This is necessary for padded batches input to an RNN
        lstm_in = nn.utils.rnn.pack_padded_sequence(
            embeds,
            input_lens.cpu(),
            batch_first=True,
            enforce_sorted=False
        )

        # Pass the packed sequence through the BiLSTM
        lstm_out, hidden = self.model['bilstm'](lstm_in)

        # Unpack the packed sequence --> (b x sl x 2*lstm_dim)
        lstm_out,_ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)

        #cat = (torch.cat((self.meta_input, lstm_out)),)
        #print(cat.shape)
        #print(lstm_out.shape)
        #print(lstm_out)

        # Max pool along the last dimension
        ff_in = self.dropout(torch.max(lstm_out, 1)[0])
        # Some magic to get the last output of the BiLSTM for classification (b x 2*lstm_dim)
        #ff_in = lstm_out.gather(1, input_lens.view(-1,1,1).expand(lstm_out.size(0), 1, lstm_out.size(2)) - 1).squeeze()

        ff_in = torch.cat((ff_in, meta_input), 1)

        # Get logits (b x n_classes)
        logits = self.model['cls'](ff_in).view(-1, self.n_classes)
        outputs = (logits,)
        
        if labels is not None:
            # Xentropy loss
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            outputs = (loss,) + outputs

        return outputs


# Create the model
model = BiLSTMNetwork(
    pretrained_embeddings= torch.FloatTensor(pretrained_embeddings), 
    lstm_dim=lstm_dim, 
    dropout_prob=0.1, 
    n_classes=2
  ).to(device)

  "num_layers={}".format(dropout, num_layers))


Embedding(200001, 100, padding_idx=200000)


In [23]:
# Create the optimizer
optimizer = Adam(model.parameters(), lr=lr)

# Train
model, losses = train(model, train_dl, valid_dl, optimizer, n_epochs, device)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/2 [00:00<?, ?it/s]

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


Embedding(200001, 100, padding_idx=200000)


RuntimeError: ignored

In [None]:
import matplotlib.pyplot as plt

plt.plot(losses)

In [96]:
val_acc,_,_ = evaluate(model, valid_dl)
test_acc,labs,logs = evaluate(model, test_dl)
print(f"Valiation accuracy: {val_acc}")
print(f"Test accuracy: {test_acc}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


Evaluation:   0%|          | 0/2 [00:00<?, ?it/s]

  
  


Embedding(200001, 100, padding_idx=200000)
Embedding(200001, 100, padding_idx=200000)
[[-1.111364    0.72312546]
 [ 0.84544736 -0.94396573]
 [-1.0878974   0.9034119 ]
 ...
 [-0.3464332   0.10377127]
 [-0.29444358  0.2989619 ]
 [-0.0496568  -0.20536467]]
[0 0 1 ... 1 1 0]


Evaluation:   0%|          | 0/2 [00:00<?, ?it/s]

  if sys.path[0] == '':
  if sys.path[0] == '':


Embedding(200001, 100, padding_idx=200000)
Embedding(200001, 100, padding_idx=200000)
[[-0.05250189 -0.37451968]
 [-0.15203735  0.20665482]
 [-0.62011105  0.26049015]
 ...
 [-0.5426405   0.49618497]
 [ 0.26896137 -0.46655706]
 [ 0.41786507 -0.8063421 ]]
[1 1 1 ... 0 0 0]
Valiation accuracy: 0.664
Test accuracy: 0.6818181818181818


In [None]:
torch.save(model, '../../models/bi_lstm_2b_'+current_language+'.pth')

In [152]:
model = torch.load('../../models/bi_lstm_2b_'+current_language+'.pth')

In [181]:
  model.eval()
  labels_all = []
  logits_all = []

  # ALSO IMPORTANT: Don't accumulate gradients during this process
  with torch.no_grad():
    for batch in tqdm(test_dl, desc='Evaluation'):
      batch = tuple(t.to(device) for t in batch)
      input_ids = batch[0]
      seq_lens = batch[1]
      labels = batch[2]
      meta_inputs = batch[3]

      _, logits = model(input_ids, seq_lens, labels,meta_inputs)
      labels_all.extend(list(labels.detach().cpu().numpy()))
      logits_all.extend(list(logits.detach().cpu().numpy()))
      break
    acc = accuracy(logits_all, labels_all)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


Evaluation:   0%|          | 0/2 [00:00<?, ?it/s]

  if sys.path[0] == '':
  if sys.path[0] == '':


Embedding(200001, 100, padding_idx=200000)
[[-0.05250189 -0.37451968]
 [-0.15203735  0.20665482]
 [-0.62011105  0.26049015]
 ...
 [-0.5426405   0.49618497]
 [ 0.26896137 -0.46655706]
 [ 0.41786507 -0.8063421 ]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [184]:
pred_labels = (np.argmax(logits_all, axis=-1) == labels_all).astype(int)

In [185]:
pred_labels

array([0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,