<a href="https://colab.research.google.com/github/jjcremer/NLP_Masters/blob/main/nlp243_assignment2_1542248.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import cross_entropy
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pad_sequence
from torch.utils.data import DataLoader, Dataset
import torchtext.legacy
#from torchtext.data import Field, TabularDataset, BucketIterator

import pandas as pd
import regex as re
from sklearn.preprocessing import LabelBinarizer
from collections import defaultdict
#!pip install fasttext
#import fasttext
#import fasttext.util
#fasttext.util.download_model('en', if_exists='ignore')
#embedding_model = fasttext.load_model('cc.en.300.bin')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
!pip install poutyne
from poutyne import set_seeds
from poutyne.framework import Experiment



In [None]:
class JUtils():
  """Stateless helper functions shared by the notebook cells."""

  @staticmethod
  def get_dist_df(list):
    """Build a frequency table for the items of an iterable.

    Args:
      list: iterable of hashable items (e.g. a pandas Series of labels).
        The parameter name shadows the builtin but is kept so existing
        keyword callers continue to work.

    Returns:
      A DataFrame indexed by item (index name 'Class') with a single
      'Frequency' column, sorted by descending frequency. Empty input
      yields an empty frame that still has the 'Frequency' column.
    """
    # Local import keeps the helper self-contained. nltk.FreqDist is a
    # Counter subclass, so counts and first-seen ordering are unchanged.
    from collections import Counter

    counts = dict(Counter(list))
    # Passing `columns=` up front (instead of assigning it afterwards) keeps
    # the empty-input case from failing with a length mismatch.
    fdist = pd.DataFrame.from_dict(counts, orient='index', columns=['Frequency'])
    fdist.index.name = 'Class'
    return fdist.sort_values(by=['Frequency'], ascending=False)

In [None]:
class Preprocessor():
  """Loads the assignment CSV files and reshapes their label column.

  The instance remembers which column holds the labels (`class_column`, set
  by `get_data`) and the mappings built while transforming that column
  (`class_merge`, `class_fdist`, `relation_to_class`).
  """

  def __init__(self):
    # This constructor was previously spelled `__innit__`, so Python never
    # called it and `self.lb` was never created.
    self.lb = LabelBinarizer()

  def get_data(self, filename='train_data.csv', vectorize=True, class_column='IOB Slot tags'):
    """Read `filename` into a DataFrame and remember the label column.

    Args:
      filename: path of the CSV file to read.
      vectorize: when True, whitespace-split every value of `class_column`
        into a list of tags.
      class_column: name of the label column; stored on the instance for the
        other methods.

    Returns:
      The loaded DataFrame (also stored as `self.raw_train_data`).
    """
    self.class_column = class_column
    tdf = pd.read_csv(filename)
    if vectorize:
      # Column-wise transform instead of writing lists cell by cell.
      tdf[class_column] = tdf[class_column].apply(lambda tags: tags.split())
    self.raw_train_data = tdf
    return self.raw_train_data

  def split_data(self, df, split=0.1, random_state=None):
    """Randomly split `df` into disjoint train and dev frames.

    Args:
      df: DataFrame to split.
      split: fraction of rows that go to the dev set.
      random_state: optional seed forwarded to `DataFrame.sample` so the
        split can be reproduced.

    Returns:
      `(train_set, dev_set)`, each with a fresh 0..n-1 index.
    """
    dev_set = df.sample(frac=split, random_state=random_state)
    # Drop the sampled rows by their ORIGINAL index labels. Resetting the dev
    # index first (as before) made this drop the first k rows of `df` instead,
    # leaving the sampled rows in both sets.
    train_set = df.drop(dev_set.index).reset_index(drop=True)
    dev_set = dev_set.reset_index(drop=True)
    return train_set, dev_set

  def format_IOB_data(self, df):
    """Strip the `I_`/`B_` prefix from every tag in the label column.

    Tags that do not start with `I_` or `B_` are kept unchanged. The label
    column of `df` is overwritten in place; `self.class_merge` holds the
    tag -> merged-tag mapping and `self.class_fdist` the merged-tag counts.

    Returns:
      The same DataFrame with the rewritten label column.
    """
    self.class_fdist = dict()
    self.class_merge = dict()
    for tags in df[self.class_column]:
      for tag in tags:
        merged_class = re.match(r'(I|B)_(\w+)', tag)
        self.class_merge[tag] = merged_class.group(2) if merged_class is not None else tag
    df[self.class_column] = df[self.class_column].apply(self.__reduce_classes__)
    return df

  def __reduce_classes__(self, classes):
    """Map a list of tags through `self.class_merge` and count the results."""
    new_classes = []
    for original in classes:
      merged = self.class_merge[original]
      new_classes.append(merged)
      self.class_fdist[merged] = self.class_fdist.get(merged, 0) + 1
    return new_classes

  def __remove_classes_below_threshold__(self, df, threshold=5):
    """Replace labels whose frequency is not above `threshold` with 'other'.

    Reads the frequencies from `self.original_class_fdist` (one row per
    class, frequency in its first column) and overwrites the label column of
    `df` in place.
    """
    self.class_merge = dict()
    new_class = 'other'
    for original_class, row in self.original_class_fdist.iterrows():
      # `.iloc[0]` makes the positional read of the frequency explicit.
      if row.iloc[0] > threshold:
        self.class_merge[original_class] = original_class
      else:
        self.class_merge[original_class] = new_class
    df[self.class_column] = df[self.class_column].apply(lambda x: self.class_merge[x])
    return df

  def _tokenize_utterance(self, utterance):
    """Tokenize one utterance string with NLTK's `word_tokenize`."""
    return word_tokenize(utterance)

  def classify_relation(self, df):
    """Replace each label with an integer id assigned in first-seen order.

    The mapping is rebuilt from scratch on every call and stored as
    `self.relation_to_class`; the label column of `df` is overwritten in place.

    Returns:
      The same DataFrame (NOT the mapping) with integer labels.
    """
    self.relation_to_class = dict()
    index = 0
    for relation in df[self.class_column]:
      if relation not in self.relation_to_class:
        self.relation_to_class[relation] = index
        index += 1
    df[self.class_column] = df[self.class_column].apply(lambda x: self.relation_to_class[x])
    return df

  def tokenize_label(self, str):
    """Look up the integer id of a label in `self.relation_to_class`."""
    return self.relation_to_class[str]

  def format_relations_data(self, df):
    """Fold rare labels into 'other' and tokenize the 'utterances' column."""
    self.original_class_fdist = JUtils.get_dist_df(df[self.class_column])
    df = self.__remove_classes_below_threshold__(df)
    df['utterances'] = df['utterances'].apply(lambda x: self._tokenize_utterance(x))
    return df

In [None]:
class Vectorizer:
  """Callable that turns a token sequence into a list of embeddings."""

  def __init__(self, embedding_model):
    # Anything that supports `embedding_model[token]` lookups can be used.
    self.embedding_model = embedding_model

  def __call__(self, utterance):
    """Return `embedding_model[token]` for every token, in input order."""
    return [self.embedding_model[token] for token in utterance]

In [None]:
class TagDataset(Dataset):
  """Map-style dataset over a CSV with 'utterances' and 'Core Relations'.

  Args:
    path: CSV file to load through `Preprocessor.get_data`.
    relation_to_class: optional label -> integer id mapping. When omitted the
      ids are assigned from this file alone, in first-seen order, which means
      two datasets built from different files can disagree on the ids. Pass
      the `relation_to_class` attribute of one dataset to the others to keep
      them consistent. A label missing from the mapping raises KeyError.

  Attributes:
    data: the loaded DataFrame with integer labels.
    relation_to_class: the mapping that was applied.
  """

  def __init__(self, path, relation_to_class=None):
    pp = Preprocessor()
    data = pp.get_data(filename=path, vectorize=False, class_column='Core Relations')
    if relation_to_class is None:
      # Original behaviour: derive the ids from this file only.
      self.data = pp.classify_relation(data)
      self.relation_to_class = pp.relation_to_class
    else:
      mapping = dict(relation_to_class)
      data['Core Relations'] = data['Core Relations'].apply(lambda relation: mapping[relation])
      self.data = data
      self.relation_to_class = mapping

  def __len__(self):
    """Number of rows in the underlying DataFrame."""
    return len(self.data.index)

  def __getitem__(self, index):
    """Return the row at position `index` as a {'text', 'label'} dict."""
    data = self.data.iloc[index]
    return {'text': data['utterances'], 'label': data['Core Relations']}

#bla = TagDataset('train.csv')

In [None]:
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the merged-label CSV, turn the 'Core Relations' labels into integer
# ids, then write a random train/dev split to disk for the later cells.
pp = Preprocessor()
data = pp.get_data(filename='train_data_merged_labels.csv', vectorize=False, class_column='Core Relations')
data = pp.classify_relation(data)
train_data, dev_data = pp.split_data(data)
train_data.to_csv('train.csv', index=False)
dev_data.to_csv('valid.csv', index=False)

In [None]:
# Wrap the train.csv / valid.csv files in map-style datasets.
train = TagDataset('train.csv')
valid = TagDataset('valid.csv')

# Plain PyTorch loaders with a fixed batch size; only training is shuffled.
train_dataloader = DataLoader(train, batch_size=32, shuffle=True)
valid_dataloader = DataLoader(valid, batch_size=32, shuffle=False)

# NOTE(review): BucketIterator.splits receives the DataLoader objects here,
# not datasets — presumably it expects torchtext datasets; confirm this works.
# NOTE(review): sort_key indexes x['text'], i.e. it assumes dict-style items.
# The following cell rebinds train, valid, train_iter and valid_iter, so on a
# top-to-bottom run the values created here are overwritten.
train_iter, valid_iter = torchtext.legacy.data.BucketIterator.splits(
    (train_dataloader, valid_dataloader),
    batch_sizes=(32,32),
    sort_key=lambda x: len(x['text']),
    repeat=True,
    sort=False,
    shuffle=True,
    sort_within_batch=True,
    device=device
)

#text_field = torchtext.legacy.data.Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)
#text_field.build_vocab(train, min_freq=3)

In [None]:
# Tokenizer helpers. NOTE(review): text_tokenizer is not passed to any field
# below (text_field uses tokenize='spacy') — confirm whether it is still needed.
text_tokenizer = word_tokenize
label_tokenizer = lambda x: float(x)

# Labels are already numeric in the CSV, so no vocabulary is built for them.
label_field = torchtext.legacy.data.Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.long,tokenize=label_tokenizer)
# include_lengths=True makes each batch.text a (token_ids, lengths) pair.
text_field = torchtext.legacy.data.Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)

# NOTE(review): fields are matched to CSV columns by position — assumes the
# files contain exactly the text column followed by the label column; verify.
datafields = [('text', text_field),
              ('label', label_field)]

train, valid = torchtext.legacy.data.TabularDataset.splits(
    path='',
    train='train.csv',
    validation='valid.csv',
    format='CSV',
    fields=datafields,
    skip_header=True
)

# repeat=False: with repeat=True the iterators restart forever, so a plain
# `for batch in loader` loop (as used by the training function) never ends.
# The comma after sort_within_batch=True was missing, which made this cell a
# SyntaxError.
train_iter, valid_iter = torchtext.legacy.data.BucketIterator.splits(
    (train, valid),
    batch_sizes=(32,32),
    sort_key=lambda x: len(x.text),
    repeat=False,
    sort=False,
    shuffle=True,
    sort_within_batch=True,
    device=device
)

# Keep only tokens seen at least 3 times in the training split.
text_field.build_vocab(train, min_freq=3)

train_iter.create_batches()

In [None]:
class DatasetBucket:
  """Indexable view of a DataFrame that vectorizes rows on access.

  Each item is `(utterance_vec, core_relation_vec)`: the 'utterances' value
  passed through `embedding_vectorizer`, and the 'Core Relations' value looked
  up in `relation_to_class`.
  """

  def __init__(self, data, embedding_vectorizer, label_binarizer, relation_to_class):
    self.data, self.embedding_vectorizer = data, embedding_vectorizer
    self.label_binarizer, self.relation_to_class = label_binarizer, relation_to_class

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    row_index = self.data.index
    return len(row_index)

  def __getitem__(self, index):
    """Vectorize and return the row at position `index`."""
    return self._item_vectorizing(self.data.iloc[index])

  def _item_vectorizing(self, item):
    """Turn one row into its `(utterance_vec, core_relation_vec)` pair."""
    utterance_vec = self.embedding_vectorizer(item['utterances'])
    core_relation_vec = self._relation_to_values(item['Core Relations'])
    return utterance_vec, core_relation_vec

  def _relation_to_values(self, core_relation):
    """Look the relation up in the `relation_to_class` mapping."""
    mapping = self.relation_to_class
    return mapping[core_relation]

  def _relation_to_vec(self, core_relation):
    """Encode a single relation with the stored label binarizer."""
    as_batch = [core_relation]
    return self.label_binarizer.transform(as_batch)

In [None]:
# Presumably the directory for saved LSTM results; no visible cell reads this
# name — confirm whether it is still needed.
destination_folder = './lstm/results'

In [None]:
class LSTM(nn.Module):
  """Bidirectional single-layer LSTM classifier over token-id sequences.

  Args:
    dimension: hidden size of each LSTM direction.
    vocab_size: number of rows in the embedding table. When None (the
      default) it is taken from the notebook-level `text_field.vocab`.
    embedding_dim: size of each token embedding (previously fixed at 300).
    num_classes: number of output logits (previously fixed at 47).
  """

  def __init__(self, dimension=300, vocab_size=None, embedding_dim=300, num_classes=47):
    super(LSTM, self).__init__()

    if vocab_size is None:
      # Backward-compatible default: size the table from the global vocabulary.
      vocab_size = len(text_field.vocab)

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.dimension = dimension
    self.lstm = nn.LSTM(input_size=embedding_dim,
                        hidden_size=dimension,
                        num_layers=1,
                        batch_first=True,
                        bidirectional=True)

    self.drop = nn.Dropout(p=0.5)
    # Forward and backward summaries are concatenated, hence 2 * dimension.
    self.fc = nn.Linear(2*dimension, num_classes)

  def forward(self, text, text_len):
    """Compute class logits for a padded batch.

    Args:
      text: (batch, seq) tensor of token ids.
      text_len: per-example sequence lengths (tensor or sequence of ints).

    Returns:
      (batch, num_classes) tensor of unnormalised logits.
    """
    text_emb = self.embedding(text)
    text_len = torch.as_tensor(text_len)

    # pack_padded_sequence requires the lengths on the CPU, even when the
    # batch itself lives on the GPU.
    packed_input = pack_padded_sequence(text_emb, text_len.cpu(), batch_first=True, enforce_sorted=False)
    packed_output, _ = self.lstm(packed_input)
    output, _ = pad_packed_sequence(packed_output, batch_first=True)

    # Forward direction: state at each sequence's last valid step.
    # Backward direction: state at step 0, where it has seen the whole sequence.
    batch_index = torch.arange(output.size(0), device=output.device)
    last_step = (text_len - 1).to(output.device)
    out_forward = output[batch_index, last_step, :self.dimension]
    out_reverse = output[:, 0, self.dimension:]
    out_reduced = torch.cat((out_forward, out_reverse), 1)
    text_fea = self.drop(out_reduced)

    text_fea = self.fc(text_fea)

    # For the 2-D logits produced here -1 and 1 name the same axis, so this
    # transpose leaves the tensor unchanged; kept for interface stability.
    return text_fea.transpose(-1, 1)

In [None]:
def save_checkpoint(save_path, model, optimizer, valid_loss):
    """Write model and optimizer state plus the validation loss to `save_path`.

    Args:
        save_path: destination passed to `torch.save`; None disables saving.
        model: object exposing `state_dict()`.
        optimizer: object exposing `state_dict()`.
        valid_loss: value stored under the 'valid_loss' key.

    Returns:
        None.
    """
    # Identity test: `== None` would go through a custom __eq__ if present.
    if save_path is None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(load_path, model, optimizer, map_location=None):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        load_path: file written by `save_checkpoint`; None makes this a no-op.
        model: object whose `load_state_dict` receives 'model_state_dict'.
        optimizer: object whose `load_state_dict` receives
            'optimizer_state_dict'.
        map_location: forwarded to `torch.load`. When None (the default) the
            notebook-level `device` is used, as before.

    Returns:
        The stored 'valid_loss', or None when `load_path` is None.
    """
    if load_path is None:
        return

    if map_location is None:
        # Backward-compatible default: rely on the global `device`.
        map_location = device

    state_dict = torch.load(load_path, map_location=map_location)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])

    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Write the recorded loss curves to `save_path`.

    Args:
        save_path: destination passed to `torch.save`; None disables saving.
        train_loss_list: stored under 'train_loss_list'.
        valid_loss_list: stored under 'valid_loss_list'.
        global_steps_list: stored under 'global_steps_list'.

    Returns:
        None.
    """
    if save_path is None:
        return

    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}

    torch.save(state_dict, save_path)
    # The message used to say "Model saved", which was indistinguishable from
    # the checkpoint message even though this file only holds metrics.
    print(f'Metrics saved to ==> {save_path}')


def load_metrics(load_path, map_location=None):
    """Read the loss curves written by `save_metrics`.

    Args:
        load_path: file to read; None makes this a no-op.
        map_location: forwarded to `torch.load`. When None (the default) the
            notebook-level `device` is used, as before.

    Returns:
        `(train_loss_list, valid_loss_list, global_steps_list)`, or None when
        `load_path` is None.
    """
    if load_path is None:
        return

    if map_location is None:
        # Backward-compatible default: rely on the global `device`.
        map_location = device

    state_dict = torch.load(load_path, map_location=map_location)
    # Previously printed "Model loaded", although only metrics are read here.
    print(f'Metrics loaded from <== {load_path}')

    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [None]:
def train(model,
          optimizer,
          criterion=cross_entropy,
          train_loader=None,
          valid_loader=None,
          num_epochs=5,
          eval_every=None,
          file_path='./',
          best_valid_loss=float('Inf')):
  """Train `model`, evaluating and checkpointing at a fixed step interval.

  Args:
    model: module called as `model(text, lengths)`.
    optimizer: optimizer over the model's parameters.
    criterion: loss called as `criterion(output, label)`.
    train_loader: iterable of batches exposing `.text` (a `(ids, lengths)`
      pair) and `.label`. Defaults to the notebook-level `train_iter`, looked
      up when the function is called rather than when it is defined.
    valid_loader: same batch interface; defaults to `valid_iter`.
    num_epochs: number of passes over `train_loader`.
    eval_every: evaluate every this many training steps; defaults to
      `len(train_loader)` (once per epoch).
    file_path: directory in which 'model.pt' and 'metrics.pt' are written.
    best_valid_loss: a checkpoint is saved only when the average validation
      loss drops below this value.

  Returns:
    None.
  """
  import os

  # Resolve defaults at call time so that re-creating the iterators in an
  # earlier cell does not leave this function bound to stale objects.
  if train_loader is None:
    train_loader = train_iter
  if valid_loader is None:
    valid_loader = valid_iter
  if eval_every is None:
    eval_every = len(train_loader)
  eval_every = max(eval_every, 1)  # avoid modulo / division by zero

  running_loss = 0.0
  valid_running_loss = 0.0
  global_step = 0
  train_loss_list = []
  valid_loss_list = []
  global_steps_list = []

  # torchtext iterators expose create_batches(); other loaders may not.
  if hasattr(train_loader, 'create_batches'):
    train_loader.create_batches()

  model.train()
  for epoch in range(num_epochs):
    for batch in train_loader:
      text = batch.text[0]
      label = batch.label

      output = model(text, batch.text[1])

      loss = criterion(output, label)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      running_loss += loss.item()
      global_step += 1

      if global_step % eval_every == 0:
        model.eval()
        with torch.no_grad():
          # Separate names so the training batch/loss are not overwritten.
          for valid_batch in valid_loader:
            valid_output = model(valid_batch.text[0], valid_batch.text[1])
            valid_loss = criterion(valid_output, valid_batch.label)
            valid_running_loss += valid_loss.item()

        average_train_loss = running_loss / eval_every
        average_valid_loss = valid_running_loss / max(len(valid_loader), 1)
        train_loss_list.append(average_train_loss)
        valid_loss_list.append(average_valid_loss)
        global_steps_list.append(global_step)

        running_loss = 0.0
        valid_running_loss = 0.0
        model.train()

        print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))

        if best_valid_loss > average_valid_loss:
          best_valid_loss = average_valid_loss
          # file_path used to be ignored; with the default './' these resolve
          # to the same files as before.
          save_checkpoint(os.path.join(file_path, 'model.pt'), model, optimizer, best_valid_loss)
          save_metrics(os.path.join(file_path, 'metrics.pt'), train_loss_list, valid_loss_list, global_steps_list)
  # Printed once after the final epoch (it used to sit inside the epoch loop).
  print('Finished Training!')

# Build the classifier with its default sizes and move it to the
# notebook-level `device`.
model = LSTM().to(device)
# Adam with a fixed learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train for 10 epochs; loaders, loss and evaluation interval use train()'s defaults.
train(model=model, optimizer=optimizer, num_epochs=10)

In [None]:
def pad_collate_fn(batch):
  """Collate `(seq_vectors, labels)` pairs into padded tensors.

  Args:
    batch: list of `(seq_vectors, labels)` pairs, where `seq_vectors` is a
      sequence of per-token vectors and `labels` is either a single integer
      class id or a sequence of integers.

  Returns:
    `((padded_utterance_vectors, lengths), label_vectors)` with the examples
    ordered from longest to shortest sequence. Sequences are zero-padded to
    the longest one in the batch.
  """
  # Longest first, the order packed-sequence utilities expect by default.
  ordered = sorted(batch, key=lambda pair: len(pair[0]), reverse=True)

  utterance_vectors = [torch.FloatTensor(seq_vectors) for seq_vectors, _ in ordered]
  # torch.LongTensor(n) with an int n allocates an uninitialised tensor of
  # SIZE n instead of holding the value n; as_tensor handles both a scalar
  # class id and a sequence of ids correctly.
  label_vectors = [torch.as_tensor(labels, dtype=torch.long) for _, labels in ordered]
  lengths = torch.LongTensor([len(seq_vectors) for seq_vectors, _ in ordered])

  padded_utterance_vectors = pad_sequence(utterance_vectors, batch_first=True, padding_value=0)

  label_vectors = torch.stack(label_vectors, dim=0)
  return (padded_utterance_vectors, lengths), label_vectors

In [None]:
class RecurrentNet(nn.Module):
  """Wraps a recurrent network and a linear head into one classifier.

  Args:
    lstm_network: batch-first recurrent module that accepts a packed sequence
      and returns `(packed_output, hidden_states)`.
    fully_connected_network: module applied to the last valid output step.

  Attributes:
    hidden_states: hidden state returned by the most recent forward pass.
  """

  def __init__(self, lstm_network, fully_connected_network):
    super().__init__()

    self.hidden_states = None
    self.lstm_network = lstm_network
    self.fully_connected_network = fully_connected_network

  def forward(self, padded_utterance_vectors, lengths):
    """Score a padded batch.

    Args:
      padded_utterance_vectors: (batch, seq, features) padded inputs.
      lengths: true length of every sequence (tensor or sequence of ints).

    Returns:
      (batch, outputs) tensor produced by `fully_connected_network`.
    """
    total_length = padded_utterance_vectors.shape[1]
    lengths = torch.as_tensor(lengths)

    # Lengths must be on the CPU for packing; enforce_sorted=False removes the
    # requirement that the batch arrives sorted by length.
    pack_padded_utterance_vectors = pack_padded_sequence(
        padded_utterance_vectors, lengths.cpu(), batch_first=True, enforce_sorted=False)
    lstm_out, self.hidden_states = self.lstm_network(pack_padded_utterance_vectors)

    lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True, total_length=total_length)

    # Read each sequence at ITS last valid step. Indexing every row at
    # total_length - 1 returned the zero padding for all shorter sequences.
    batch_index = torch.arange(lstm_out.size(0), device=lstm_out.device)
    last_step = (lengths - 1).to(lstm_out.device)
    # The :300 cap on the feature axis is kept from the original code.
    out_forward = lstm_out[batch_index, last_step, :300]

    tag_space = self.fully_connected_network(out_forward)

    # For the 2-D tensor produced here -1 and 1 are the same axis, so this is
    # a no-op kept for interface stability.
    return tag_space.transpose(-1, 1)

#full_network = RecurrentNet(lstm_network, fully_connected_network)

In [None]:
# End-to-end experiment: preprocess the merged-label CSV, wrap it in
# DatasetBucket objects, build an LSTM + linear head and train it with Poutyne.
pp = Preprocessor()
data = pp.get_data(filename='train_data_merged_labels.csv', vectorize=False, class_column='Core Relations')
new_data = pp.format_relations_data(data)
# NOTE(review): classify_relation ends with `return df`, so `relation_to_class`
# is the DataFrame here, not the label -> id dict (that lives on
# `pp.relation_to_class`). DatasetBucket later indexes it like a mapping — confirm.
relation_to_class = pp.classify_relation(new_data)
train_data, dev_data = pp.split_data(new_data)

# NOTE(review): `embedding_model` is only assigned in a commented-out line of
# the imports cell; it has to be defined before this cell can run.
vectorizer = Vectorizer(embedding_model)
label_binarizer = LabelBinarizer()
label_binarizer.fit(train_data['Core Relations'])
train_dataset_vectorizer = DatasetBucket(train_data, vectorizer, label_binarizer=label_binarizer, relation_to_class=relation_to_class)
valid_dataset_vectorizer = DatasetBucket(dev_data, vectorizer, label_binarizer=label_binarizer, relation_to_class=relation_to_class)

# Fetch the first item once as a smoke test of the vectorizing pipeline.
utterance, label = train_dataset_vectorizer[0]

# Recurrent part: one unidirectional layer, input and hidden size both 300.
dimension = 300
num_layer = 1
bidirectional = False

lstm_network = nn.LSTM(input_size=dimension,
                       hidden_size=dimension,
                       num_layers=num_layer,
                       bidirectional=bidirectional,
                       batch_first=True)

input_dim = dimension
# NOTE(review): a single output unit is combined with cross_entropy below —
# presumably this should be the number of relation classes; confirm.
tag_dimension = 1

fully_connected_network = nn.Linear(input_dim, tag_dimension)


# Hard-coded to the first CUDA device.
device = torch.device("cuda:0")

batch_size = 128
lr = 0.1

epoch_number = 10

# NOTE(review): the seed is set after split_data and after the layers above
# were created, so neither the split nor the initial weights are covered by it.
set_seeds(42)

# pad_collate_fn sorts each batch by length and pads it.
train_loader = DataLoader(train_dataset_vectorizer, batch_size=batch_size, shuffle=True, collate_fn=pad_collate_fn, num_workers=4)
valid_loader = DataLoader(valid_dataset_vectorizer, batch_size=batch_size, collate_fn=pad_collate_fn, num_workers=4)

full_network = RecurrentNet(lstm_network, fully_connected_network)

optimizer = optim.SGD(full_network.parameters(), lr)

# Poutyne experiment rooted at "./", tracking batch accuracy.
exp = Experiment("./", full_network, device=device, optimizer=optimizer,
                 loss_function=cross_entropy, batch_metrics=["acc"])

exp.train(train_loader, valid_generator=valid_loader, epochs=epoch_number)
#JUtils.get_dist_df(new_data['Core Relations'])
#pp.original_class_fdist 

  cpuset_checked))


RuntimeError: ignored

In [None]:
# Display the raw training CSV; the output below shows the columns
# ID, utterances and IOB Slot tags.
pd.read_csv('train_data.csv')

Unnamed: 0,ID,utterances,IOB Slot tags
0,1,who plays luke on star wars new hope,O O B_char O B_movie I_movie I_movie I_movie
1,2,show credits for the godfather,O O O B_movie I_movie
2,3,who was the main actor in the exorcist,O O O O O O B_movie I_movie
3,4,find the female actress from the movie she 's ...,O O O O O O O B_movie I_movie I_movie I_movie
4,5,who played dory on finding nemo,O O B_char O B_movie I_movie
...,...,...,...
2307,2308,what was the revenue for toy story 3,O O O O O B_movie I_movie I_movie
2308,2309,dark knight revenue,B_movie I_movie O
2309,2310,how much did the dark night generate,O O O B_movie I_movie I_movie O
2310,2311,can i see the lion king 's revenue,O O O B_movie I_movie I_movie O O
