<a href="https://colab.research.google.com/github/kiarashk76/DL4NLP-1st-Assignment/blob/master/Char_POS_no_att.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install flair

# Custom Preprocessing

In [0]:
import flair.datasets
from flair.data import Sentence

# WikiNER (English) is loaded twice so that two independent corpus objects
# exist: one that is passed through downsample(0.01) for quick experiments
# and one that is left untouched for whole-corpus statistics.
downsampled_corpus = flair.datasets.WIKINER_ENGLISH()
downsampled_corpus = downsampled_corpus.downsample(0.01)
corpus = flair.datasets.WIKINER_ENGLISH()

# Sanity check: size of the training split after downsampling.
print(len(downsampled_corpus.train))


In [0]:
# Number of training tokens (full corpus) whose text contains an apostrophe.
c = sum(
  1
  for sentence in corpus.train
  for token in sentence
  if "'" in token.text
)
print(c)


In [0]:
def apostrophe_handler(sentence):
  '''Expand the contractions 'm, 's and 're into " am", " is" and " are".

  Args:
    sentence: either a plain ``str`` or a ``flair.data.Sentence``. A Sentence
      is first flattened to text by writing every token followed by a single
      space (so the flattened text ends with a trailing space).

  Returns:
    str: the text with the three contractions expanded.

  Raises:
    TypeError: if ``sentence`` is neither a str nor a flair Sentence.

  Note:
    The replacement is purely textual: every "'s" becomes " is", including
    possessives such as "John's".
  '''
  if isinstance(sentence, str):
    text = sentence
  elif isinstance(sentence, flair.data.Sentence):
    text = "".join(token.text + " " for token in sentence)
  else:
    raise TypeError(
      "apostrophe_handler expects a str or a flair.data.Sentence, got "
      + type(sentence).__name__
    )

  # Same replacements, in the same order, for both input kinds.
  for contraction, expansion in (("'m", " am"), ("'s", " is"), ("'re", " are")):
    text = text.replace(contraction, expansion)
  return text


  
# Run the handler once on a flair Sentence and once on the raw string.
sample_text = "I'm kiarash, he's ivan, they're studying"
for sample in (Sentence(sample_text), sample_text):
  print(apostrophe_handler(sample))

In [0]:
import random
def char_to_idx(dataset):
  '''Build the character -> index vocabulary of a dataset.

  Args:
    dataset: iterable of sentences; each sentence is an iterable of tokens
      exposing a ``text`` attribute.

  Returns:
    dict: maps every character seen in any token text to a unique integer.
    The space character is always present with index 0; other characters
    get 1, 2, ... in order of first appearance.
  '''
  mapping = {" ": 0}
  for sentence in dataset:
    for token in sentence:
      for char in token.text:
        # Indices are handed out consecutively, so the current size of the
        # mapping is always the next free index.
        mapping.setdefault(char, len(mapping))
  return mapping

def sentence_to_idx(sentence, char_to_id_dic, handling_apostrophe = False):
  '''Encode a sentence as a list of character indices.

  Args:
    sentence: a plain ``str`` or a ``flair.data.Sentence``.
    char_to_id_dic: dict mapping each character to its integer index. This
      mapping (not a module-level global) is used for every lookup.
    handling_apostrophe: if True, the sentence is first passed through
      ``apostrophe_handler``, which always yields a str.

  Returns:
    list[int]: for a str, one index per character. For a Sentence, the
    indices of each token's characters followed by the index of " " after
    every token (including the last one).

  Raises:
    KeyError: if a character is missing from ``char_to_id_dic``.
    TypeError: if ``sentence`` is neither a str nor a flair Sentence.
  '''
  if handling_apostrophe:
    sentence = apostrophe_handler(sentence)

  if isinstance(sentence, str):
    return [char_to_id_dic[char] for char in sentence]

  if isinstance(sentence, flair.data.Sentence):
    result = []
    for token in sentence:
      result.extend(char_to_id_dic[char] for char in token.text)
      result.append(char_to_id_dic[" "]) # adding a " " token between each token in the sequence
    return result

  raise TypeError(
    "sentence_to_idx expects a str or a flair.data.Sentence, got "
    + type(sentence).__name__
  )

def dataset_to_idx(dataset, handling_apostrophe = False):
  '''Return a list of lists of indices, one inner list per sentence.

  The character vocabulary is rebuilt from ``dataset`` via ``char_to_idx``
  on every call and then used to encode each sentence with
  ``sentence_to_idx``.

  Args:
    dataset: iterable of sentences accepted by ``char_to_idx`` and
      ``sentence_to_idx``; it is iterated twice.
    handling_apostrophe: forwarded to ``sentence_to_idx``.

  Returns:
    list[list[int]]: the encoded sentences, in dataset order.
  '''
  char_to_id_dict = char_to_idx(dataset)
  return [
    sentence_to_idx(sentence, char_to_id_dict, handling_apostrophe)
    for sentence in dataset
  ]
  
def dataset_to_idx_generator(dataset, handling_apostrophe = False):
  '''Data generator for PyTorch input: yields one encoded sentence at a time.

  The character vocabulary is built from ``dataset`` via ``char_to_idx`` when
  the generator is first advanced, then each sentence is encoded lazily with
  ``sentence_to_idx``.

  Args:
    dataset: iterable of sentences accepted by ``char_to_idx`` and
      ``sentence_to_idx``; it is iterated twice.
    handling_apostrophe: forwarded to ``sentence_to_idx``.

  Yields:
    list[int]: the character indices of the next sentence.
  '''
  char_to_id_dict = char_to_idx(dataset)
  for sentence in dataset:
    yield sentence_to_idx(sentence, char_to_id_dict, handling_apostrophe)

# Character vocabulary of the downsampled training split; used by the demos below.
char_to_id_dict = char_to_idx(downsampled_corpus.train)

# a string object sentence to a list of indices test
print(sentence_to_idx("Hello World!", char_to_id_dict))
print("***")

# a random Sentence object sentence to a list of indices test
# randrange excludes its upper bound; randint(0, len(...)) could return
# len(...) itself and make the lookup below raise IndexError.
random_index = random.randrange(len(downsampled_corpus.train))
random_sentence = downsampled_corpus.train[random_index]
print(random_sentence)
print(sentence_to_idx(random_sentence, char_to_id_dict))
print("***")

# whole dataset to a list of indices test
# Only a short preview is printed to keep the cell output readable.
print(dataset_to_idx(downsampled_corpus.train)[:3])
print("***")

# whole dataset to a list generator test
data_generator = dataset_to_idx_generator(downsampled_corpus.train)
print(next(data_generator))
print("***")

## Imports

In [0]:
import numpy as np
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import flair.datasets


## Preprocessing

In [0]:
# Word and character vocabularies of the toy tagger, filled in further down.
# NOTE: binding `char_to_idx` to a dict here replaces the function of the
# same name defined earlier in this notebook; cells that call that function
# will fail if they are run after this one.
word_to_idx, char_to_idx = {}, {}

def prepare_sequence(seq, to_idx, char_to_idx_map=None):
    '''Encode a sequence of words as (word index, character indices) pairs.

    Args:
        seq: iterable of words (strings).
        to_idx: dict mapping each word to its integer index.
        char_to_idx_map: optional dict mapping each character to its integer
            index. When omitted, the module-level ``char_to_idx`` dict is
            used, which is the original behaviour.

    Returns:
        list[tuple[int, list[int]]]: one ``(word_index, [char_index, ...])``
        pair per word, in input order.

    Raises:
        KeyError: if a word or character is missing from its vocabulary.
    '''
    if char_to_idx_map is None:
        char_to_idx_map = char_to_idx
    return [(to_idx[w], [char_to_idx_map[c] for c in w]) for w in seq]

def prepare_target(seq, to_idx):
    '''Encode a sequence of tags as a 1-D tensor of integer class indices.

    Args:
        seq: iterable of tags.
        to_idx: dict mapping each tag to its integer index.

    Returns:
        torch.Tensor: int64 tensor with one index per tag, in input order.

    Raises:
        KeyError: if a tag is missing from ``to_idx``.
    '''
    # torch.tensor replaces the deprecated autograd.Variable(LongTensor(...))
    # wrapper; the explicit dtype keeps the result int64 even for empty input.
    return torch.tensor([to_idx[w] for w in seq], dtype=torch.long)

# Toy corpus: each entry is a (list of words, list of tags) pair.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Give every new word / character the next free index, in order of first
# appearance (the current dict size is always the next free index).
for sent, tags in training_data:
    for word in sent:
        word_to_idx.setdefault(word, len(word_to_idx))
        for char in word:
            char_to_idx.setdefault(char, len(char_to_idx))


# Tag vocabulary and its inverse, used to decode predictions.
tag_to_idx = {"DET": 0, "NN": 1, "V": 2}
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

## Model

In [0]:
# Model hyperparameters.
# Size of each character embedding (also passed as the char-level LSTM size).
CAR_EMBEDDING_DIM = 3
# Size of each word embedding and of the word-level LSTM hidden state.
WORD_EMBEDDING_DIM, HIDDEN_DIM = 6, 6

class LSTMTagger(nn.Module):
    '''Sequence tagger combining word embeddings with a character-level LSTM.

    For every word, a character LSTM reads the word's character embeddings
    and its last output is concatenated with the word embedding. The
    concatenated vectors are fed to a word-level LSTM whose outputs are
    projected to tag scores.

    All tensor sizes come from the constructor arguments, so the model does
    not depend on any module-level constants.

    Attributes set here and read by callers:
        hidden: (h, c) state of the word-level LSTM. It is NOT reset inside
            ``forward``; callers are expected to assign
            ``model.hidden = model.init_hidden(hidden_dim)`` before each
            sentence, otherwise the state of the previous call is reused.
        hidden_char: (h, c) state of the char-level LSTM; reset for every
            word inside ``forward``.
    '''

    def __init__(self, word_embedding_dim, char_embedding_dim, hidden_dim, vocab_size, alphabet_size, tagset_size):
        '''Create the embeddings, both LSTMs and the output projection.

        Args:
            word_embedding_dim: size of each word embedding.
            char_embedding_dim: size of each character embedding; also the
                hidden size of the character LSTM.
            hidden_dim: hidden size of the word-level LSTM.
            vocab_size: number of distinct word indices.
            alphabet_size: number of distinct character indices.
            tagset_size: number of output tags.
        '''
        super(LSTMTagger, self).__init__()

        self.hidden_dim = hidden_dim
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim

        self.char_embeddings = nn.Embedding(alphabet_size, char_embedding_dim)
        self.lstm_char = nn.LSTM(char_embedding_dim, char_embedding_dim)

        self.word_embeddings = nn.Embedding(vocab_size, word_embedding_dim)
        self.lstm_word = nn.LSTM(word_embedding_dim + char_embedding_dim, hidden_dim)

        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        self.hidden = self.init_hidden(hidden_dim)
        self.hidden_char = self.init_hidden(char_embedding_dim)

    def init_hidden(self, dim):
        '''Return a zero-filled (h, c) pair, each of shape (1, 1, dim).'''
        return (torch.zeros(1, 1, dim),
                torch.zeros(1, 1, dim))

    def forward(self, sentence):
        '''Compute log-probabilities over tags for every word of a sentence.

        Args:
            sentence: non-empty sequence of ``(word_index, char_indices)``
                pairs, where ``char_indices`` is a non-empty list of ints.

        Returns:
            torch.Tensor: shape ``(len(sentence), tagset_size)``; each row
            holds the log-softmax tag scores of one word.
        '''
        word_idxs = []
        lstm_char_result = []
        for word in sentence:
            word_idx, char_idxs = word[0], word[1]
            # Every word is read by the char LSTM from a fresh zero state.
            self.hidden_char = self.init_hidden(self.char_embedding_dim)
            word_idxs.append(word_idx)
            char_embeds = self.char_embeddings(torch.tensor(char_idxs, dtype=torch.long))
            lstm_char_out, self.hidden_char = self.lstm_char(
                char_embeds.view(len(char_idxs), 1, self.char_embedding_dim),
                self.hidden_char,
            )
            # Keep only the output after the word's last character.
            lstm_char_result.append(lstm_char_out[-1])

        # (len(sentence), 1, char_embedding_dim)
        lstm_char_result = torch.stack(lstm_char_result)

        word_embeds = self.word_embeddings(
            torch.tensor(word_idxs, dtype=torch.long)
        ).view(len(sentence), 1, self.word_embedding_dim)

        # Concatenate word-level and char-level features along the feature axis.
        lstm_in = torch.cat((word_embeds, lstm_char_result), 2)

        lstm_out, self.hidden = self.lstm_word(lstm_in, self.hidden)

        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        # Normalise over the tag axis; dim is given explicitly because the
        # implicit choice is deprecated.
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

## Training

In [8]:
model = LSTMTagger(WORD_EMBEDDING_DIM, CAR_EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(char_to_idx), len(tag_to_idx))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(300):
    for sentence, tags in training_data:
        model.zero_grad()

        # Start every sentence from a zero recurrent state so that neither
        # the state nor its autograd history carries over between sentences.
        model.hidden = model.init_hidden(HIDDEN_DIM)

        sentence_in = prepare_sequence(sentence, word_to_idx)

        targets = prepare_target(tags, tag_to_idx)

        tag_scores = model(sentence_in)

        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()


# ======================= TEST

test_sentence = training_data[0][0]
inputs = prepare_sequence(test_sentence, word_to_idx)
# Evaluate from a zero state as well (not the state left by the last
# training sentence) and without recording gradients.
model.hidden = model.init_hidden(HIDDEN_DIM)
with torch.no_grad():
    tag_scores = model(inputs)
for i in range(len(test_sentence)):
    print('{}: {}'.format(test_sentence[i], idx_to_tag[np.argmax(tag_scores[i].numpy())]))



The: DET
dog: NN
ate: V
the: DET
apple: NN
