# Table of Contents
1. [Imports](#Imports)
2. [Data Read In](#Data-Read-in)

## Imports
[back to top](#Table-of-Contents)

In [1]:
import argparse
import csv
import os
import pickle
import random
import sys
import unittest

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from sklearn.metrics import accuracy_score
from torch.autograd import Variable

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.has_mps:
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [21]:
# Part 1
def prepare_sequence(seq, to_ix):
    """Input: takes in a list of words, and a dictionary containing the index of the words
    Output: a tensor containing the indexes of the word"""
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)
# This is the example training data
training_data = [
    ("the dog happily ate the big apple".split(), ["DET", "NN", "ADV", "V", "DET", "ADJ", "NN"]),
    ("everybody read that good book quietly in the hall".split(), ["NN", "V", "DET", "ADJ", "NN", "ADV", "PRP", "DET", "NN"]),
    ("the old head master sternly scolded the naughty children for \
     being very loud".split(), ["DET", "ADJ", "ADJ", "NN", "ADV", "V", "DET", "ADJ",  "NN", "PRP", "V", "ADJ", "NN"]),
    ("i love you loads".split(), ["PRN", "V", "PRN", "ADV"])
]
#  These are other words which we would like to predict (within sentences) using the model
other_words = ["area", "book", "business", "case", "child", "company", "country",
               "day", "eye", "fact", "family", "government", "group", "hand", "home",
               "job", "life", "lot", "man", "money", "month", "mother", "food", "night",
               "number", "part", "people", "place", "point", "problem", "program",
               "question", "right", "room", "school", "state", "story", "student",
               "study", "system", "thing", "time", "water", "way", "week", "woman",
               "word", "work", "world", "year", "ask", "be", "become", "begin", "can",
               "come", "do", "find", "get", "go", "have", "hear", "keep", "know", "let",
               "like", "look", "make", "may", "mean", "might", "move", "play", "put",
               "run", "say", "see", "seem", "should", "start", "think", "try", "turn",
               "use", "want", "will", "work", "would", "asked", "was", "became", "began",
               "can", "come", "do", "did", "found", "got", "went", "had", "heard", "kept",
               "knew", "let", "liked", "looked", "made", "might", "meant", "might", "moved",
               "played", "put", "ran", "said", "saw", "seemed", "should", "started",
               "thought", "tried", "turned", "used", "wanted" "worked", "would", "able",
               "bad", "best", "better", "big", "black", "certain", "clear", "different",
               "early", "easy", "economic", "federal", "free", "full", "good", "great",
               "hard", "high", "human", "important", "international", "large", "late",
               "little", "local", "long", "low", "major", "military", "national", "new",
               "old", "only", "other", "political", "possible", "public", "real", "recent",
               "right", "small", "social", "special", "strong", "sure", "true", "white",
               "whole", "young", "he", "she", "it", "they", "i", "my", "mine", "your", "his",
               "her", "father", "mother", "dog", "cat", "cow", "tiger", "a", "about", "all",
               "also", "and", "as", "at", "be", "because", "but", "by", "can", "come", "could",
               "day", "do", "even", "find", "first", "for", "from", "get", "give", "go",
               "have", "he", "her", "here", "him", "his", "how", "I", "if", "in", "into",
               "it", "its", "just", "know", "like", "look", "make", "man", "many", "me",
               "more", "my", "new", "no", "not", "now", "of", "on", "one", "only", "or",
               "other", "our", "out", "people", "say", "see", "she", "so", "some", "take",
               "tell", "than", "that", "the", "their", "them", "then", "there", "these",
               "they", "thing", "think", "this", "those", "time", "to", "two", "up", "use",
               "very", "want", "way", "we", "well", "what", "when", "which", "who", "will",
               "with", "would", "year", "you", "your"]


In [25]:
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix.keys():
            word_to_ix[word] = len(word_to_ix)
for word in other_words:
    if word not in word_to_ix.keys():
        word_to_ix[word] = len(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2, "ADJ": 3, "ADV": 4, "PRP": 5, "PRN": 6}
EMBEDDING_DIM = 64
HIDDEN_DIM = 64

In [37]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, target_size)
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_score = F.log_softmax(tag_space, dim = 1)
        return tag_score

In [38]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [43]:
# test a sentence
seq1 = "everybody read the book and ate the food".split()
seq2 = "she like my dog".split()
print("Running a sample tenset \n Sentence:\n {} \n {}".format(" ".join(seq1),
                                                               " ".join(seq2)))
with torch.no_grad():
    for seq in [seq1, seq2]:
        inputs = prepare_sequence(seq, word_to_ix)
        tag_score = model(inputs)
        max_indices = tag_score.max(dim=1)[1]
        ret = []
        # reverse tag_to_ix
        reverse_tag_index = {v: k for k, v in tag_to_ix.items()}
        for i in range(len(max_indices)):
            idx = int(max_indices[i])
            ret.append((seq[i], reverse_tag_index[idx]))
        print(ret)

Running a sample tenset 
 Sentence:
 everybody read the book and ate the food 
 she like my dog
[('everybody', 'ADJ'), ('read', 'ADJ'), ('the', 'ADJ'), ('book', 'ADJ'), ('and', 'ADJ'), ('ate', 'ADJ'), ('the', 'ADJ'), ('food', 'PRP')]
[('she', 'ADJ'), ('like', 'ADJ'), ('my', 'PRP'), ('dog', 'ADJ')]


In [44]:
# Train
losses = []
for epoch in range(300):
    count = 0
    sum_loss = 0
    for sentence, tags in training_data:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        out = model(sentence_in)
        loss = loss_function(out, targets)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        count += 1
        sum_loss += loss
        losses.append(sum_loss / count)
    print("Epoch: {}, Loss {}".format(epoch, losses[-1]))
print("Train Finished")

Epoch: 0, Loss 1.9361555576324463
Epoch: 1, Loss 1.891072154045105
Epoch: 2, Loss 1.8486526012420654
Epoch: 3, Loss 1.8079586029052734
Epoch: 4, Loss 1.7680761814117432
Epoch: 5, Loss 1.7283666133880615
Epoch: 6, Loss 1.688429355621338
Epoch: 7, Loss 1.6480318307876587
Epoch: 8, Loss 1.6070561408996582
Epoch: 9, Loss 1.5654633045196533
Epoch: 10, Loss 1.5232644081115723
Epoch: 11, Loss 1.4805009365081787
Epoch: 12, Loss 1.4372317790985107
Epoch: 13, Loss 1.393524408340454
Epoch: 14, Loss 1.349454402923584
Epoch: 15, Loss 1.305105447769165
Epoch: 16, Loss 1.2605704069137573
Epoch: 17, Loss 1.215951681137085
Epoch: 18, Loss 1.1713601350784302
Epoch: 19, Loss 1.1269131898880005
Epoch: 20, Loss 1.0827335119247437
Epoch: 21, Loss 1.0389459133148193
Epoch: 22, Loss 0.9956763982772827
Epoch: 23, Loss 0.9530502557754517
Epoch: 24, Loss 0.9111890196800232
Epoch: 25, Loss 0.8702093958854675
Epoch: 26, Loss 0.8302193880081177
Epoch: 27, Loss 0.7913168668746948
Epoch: 28, Loss 0.7535870671272278
E

In [47]:
# predict function
def predict_seq(seq_list, model):
    """

    :param seq_list: list of sequences
    :param model: NN model
    :return: tuple predictions
    """
    with torch.no_grad():
        for seq in seq_list:
            inputs = prepare_sequence(seq, word_to_ix)
            tags_score = model(inputs)
            max_indices = tags_score.max(dim=1)[1]
            pred = []
            reverse_tag_index = {v: k for k, v in tag_to_ix.items()}
            for i in range(len(max_indices)):
                idx = int(max_indices[i])
                pred.append(reverse_tag_index[idx])
            print("Sequence: {} \n"
              "Tag Prediction: {}\n".format(seq, pred))

In [48]:
# test on unkown data
predict_seq([seq1, seq2], model)

Sequence: ['everybody', 'read', 'the', 'book', 'and', 'ate', 'the', 'food'] 
Tag Prediction: ['NN', 'V', 'DET', 'NN', 'NN', 'V', 'DET', 'NN']

Sequence: ['she', 'like', 'my', 'dog'] 
Tag Prediction: ['NN', 'NN', 'V', 'NN']



## Data Read in
[back to top](#Table-of-Contents)

In [33]:
def split_text(text_file, by_line=False):
    """

    :param by_line: bool, whether to split by lines; if False, split by word
    :param text_file: training file
    :return: DIC, TOKENS and TAGS

    """
    if by_line == False:
        with open(text_file, mode="r") as file:
            text_f = file.read()
            text_f_lst = text_f.split()
            file.close()
        keys, values = text_f_lst[::2], text_f_lst[1::2]
        result_dic = dict(zip(keys, values))
        return result_dic, keys, values
    else:
        with open(text_file, mode="r") as file:
            text_f = file.read()
            text_f_lst = text_f.splitlines()
            file.close()
        keys = [line.split()[::2] for line in text_f_lst]
        values = [line.split()[1::2] for line in text_f_lst]
        # result_dic = dict(zip(keys, values))
        return keys, values
# create a list of list of tuples for training data
def combine_lists(vocab_list, tags_list):
    """

    :param vocab_list: list of sentence
    :param tags_list:
    :return: list of list of sentence of words tuples e.g. [[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.')]]
    """
    result = []
    for i in range(len(vocab_list)):
        sentence, tags = vocab_list[i], tags_list[i]
        zipped = zip(sentence, tags)
        result.append(list(zipped))
    return result

In [34]:
vocab_list, tags_list = split_text("wsj1-18.training", by_line=True)
train_list = combine_lists(vocab_list, tags_list)

### Construct dictionary
1. A word/tag dictionary
2. A letter/character dictionary
3. A POS tag dictionary


In [11]:
def word_to_idx(word, ix):


In [32]:
[print(i) for i in range(4)]

0
1
2
3


[None, None, None, None]