In [3]:
import argparse
import csv
import os
import os
import pickle
import random
import sys
import unittest

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
from sklearn.metrics import accuracy_score
from torch.autograd import Variable
def load_embedding(filename='glove.6B.50d.txt'):
    """
    Load embedding for the training and
    :return: dataframe, words
    """
    # creat column names
    num = np.arange(51)
    num_str = list(map(str, num))
    list_name = list(map(lambda x: "dim_" + x, num_str))
    df = pd.read_csv("glove.6B.50d.txt", sep=" ", quoting=csv.QUOTE_NONE,
                     header=None, encoding='utf-8',
                     names=list_name)
    df.rename({'dim_0': 'token'}, axis=1, inplace=True)
    words = df.token.to_list()
    # add padding embedding
    df.loc['<PAD>'] = np.zeros(50)
    df.set_index('token', inplace=True)
    df.to_pckle("glove.pkl")
    return df, words
def word_to_embedding(target_vocab, pre_train):
    """

    :param pre_train: pd.DataFrame pre-trained dataframe
    :param target_vocab: list/ array of tokens need to be transformed
    :return: transformed matrix, result dictionary for the unique tokens
    """
    matrix_len = len(target_vocab)
    weighted_matrix = np.zeros((matrix_len + 1, 50))
    words_found = 0
    for i, word in enumerate(target_vocab):
        try:
            weighted_matrix[i] = pre_train.loc[word]
            words_found += 1
        except KeyError:
            weighted_matrix[i] = np.random.normal(size=50)
        if i % 1000 == 0:
            print("Finished {}th words".format(i))
    return weighted_matrix
def create_emb_layer(weighted_matrix1, non_trainable=False):
    """

    :param weighted_matrix1: tensor matrix
    :param non_trainable:
    :return: emb_layer type embedding
    """
    input_shape, embedding_dim = weighted_matrix1.shape
    emb_layer = nn.Embedding.from_pretrained(weighted_matrix1,
                                             padding_idx=input_shape - 1)
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer
def split_text(text_file):
    """

    :param text_file: training file
    :return: DIC, TOKENS and TAGS

    """
    with open(text_file, mode="r") as file:
        text_f = file.read()
        text_f_lst = text_f.split()
        file.close()
    keys, values = text_f_lst[::2], text_f_lst[1::2]
    result_dic = dict(zip(keys, values))
    return result_dic, keys, values
def prepare_seq(seq_list, dictionary):
    """
    embedding and padded a sequence, given its relating dictionary
    :return: padded sequence in numerical numbers
    """
    embedded = []
    for batch in seq_list:
        empty_lst = [dictionary[tag] for tag in batch]
        embedded.append(empty_lst)
    # convert to a list of tensor
    embedded = [torch.tensor(seq) for seq in embedded]
    padded = nn.utils.rnn.pad_sequence(embedded,
                                       batch_first=True,
                                       padding_value=dictionary['<PAD>'])
    return padded

In [28]:
class LSTM(nn.Module):
    def __init__(self, nb_layers, batch_size, nb_lstm_units, embedding_layer,
                 bidirectional=False,
                 dropout=0,
                 embedding_dim=50):
        super(LSTM, self).__init__()
        self.hidden_layer = None
        self.result_dic, self.words_lst, self.tags_lst = split_text("wsj1-18.training")
        self.vocab = dict(zip(sorted(set(self.words_lst)),
                              np.arange(len(set(self.words_lst)))))
        self.tags = dict(zip(sorted(set(self.tags_lst)),
                             np.arange(len(set(self.tags_lst)))))
        self.vocab['<PAD>'] = len(set(self.words_lst))
        self.tags['<PAD>'] = len(set(self.tags_lst))
        self.padding_idx = self.vocab['<PAD>']
        self.nb_layers = nb_layers
        self.batch_size = batch_size
        self.nb_lstm_units = nb_lstm_units
        self.embedding_dim = embedding_dim
        self.embedding_layer = embedding_layer
        self.bidirectional = bidirectional
        self.dropout = dropout if nb_layers > 1 else 0
        self.dropout_layer = nn.Dropout(self.dropout)
        # don't count the pad for the tags
        self.nb_tags = len(self.tags) - 1

        # build actual NN
        self.__build_model()

    def __build_model(self):
        # design LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            batch_first=True,
            num_layers=self.nb_layers,
            bidirectional=self.bidirectional,
            dropout=self.dropout
        )

        # output layer which project back to tag space
        self.hidden_to_tag = nn.Linear(self.nb_lstm_units * 2
                                       if self.bidirectional
                                       else self.nb_lstm_units
                                       , self.nb_tags)

    def forward(self, input):
        # init hidden layers and input sequence length
        h0 = torch.rand(self.nb_layers, input.size(0), self.nb_lstm_units)
        c0 = torch.rand(self.nb_layers, input.size(0), self.nb_lstm_units)
        input_lengths = torch.all(input != self.padding_idx, dim=2)\
            .sum(dim=1).flatten()

        # -------------------
        # 1. embed the input
        # Dim transformation: (batch_size, seq_len, 1) -> (batch_size, seq_len,
        # embedding_dim)
        input = self.dropout_layer(self.embedding_layer(input))
        input = input.squeeze(2)
        # -------------------
        # 2.  Run through LSTM
        # Dim transformation: (B,L, embedding_dim) -> (B, L, LSTM_units)
        input = torch.nn.utils.rnn.pack_padded_sequence(input,
                                                        input_lengths,
                                                        batch_first=True,
                                                        enforce_sorted=False)
        # now run through LSTM
        input = input.float()
        out, (h0, c0) = self.lstm(input, (h0, c0))  # undo the packing operation
        out, len_unpacked = nn.utils.rnn.pad_packed_sequence(out,
                                                             batch_first=True)
        # -------------------
        # 3.  Apply FC linear layer
        # linear layer
        out = out.view(-1,
                       out.size(-1))  # (batch_size, seq_len, nb_lstm_units) -> (batch_size * seq_len, nb_lstm_units)
        out = self.hidden_to_tag(out)  # (batch_size * seq_len, nb_lstm_units) -> (batch_size * seq_len, nb_tags)

        # reshape into (batch_size,  seq_len, nb_lstm_units)
        out = out.view(self.batch_size, -1, self.nb_tags)
        # -------------------
        # 4.  softmax to transfer it to probability
        Y_hat = F.log_softmax(out.float(), dim=2)
        return Y_hat

    def loss(self, Y_hat, Y):
        # NLL(tensor log_softmax output, target index list)
        # flatten out all labels
        Y = prepare_seq(Y, self.tags)
        Y = Y.flatten()
        # flatten all predictions
        Y_hat = Y_hat.view(-1, len(self.tags) - 1)
        # create a mask that filter '<PAD>;
        tag_token = self.tags['<PAD>']
        mask = (Y < tag_token)
        mask_idx = torch.nonzero(mask.float())
        Y_hat = Y_hat[mask_idx].squeeze(1)
        Y = Y[mask_idx].squeeze(1)
        loss = nn.NLLLoss()
        result = loss(Y_hat, Y)
        return result

LSTM model

In [163]:
### transform text into list of words
df = pd.read_pickle('glove.pkl')
_, words_lst, tags_lst = split_text("wsj1-18.training")
tags = dict(zip(sorted(set(tags_lst)), np.arange(len(set(tags_lst)))))
tags['<PAD>'] = len(tags)
weighted_matrix = torch.load("weighed_matrix.pt")
embedding_layer_const = create_emb_layer(weighted_matrix)
nb_layers = 2
nb_lstm_units = 32
batch_size = 3
seq_len = 4
padding_idx = 912344
toy_training = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

In [172]:
batch_in = torch.tensor([[[4],
                          [5],
                          [912344],
                          [912344]],

                         [[6],
                          [912344],
                          [912344],
                          [912344]],

                         [[7],
                          [8],
                          [9],
                          [10]]])
Y = [["CC", "CD", "DT"], ["EX"], ["JJ", "IN", "JJ", "JJR"]]
model = LSTM(nb_layers=nb_layers,
             batch_size=32,
             nb_lstm_units=nb_lstm_units,
             embedding_layer=embedding_layer_const,
             bidirectional=False)
iter1 = iter(train_loader)
out = model(features)

AttributeError: 'tuple' object has no attribute 'size'

In [166]:
out.shape

torch.Size([3, 4, 45])

In [167]:
out.max(dim=2, keepdim=True)[1]

tensor([[[32],
         [32],
         [32],
         [20]],

        [[13],
         [14],
         [14],
         [20]],

        [[13],
         [14],
         [14],
         [14]]])

In [127]:
out = model(batch_in)
# out = out.max(dim=2, keepdim=True)[1]
Y = [["CC", "CD", "DT"], ["EX"], ["JJ", "IN", "JJ", "JJR"]]
# NLL(tensor log_softmax output, target index list)
# flatten out all labels
Y = prepare_seq(Y, tags)
Y = Y.flatten()
# flatten all predictions
out = out.view(-1, len(tags) - 1)
# create a mask that filter '<PAD>;
tag_token = tags['<PAD>']
mask = (Y < tag_token)
mask_idx = torch.nonzero(mask.float())
out = out[mask_idx].squeeze(1)
Y = Y[mask_idx].squeeze(1)
loss = nn.NLLLoss()
result = loss(out, Y)

In [129]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

class SquareDataset(Dataset):
     def __init__(self, a=0, b=1):
         super(Dataset, self).__init__()
         assert a <= b
         self.a = a
         self.b = b

     def __len__(self):
         return self.b - self.a + 1

     def __getitem__(self, index):
        assert self.a <= index <= self.b
        return index, index**2

data_train = SquareDataset(a=1,b=64)
data_train_loader = DataLoader(data_train, batch_size=64, shuffle=True)
print(len(data_train))

64


In [190]:
from torch.utils.data import Dataset, DataLoader
class TagDataset(Dataset):
    def __init__(self, train=True):
        """

        :param train: bool, if True read training data, else read testing data
        """
        self.train = train
        if self.train:
            with open("wsj1-18.training", mode="r") as file:
                self.text_f = file.read()
                self.text_f_lst = self.text_f.splitlines()
                file.close()
        else:
            with open("wsj19-21.truth", mode="r") as file:
                self.text_f = file.read()
                self.text_f_lst = self.text_f.splitlines()
                file.close()

    def __len__(self):
        return len(self.text_f_lst)

    def __getitem__(self, item):
        return self.text_f_lst[item].split()[0::2], self.text_f_lst[item].split()[1::2]

from torch.nn.utils.rnn import pad_sequence

In [None]:
vocab = dict(zip(sorted(set(words_lst)), np.arange(len(set(words_lst)))))
vocab['<PAD>'] = 912344
def collate_fn(batch):
    list_sentence = [[vocab[word] for word in sentence] for sentence in batch]
    length_list = [len(sentence) for sentence in list_sentence]
    max_length = max(length_list)
    batch_size = len(list_sentence)
    pad_token = vocab['<PAD>']
    # init tensors of ones with batch_size * max_length
    result = np.ones((batch_size, max_length)) * pad_token
    # populate the result
    for i, length in enumerate(length_list):
        sequence = list_sentence[i]
        result[i][0:length] = sequence
    return torch.from_numpy(result)
toy_training = [
    "The dog ate the apple".split(),
    "Everybody read that book".split()]
collate_fn(toy_training)



KeyboardInterrupt: 

In [211]:

batch_size = 32

# todo: prepare dataloader for text
train_loader = DataLoader(TagDataset(train=True),
                          batch_size=batch_size,
                          collate_fn=collate_fn,
                          shuffle=True)
test_loader = DataLoader(TagDataset(train=False),
                         batch_size=batch_size,
                         collate_fn=collate_fn,
                         shuffle=True)

In [191]:
iter1 = iter(train_loader)

In [192]:
x, y = next(iter1)

TypeError: expected Tensor as element 0 in argument 0, but got tuple

In [226]:
    num = np.arange(51)
    num_str = list(map(str, num))
    list_name = list(map(lambda x: "dim_" + x, num_str))
    df = pd.read_csv("glove.6B.50d.txt", sep=" ", quoting=csv.QUOTE_NONE,
                     header=None, encoding='utf-8',
                     names=list_name)
    df.rename({'dim_0': 'token'}, axis=1, inplace=True)
    words = df.token.to_list()

In [227]:
df.loc['<PAD>'] = np.zeros(50)

ValueError: cannot set a row with mismatched columns

In [229]:
df.set_index('token', inplace=True)

In [231]:
df

Unnamed: 0_level_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_41,dim_42,dim_43,dim_44,dim_45,dim_46,dim_47,dim_48,dim_49,dim_50
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,0.418000,0.249680,-0.41242,0.121700,0.34527,-0.044457,-0.496880,-0.178620,-0.000660,-0.656600,...,-0.298710,-0.157490,-0.347580,-0.045637,-0.442510,0.187850,0.002785,-0.184110,-0.115140,-0.785810
",",0.013441,0.236820,-0.16899,0.409510,0.63812,0.477090,-0.428520,-0.556410,-0.364000,-0.239380,...,-0.080262,0.630030,0.321110,-0.467650,0.227860,0.360340,-0.378180,-0.566570,0.044691,0.303920
.,0.151640,0.301770,-0.16763,0.176840,0.31719,0.339730,-0.434780,-0.310860,-0.449990,-0.294860,...,-0.000064,0.068987,0.087939,-0.102850,-0.139310,0.223140,-0.080803,-0.356520,0.016413,0.102160
of,0.708530,0.570880,-0.47160,0.180480,0.54449,0.726030,0.181570,-0.523930,0.103810,-0.175660,...,-0.347270,0.284830,0.075693,-0.062178,-0.389880,0.229020,-0.216170,-0.225620,-0.093918,-0.803750
to,0.680470,-0.039263,0.30186,-0.177920,0.42962,0.032246,-0.413760,0.132280,-0.298470,-0.085253,...,-0.094375,0.018324,0.210480,-0.030880,-0.197220,0.082279,-0.094340,-0.073297,-0.064699,-0.260440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chanty,0.232040,0.025672,-0.70699,-0.045465,0.13989,-0.628070,0.726250,0.341080,0.446140,0.163290,...,-0.095526,-0.296050,0.385670,0.136840,0.593310,-0.694860,0.124100,-0.180690,-0.258300,-0.039673
kronik,-0.609210,-0.672180,0.23521,-0.111950,-0.46094,-0.007462,0.255780,0.856320,0.055977,-0.237920,...,0.672050,-0.598220,-0.202590,0.392430,0.028873,0.030003,-0.106170,-0.114110,-0.249010,-0.120260
rolonda,-0.511810,0.058706,1.09130,-0.551630,-0.10249,-0.126500,0.995030,0.079711,-0.162460,0.564880,...,0.024747,0.200920,-1.085100,-0.136260,0.350520,-0.858910,0.067858,-0.250030,-1.125000,1.586300
zsombor,-0.758980,-0.474260,0.47370,0.772500,-0.78064,0.232330,0.046114,0.840140,0.243710,0.022978,...,0.454390,-0.842540,0.106500,-0.059397,0.090449,0.305810,-0.614240,0.789540,-0.014116,0.644800
