In [2]:
import unittest
import pickle, argparse, os, sys
from sklearn.metrics import accuracy_score
import csv
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.functional as F
import nltk
import os
def load_embedding(filename='glove.6B.50d.txt', embedding_dim=50):
    """
    Load pre-trained word vectors from a space-separated text file.

    Every line of the file is expected to hold a token followed by
    ``embedding_dim`` numbers.  A zero vector is appended under the index
    label ``'<PAD>'`` and the resulting frame is pickled to ``glove.pkl``
    in the current working directory.

    :param filename: path of the embedding text file
    :param embedding_dim: number of vector components per token
        (50 for ``glove.6B.50d.txt``)
    :return: dataframe indexed by token (with the extra ``'<PAD>'`` row),
        list of the tokens read from the file (``'<PAD>'`` not included)
    """
    # create column names: the token followed by dim_1 .. dim_<embedding_dim>
    list_name = ['token'] + ["dim_" + str(i) for i in range(1, embedding_dim + 1)]
    df = pd.read_csv(filename, sep=" ", quoting=csv.QUOTE_NONE, header=None, encoding='utf-8',
                     names=list_name,
                     # keep every token as text: without these two options tokens such as
                     # "null" or "nan" would be read as missing values
                     dtype={'token': str}, keep_default_na=False)
    words = df.token.to_list()
    # Index by token *before* adding the padding row, so the frame has exactly
    # embedding_dim columns and the zero vector lines up with them.
    df = df.set_index('token')
    # add padding embedding
    df.loc['<PAD>'] = np.zeros(embedding_dim)
    df.to_pickle("glove.pkl")
    return df, words
def word_to_embedding(target_vocab, pre_train):
    """
    Build an embedding matrix for ``target_vocab`` from pre-trained vectors.

    Row ``i`` holds the pre-trained vector of ``target_vocab[i]``.  Tokens
    that are missing from ``pre_train`` get a vector drawn from a standard
    normal distribution.  One extra all-zero row is kept at the end of the
    matrix (index ``len(target_vocab)``).

    :param target_vocab: list/ array of tokens that need to be transformed
    :param pre_train: pd.DataFrame of pre-trained vectors, indexed by token,
        one column per embedding dimension
    :return: np.ndarray of shape (len(target_vocab) + 1, number of columns
        of ``pre_train``)
    """
    # take the vector width from the pre-trained frame instead of assuming 50
    embedding_dim = pre_train.shape[1]
    matrix_len = len(target_vocab)
    # the "+ 1" row is never written below and therefore stays all zeros
    weighted_matrix = np.zeros((matrix_len + 1, embedding_dim))
    for i, word in enumerate(target_vocab):
        try:
            weighted_matrix[i] = pre_train.loc[word]
        except KeyError:
            # token not covered by the pre-trained vectors: random initialisation
            weighted_matrix[i] = np.random.normal(size=embedding_dim)
        if i % 1000 == 0:
            print("Finished {}th words".format(i))
    return weighted_matrix
def create_emb_layer(weighted_matrix, non_trainable=False):
    """
    Create an ``nn.Embedding`` layer initialised with ``weighted_matrix``.

    The last row of the matrix is registered as the padding index.

    :param weighted_matrix: 2-D tensor (or array-like convertible with
        ``torch.as_tensor``) of shape (num_embeddings, embedding_dim)
    :param non_trainable: when True the embedding weights are frozen
        (``requires_grad = False``)
    :return: emb_layer, input_shape, embedding_dim
    """
    weights = torch.as_tensor(weighted_matrix)
    input_shape, embedding_dim = weights.shape
    # nn.Embedding.from_pretrained takes the weight tensor itself, not the
    # num_embeddings/embedding_dim keywords; build the layer with the regular
    # constructor and copy the weights in afterwards.
    emb_layer = nn.Embedding(num_embeddings=input_shape,
                             embedding_dim=embedding_dim,
                             padding_idx=input_shape - 1)
    print(input_shape, embedding_dim)
    # load_state_dict copies the values into the layer's own parameter, so the
    # caller's matrix is not shared with the layer.
    emb_layer.load_state_dict({'weight': weights})
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, input_shape, embedding_dim
def split_text(text_file):
    """
    Read a whitespace-separated file of alternating token / tag entries.

    :param text_file: training file
    :return: dict mapping each token to its tag (a later occurrence of a
        token overwrites an earlier one), list of tokens, list of tags
    """
    with open(text_file, mode="r") as handle:
        fields = handle.read().split()
    # even positions are tokens, odd positions are their tags
    tokens = fields[0::2]
    tags = fields[1::2]
    token_to_tag = {}
    for token, tag in zip(tokens, tags):
        token_to_tag[token] = tag
    return token_to_tag, tokens, tags

LSTM model

In [None]:
class ToyLSTM(nn.Module):
    """
    Toy LSTM tagger skeleton.

    The constructor reads the token / tag training file with ``split_text``,
    builds the index dictionaries ``vocab`` and ``tags`` (index 0 is reserved
    for ``'<PAD>'`` in both) and creates the LSTM and the linear projection to
    tag space.  ``forward`` currently only embeds its input; the LSTM and the
    projection steps are still disabled.
    """

    def __init__(self, nb_layers, batch_size, nb_lstm_units, embedding_dim=50, embedding_layer=None,
                 training_file="wsj1-18.training"):
        """
        :param nb_layers: number of stacked LSTM layers
        :param batch_size: default batch size used by ``init_hidden``
        :param nb_lstm_units: hidden size of the LSTM
        :param embedding_dim: width of the embedding vectors fed to the LSTM
        :param embedding_layer: embedding module applied in ``forward``.  When
            omitted, the module-level name ``embedding_layer_const`` is looked
            up at construction time.
        :param training_file: token / tag file passed to ``split_text``
        :raises ValueError: if no embedding layer is given and
            ``embedding_layer_const`` is not defined
        """
        super(ToyLSTM, self).__init__()
        self.result_dic, self.words_lst, self.tags_lst = split_text(training_file)
        unique_words = sorted(set(self.words_lst))
        unique_tags = sorted(set(self.tags_lst))
        # indices start at 1; 0 is reserved for padding
        self.vocab = dict(zip(unique_words, np.arange(1, len(unique_words) + 1)))
        self.tags = dict(zip(unique_tags, np.arange(1, len(unique_tags) + 1)))
        self.vocab['<PAD>'] = 0
        self.tags['<PAD>'] = 0

        self.nb_layers = nb_layers
        self.batch_size = batch_size
        self.nb_lstm_units = nb_lstm_units
        self.embedding_dim = embedding_dim

        # A default argument is evaluated when the class is defined, which
        # fails if the global embedding layer has not been created yet.
        # Resolve the fallback lazily, and use the caller's layer when given.
        if embedding_layer is None:
            try:
                embedding_layer = embedding_layer_const
            except NameError:
                raise ValueError(
                    "embedding_layer was not given and no global "
                    "'embedding_layer_const' is defined"
                ) from None
        self.embedding_layer = embedding_layer

        # don't count the pad for the tags
        self.nb_tags = len(self.tags) - 1

        # build actual NN
        self.__build_model()

    def __build_model(self):
        """Create the LSTM and the hidden-to-tag projection layer."""
        # design LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            batch_first=True,
            num_layers=self.nb_layers,
        )

        # output layer which project back to tag space
        self.hidden_to_tag = nn.Linear(self.nb_lstm_units, self.nb_tags)

    def init_hidden(self, batch_size=None):
        """
        Create a random initial (hidden state, cell state) pair.

        :param batch_size: batch dimension of the states; defaults to the
            ``batch_size`` given to the constructor
        :return: two tensors of shape (nb_layers, batch_size, nb_lstm_units)
        """
        if batch_size is None:
            batch_size = self.batch_size
        # torch.random is a module, not a function; torch.randn draws the values
        hidden_a = torch.randn(self.nb_layers, batch_size, self.nb_lstm_units)
        hidden_b = torch.randn(self.nb_layers, batch_size, self.nb_lstm_units)
        return hidden_a, hidden_b

    def forward(self, X, X_lengths):
        """
        Embed a batch of token indices.

        :param X: integer tensor of token indices whose first dimension is the
            batch dimension
        :param X_lengths: sequence lengths; not used while the packing step
            below is disabled
        :return: output of the embedding layer for ``X`` (for ``nn.Embedding``
            this is the shape of ``X`` followed by the embedding dimension)
        """
        # reset the LSTM hidden state. Must be done before you run a new batch. Otherwise LSTM will treat a batch as a
        # continuation of a sequence. The state is sized from the actual batch so a smaller last batch also works.
        self.hidden_layer = self.init_hidden(batch_size=X.size(0))

        # -------------------
        # 1. embed the input
        X = self.embedding_layer(X)

        # -------------------
        # 2.  Run through LSTM  (disabled)
        # Dim transformation: (B, L, embedding_dim) -> (B, L, LSTM_units)
        # pack padded items so that they are not shown to the LSTM
        ##X = torch.nn.utils.rnn.pack_padded_sequence(X, X_lengths, batch_first=True)

        # now run through LSTM
        ##X, self.hidden_layer = self.lstm(X, self.hidden_layer)

        # undo the packing operation
        ##X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)

        # -------------------
        # 3. Project to the tag space  (not implemented yet: self.hidden_to_tag is unused)
        return X

In [2]:
# Build the embedding matrix for the training tokens from the pickled GloVe frame.
# NOTE(review): words_lst holds every token occurrence in the training file (it is
# not de-duplicated), so the matrix gets one row per occurrence, whereas ToyLSTM
# indexes tokens through a sorted, de-duplicated vocabulary.
df = pd.read_pickle('glove.pkl')
_, words_lst, __ = split_text("wsj1-18.training")
weighted_matrix = torch.from_numpy(
    word_to_embedding(target_vocab=words_lst, pre_train=df)
)

Finished 0th words
Finished 1000th words
Finished 2000th words
Finished 3000th words
Finished 4000th words
Finished 5000th words
Finished 6000th words
Finished 7000th words
Finished 8000th words
Finished 9000th words
Finished 10000th words
Finished 11000th words
Finished 12000th words
Finished 13000th words
Finished 14000th words
Finished 15000th words
Finished 16000th words
Finished 17000th words
Finished 18000th words
Finished 19000th words
Finished 20000th words
Finished 21000th words
Finished 22000th words
Finished 23000th words
Finished 24000th words
Finished 25000th words
Finished 26000th words
Finished 27000th words
Finished 28000th words
Finished 29000th words
Finished 30000th words
Finished 31000th words
Finished 32000th words
Finished 33000th words
Finished 34000th words
Finished 35000th words
Finished 36000th words
Finished 37000th words
Finished 38000th words
Finished 39000th words
Finished 40000th words
Finished 41000th words
Finished 42000th words
Finished 43000th words
F

In [3]:
# Replace weighted_matrix with the object stored in "weighed_matrix.pt".
# NOTE(review): the cell that wrote this file is not part of the notebook;
# presumably it holds the matrix built by word_to_embedding - confirm.
weighted_matrix = torch.load("weighed_matrix.pt")
# last expression of the cell: display the shape of the loaded tensor
weighted_matrix.shape

torch.Size([912345, 50])

In [5]:
# 912344 is the last row of the (912345, 50) matrix; word_to_embedding leaves
# its extra trailing row at zero and create_emb_layer uses the last row as padding_idx.
weighted_matrix[912344]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.], dtype=torch.float64)

In [None]:
# Build the shared embedding layer; x and y receive the number of rows and the
# embedding dimension returned by create_emb_layer.
# ToyLSTM refers to the module-level name embedding_layer_const, so run this
# cell before the ToyLSTM cell to make sure the name exists.
embedding_layer_const, x, y = create_emb_layer(weighted_matrix)