In [1]:
import json
import numpy as np
import pandas as pd
from collections import Counter
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
import gc
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk
from torch.utils.data import Dataset,TensorDataset, DataLoader, random_split

In [71]:
def get_chinese_embeddings(path, vocabulary, word2id, emb_size=300):
    """Build an embedding matrix for ``vocabulary`` from a pretrained vector file.

    The file is read through the notebook-level archive handle ``z``
    (``z.open(path)``), which must yield UTF-8 encoded byte lines of the form
    ``word v1 v2 ... vN``.

    Args:
        path: Member name handed to ``z.open``.
        vocabulary: Container of words to keep (only tested with ``in``).
        word2id: Mapping from a word to its row index in the returned matrix.
        emb_size: Number of components expected per vector.

    Returns:
        ``numpy.ndarray`` of shape ``(len(word2id), emb_size)``. Rows of words
        that never appear in the file, or whose line is malformed, stay zero.
    """
    w_emb = np.zeros((len(word2id), emb_size))
    with z.open(path) as f:
        for raw_line in f:
            fields = raw_line.decode('utf-8').split()
            # Blank lines have no word at all; indexing fields[0] would crash.
            if not fields:
                continue

            word, values = fields[0], fields[1:]
            if word not in vocabulary or word not in word2id:
                continue

            # A line with the wrong number of components (e.g. a "count dim"
            # header, or a token containing spaces) must be skipped explicitly:
            # a single value would otherwise be broadcast over the whole row.
            if len(values) != emb_size:
                continue

            try:
                emb = np.array(values).astype(np.float32)
            except ValueError:
                # Non-numeric component: ignore this line only.
                continue

            w_emb[word2id[word]] += emb
    return w_emb

# torchrua is very handy: it is used here to pack, pad and reverse variable-length sequences

In [117]:
from torchrua import reverse_packed_sequence, pad_packed_sequence, pack_padded_sequence


In [120]:
class ELMO(nn.Module):
    """ELMo-style encoder: two stacked LSTM layers run in both directions.

    Each direction has its own pair of unidirectional LSTMs. A residual
    connection adds the layer-1 input to the layer-1 output before layer 2.
    The final representation is a softmax-weighted sum of three 600-dim
    views of every token: the duplicated word embedding, the layer-1 states
    and the layer-2 states (forward and backward halves concatenated on the
    feature axis), scaled by ``self.lamda``.

    Args:
        embeddings: Optional ``(vocab_size, 300)`` table (numpy array or
            tensor) of fixed word vectors. It can also be supplied per call.
    """

    def __init__(self, embeddings=None):
        super(ELMO, self).__init__()

        # Global scale applied to the mixed representation.
        self.lamda = 1
        # Registered as a Parameter so the mixing weights are trained, saved
        # and moved between devices together with the rest of the module.
        self.layer_weights = nn.Parameter(torch.randn(3))

        self.word_embeds = 300
        self.hidden_dim = 300

        self.lstm_forward_1 = nn.LSTM(self.word_embeds, self.hidden_dim,
                            num_layers=1, bidirectional=False, batch_first = True)
        self.lstm_forward_2 = nn.LSTM(self.word_embeds, self.hidden_dim,
                            num_layers=1, bidirectional=False, batch_first = True)
        self.lstm_backward_1 = nn.LSTM(self.word_embeds, self.hidden_dim,
                            num_layers=1, bidirectional=False, batch_first = True)
        self.lstm_backward_2 = nn.LSTM(self.word_embeds, self.hidden_dim,
                            num_layers=1, bidirectional=False, batch_first = True)

        # Fixed (non-trainable) word vectors; a buffer follows .to()/.cuda().
        if embeddings is not None:
            embeddings = torch.as_tensor(embeddings)
        self.register_buffer("fixed_embeddings", embeddings)

    def forward(self, sequence, maskings=None, embeddings=None):
        """Encode a batch of padded word-id sequences.

        Args:
            sequence: ``(batch, seq_len)`` integer tensor of word ids.
            maskings: Optional ``(batch, seq_len)`` mask whose entries equal
                to 1 mark real tokens. Real tokens are assumed to come first
                in each row (padding at the end). When omitted, every
                position is treated as a real token.
            embeddings: Optional embedding table overriding the one given to
                the constructor.

        Returns:
            ``(weighted_representation, (double_embedding, firstLayer_output,
            secondLayer_output))``; every tensor is
            ``(batch, seq_len, 2 * hidden_dim)``. LSTM states at padded
            positions are zero.

        Raises:
            ValueError: If no embedding table is available.
            RuntimeError: Raised by PyTorch when a sequence has length 0.
        """
        batch_size, seq_length = sequence.size(0), sequence.size(1)

        # Fixed embeddings for the batch, matched to the module's device/dtype.
        embedding_sequence = self.get_embeddings(sequence, embeddings).to(
            device=self.layer_weights.device, dtype=self.layer_weights.dtype)

        # Real length of every sequence (packing requires a CPU int64 tensor).
        if maskings is None:
            lengths = torch.full((batch_size,), seq_length, dtype=torch.long)
        else:
            lengths = torch.as_tensor(maskings).eq(1).sum(dim=1).long().cpu()

        # Index that reverses each row inside its own length and leaves the
        # padded tail where it is; applying it twice restores the input.
        positions = torch.arange(seq_length).unsqueeze(0)
        row_lengths = lengths.unsqueeze(1)
        reverse_index = torch.where(
            positions < row_lengths,
            row_lengths - 1 - positions,
            positions.expand(batch_size, -1),
        ).to(embedding_sequence.device)

        reversed_embedding = self._reverse_within_lengths(embedding_sequence, reverse_index)

        # 1st layer. Initial states default to zeros so that the encoder is
        # deterministic for a given input.
        forward_output_1 = self._run_lstm(
            self.lstm_forward_1, embedding_sequence, lengths, seq_length)
        backward_output_1 = self._run_lstm(
            self.lstm_backward_1, reversed_embedding, lengths, seq_length)

        # Residual connection between the two layers.
        forward_input_2 = forward_output_1 + embedding_sequence
        backward_input_2 = backward_output_1 + reversed_embedding

        # 2nd layer.
        forward_output_2 = self._run_lstm(
            self.lstm_forward_2, forward_input_2, lengths, seq_length)
        backward_output_2 = self._run_lstm(
            self.lstm_backward_2, backward_input_2, lengths, seq_length)

        # Put the backward states back in reading order and join the two
        # directions on the feature axis.
        double_embedding = torch.cat((embedding_sequence, embedding_sequence), dim=-1)
        firstLayer_output = torch.cat(
            (forward_output_1,
             self._reverse_within_lengths(backward_output_1, reverse_index)), dim=-1)
        secondLayer_output = torch.cat(
            (forward_output_2,
             self._reverse_within_lengths(backward_output_2, reverse_index)), dim=-1)

        # Softmax-normalised mix of the three views.
        weights = F.softmax(self.layer_weights, dim=0)
        weighted_representation = self.lamda * (
            weights[0] * double_embedding
            + weights[1] * firstLayer_output
            + weights[2] * secondLayer_output
        )

        return (weighted_representation,(double_embedding, firstLayer_output, secondLayer_output))

    @staticmethod
    def _reverse_within_lengths(tensor, reverse_index):
        """Reorder the time axis of ``(batch, seq_len, dim)`` by ``reverse_index``."""
        return tensor.gather(1, reverse_index.unsqueeze(-1).expand_as(tensor))

    @staticmethod
    def _run_lstm(lstm, inputs, lengths, total_length):
        """Run ``lstm`` over the real tokens only and return a padded output."""
        packed = nn.utils.rnn.pack_padded_sequence(
            inputs, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = lstm(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True, total_length=total_length)
        return output

    def get_embeddings(self, sequence, embeddings=None):
        """Look up the fixed word vector of every id in ``sequence``.

        Args:
            sequence: ``(batch, seq_len)`` integer tensor of word ids.
            embeddings: ``(vocab_size, emb_size)`` numpy array or tensor.
                Defaults to the table given to the constructor.

        Returns:
            ``(batch, seq_len, emb_size)`` tensor in the default float dtype.

        Raises:
            ValueError: If no embedding table is available.
        """
        if embeddings is None:
            embeddings = self.fixed_embeddings
        if embeddings is None:
            raise ValueError(
                "No embedding table available: pass `embeddings` to ELMO(...) or to this call.")

        table = torch.as_tensor(embeddings)
        word_ids = torch.as_tensor(sequence).long().to(table.device)

        # One vectorised gather instead of a Python loop over every token.
        return table[word_ids].to(torch.get_default_dtype())


# Minor Test

In [72]:
a = np.array([[1,2,3],[4,5,6]])

In [73]:
a

array([[1, 2, 3],
       [4, 5, 6]])

In [74]:
t = torch.from_numpy(a)

In [84]:
t[0]

tensor([7, 8, 9])

In [85]:
t[0] = torch.Tensor([7,8,9])

In [78]:
t

tensor([[7, 8, 9],
        [4, 5, 6]])

In [79]:
len(t)

2

In [86]:
t[0]

tensor([7, 8, 9])

In [82]:
a = [1,2,3,4,5,6,7,8,9]

In [83]:
a[t[0,1]]

9

In [89]:
# Toy right-padded batch (trailing zeros are padding) and the true length of
# each row, listed longest first.
a = torch.Tensor([[1,2,3,0,0],[4,5,0,0,0],[6,0,0,0,0]])
l = torch.Tensor([3,2,1])
# Packing keeps only the first l[i] items of row i, stored time step by time
# step (see the `data` / `batch_sizes` fields displayed in the next cell).
r = torch.nn.utils.rnn.pack_padded_sequence(a, l, batch_first=True)

In [90]:
r

PackedSequence(data=tensor([1., 4., 6., 2., 5., 3.]), batch_sizes=tensor([3, 2, 1]), sorted_indices=None, unsorted_indices=None)

In [None]:
r = torch.nn.utils.rnn.pack_padded_sequence(a, l, batch_first=True)

In [91]:
# Flipping the padded tensor reverses the padding too: the zeros end up at the
# front of each row, so this is NOT a per-sequence reversal.
torch.flip(a,dims = [1])

tensor([[0., 0., 3., 2., 1.],
        [0., 0., 0., 5., 4.],
        [0., 0., 0., 0., 6.]])

PackedSequence(data=tensor([0., 0., 0., 0., 0., 3.]), batch_sizes=tensor([3, 2, 1]), sorted_indices=None, unsorted_indices=None)

In [94]:
# Deliberate failure: torch.Tensor needs rectangular nested lists, so rows of
# different lengths raise the ValueError shown below. Ragged data has to be
# padded first (e.g. with torch.nn.utils.rnn.pad_sequence).
g = torch.Tensor([[1,2,3],[4,5],[6]])

ValueError: expected sequence of length 3 at dim 1 (got 2)

In [96]:
import torchrua


ModuleNotFoundError: No module named 'torchrua'

In [97]:
# NOTE(review): the package name is misspelled here ("torchura"); the module
# imported elsewhere in this notebook is "torchrua", which is why pip finds no
# matching distribution below.
pip install torchura

[31mERROR: Could not find a version that satisfies the requirement torchura (from versions: none)[0m
[31mERROR: No matching distribution found for torchura[0m
Note: you may need to restart the kernel to use updated packages.


In [99]:
# Fails with the ImportError below: pad_packed_sequence is not exported from
# the top-level torch package. The packing helpers live in torch.nn.utils.rnn.
from torch import pad_packed_sequence

ImportError: cannot import name 'pad_packed_sequence' from 'torch' (/Users/aooscar/anaconda/envs/py3/lib/python3.8/site-packages/torch/__init__.py)

In [100]:
torch.nn.utils.rnn.pack_padded_sequence

<function torch.nn.utils.rnn.pack_padded_sequence(input, lengths, batch_first=False, enforce_sorted=True)>

In [101]:
# torch.nn.utils.rnn has no reverse_packed_sequence (AttributeError below);
# this notebook imports that helper from torchrua instead.
torch.nn.utils.rnn.reverse_packed_sequence

AttributeError: module 'torch.nn.utils.rnn' has no attribute 'reverse_packed_sequence'

In [112]:
# Build a small right-padded batch from three sequences of different lengths
# to try out the torchrua pack / reverse / pad helpers.
import torch
from torch.nn.utils.rnn import pad_sequence
from torchrua import reverse_packed_sequence, pad_packed_sequence, pack_padded_sequence
# pad_sequence pads the shorter sequences with zeros up to the longest one.
pad = pad_sequence([
    torch.arange(5) + 1,
    torch.arange(2) + 1,
    torch.arange(3) + 1,
], batch_first=True)
# True (unpadded) length of each row of `pad`, in the same order.
lengths = torch.tensor([5, 2, 3])

print(pad)
# tensor([[1, 2, 3, 4, 5],
#         [1, 2, 0, 0, 0],
#         [1, 2, 3, 0, 0]])


tensor([[1, 2, 3, 4, 5],
        [1, 2, 0, 0, 0],
        [1, 2, 3, 0, 0]])


In [113]:
# Pack the padded batch, reverse every sequence inside its own length, then
# unpack again: the padding stays at the end of each row (see `reversed_pad`
# displayed further down).
pack = pack_padded_sequence(pad, lengths, batch_first=True)
reversed_pack = reverse_packed_sequence(pack)
# pad_packed_sequence returns a pair; only the padded tensor is kept here.
reversed_pad, _ = pad_packed_sequence(reversed_pack, batch_first=True)

In [114]:
pack

PackedSequence(data=tensor([1, 1, 1, 2, 2, 2, 3, 3, 4, 5]), batch_sizes=tensor([3, 3, 2, 1, 1]), sorted_indices=tensor([0, 2, 1]), unsorted_indices=tensor([0, 2, 1]))

In [118]:
# Experiment: `+` on a packed sequence does NOT add the data element-wise. The
# result displayed further down is a plain 8-element tuple, i.e. the four
# fields of the packed sequence repeated twice. Element-wise arithmetic has to
# be done on the padded tensors instead.
# NOTE: this cell rebinds `reversed_pack`, so it is not idempotent on re-run.
reversed_pack = reversed_pack+reversed_pack

In [116]:
reversed_pad

tensor([[5, 4, 3, 2, 1],
        [2, 1, 0, 0, 0],
        [3, 2, 1, 0, 0]])

In [119]:
reversed_pack

(tensor([5, 3, 2, 4, 2, 1, 3, 1, 2, 1]),
 tensor([3, 3, 2, 1, 1]),
 tensor([0, 2, 1]),
 tensor([0, 2, 1]),
 tensor([5, 3, 2, 4, 2, 1, 3, 1, 2, 1]),
 tensor([3, 3, 2, 1, 1]),
 tensor([0, 2, 1]),
 tensor([0, 2, 1]))