In [1]:
import unicodedata
import string
import re
import random
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import numpy as np

# Use the GPU only when one is actually present instead of assuming it.
USE_CUDA = torch.cuda.is_available()
# Reserved vocabulary indices; they mirror the keys of Lang.default_vocab.
SOS_token = 0
EOS_token = 1
UNK_token = 2
# Maximum vocabulary size (reserved tokens included) passed to Lang.trim_vocab.
MAX_VOCAB_DIM = 50000
class Lang:
    """Vocabulary of one language: word <-> index maps plus per-word counts.

    Indices 0, 1 and 2 are reserved for the "<SOS>", "<EOS>" and "<UNK>"
    tokens; regular words are numbered from 3 upwards.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.default_vocab = {0: "<SOS>", 1: "<EOS>", 2: '<UNK>'}
        self.index2word = self.default_vocab.copy()
        # Regular words start right after the reserved tokens.
        self.n_words = len(self.default_vocab)

    def index_words(self, sentence):
        """Register every space-separated token of `sentence`."""
        for word in sentence.split(' '):
            self.index_word(word)

    def index_word(self, word):
        """Add `word` with the next free index, or bump its count if already known."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def trim_vocab(self, max_vocab_dim):
        """Keep only the most frequent words and renumber them by descending count.

        After the call the vocabulary holds at most `max_vocab_dim` entries,
        reserved tokens included; `word2index`, `index2word`, `word2count`
        and `n_words` are rebuilt accordingly.
        """
        # Compare against the reserved token strings (the dict keys are ints).
        reserved = set(self.default_vocab.values())
        entries = [(word, count) for word, count in self.word2count.items()
                   if word not in reserved]
        # A negative budget would turn the slice below into "all but the last k".
        budget = max(0, max_vocab_dim - len(self.default_vocab))

        self.index2word = self.default_vocab.copy()
        self.word2index = {token: index for index, token in self.index2word.items()}

        if entries and budget:
            counts = np.array([count for _, count in entries], dtype=np.int32)
            # Same argsort + flip as before, so words with equal counts keep the
            # numbering that previously built vocabularies had.
            order = np.flip(np.argsort(counts), axis=0)
            next_index = len(self.default_vocab)
            for position in order[:budget]:
                word = entries[position][0]
                self.index2word[next_index] = word
                self.word2index[word] = next_index
                next_index += 1

        self.n_words = len(self.index2word)
        self.word2count = {k: v for k, v in self.word2count.items() if k in self.word2index}
            
                
            

def normalize_string(s):
    """Normalise one raw sentence so it can be tokenised on single spaces.

    Steps: strip surrounding whitespace, put a space in front of every
    '.', '!' or '?', then collapse each run of characters other than CJK
    ideographs (U+4E00-U+9FFF), ASCII letters, digits and '.!?' into one space.

    NOTE: the replacement used to be the non-raw literal u" \\1", i.e. the
    control character U+0001, which the second substitution then removed
    together with the punctuation. Vocabularies built with that behaviour
    differ from the fixed one whenever the corpus contains '.', '!' or '?'.
    """
    s = s.strip()
    # Raw string so that \1 is a back-reference to the matched punctuation mark.
    s = re.sub(u"([.!?])", r" \1", s)
    s = re.sub(u"[^\u4e00-\u9fffa-zA-Z.!?0-9]+", r" ", s)
    return s
def read_langs(lang1, lang2, reverse=False):
    """Load the tab-separated corpus '../data/<lang1>-<lang2>.txt'.

    Each line is split on tabs and every field is passed through
    normalize_string. With `reverse=True` the fields of every line are
    reversed and the two Lang objects swap names.

    Returns (input_lang, output_lang, pairs); the Lang objects are freshly
    created and contain no words yet.
    """
    import io  # local import: unicode-aware open() on both Python 2 and 3

    print("Reading lines...")

    # Read the file as UTF-8 text; the context manager closes the handle.
    # newline='' disables newline translation so lines are split on '\n' only.
    with io.open('../data/%s-%s.txt' % (lang1, lang2), encoding='utf8', newline='') as corpus:
        lines = corpus.read().strip().split('\n')

    # Split every line into its fields and normalize them
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
# Token-count limit: filter_pair keeps a pair only when both sides have fewer
# than MAX_LENGTH space-separated tokens. Also the default max_length of evaluate.
MAX_LENGTH = 60

# English sentence prefixes; only referenced by the commented-out filter
# inside filter_pair, so currently unused.
good_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re "
)

def filter_pair(p):
    """Return True when both sides of the pair have fewer than MAX_LENGTH tokens.

    Tokens are counted by splitting on single spaces. The second element is
    only inspected when the first one passes.
    """
    if not len(p[0].split(' ')) < MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

def filter_pairs(pairs):
    """Return a new list holding only the pairs accepted by filter_pair."""
    return list(filter(filter_pair, pairs))
def prepare_data(lang1_name, lang2_name, reverse=False):
    """Read the corpus, drop over-long pairs and index the remaining words.

    Returns (input_lang, output_lang, pairs), where the first element of
    each kept pair has been indexed into input_lang and the second into
    output_lang.
    """
    source_vocab, target_vocab, sentence_pairs = read_langs(lang1_name, lang2_name, reverse)
    print("Read %s sentence pairs" % len(sentence_pairs))

    sentence_pairs = filter_pairs(sentence_pairs)
    print("Trimmed to %s sentence pairs" % len(sentence_pairs))

    print("Indexing words...")
    for entry in sentence_pairs:
        source_vocab.index_words(entry[0])
        target_vocab.index_words(entry[1])

    return source_vocab, target_vocab, sentence_pairs

# input_lang, output_lang, pairs = prepare_data('eng', 'fra', True)
# input_lang, output_lang, pairs = prepare_data('eng', 'cmn', True)
# Build both vocabularies from ../data/r1-r2.txt with the pair order reversed.
input_lang, output_lang, pairs = prepare_data('r1', 'r2', True)
# Cap both vocabularies at MAX_VOCAB_DIM entries.
input_lang.trim_vocab(MAX_VOCAB_DIM)
output_lang.trim_vocab(MAX_VOCAB_DIM)
# Print an example pair
print(random.choice(pairs))
a, b = random.choice(pairs)
# Function-call form of print: consistent with the rest of the notebook and
# valid on both Python 2 and Python 3.
print(a)
print(b)
# Map every space-separated token of a sentence to its vocabulary index
def indexes_from_sentence(lang, sentence):
    """Return one index per token; tokens absent from lang.word2index map to UNK_token."""
    vocabulary = lang.word2index
    return [vocabulary.get(token, UNK_token) for token in sentence.split(' ')]

def variable_from_sentence(lang, sentence):
    """Encode `sentence` as a column of token indices terminated by EOS_token.

    The result is a LongTensor Variable reshaped to (n_tokens + 1, 1); it is
    moved to the GPU when USE_CUDA is set.
    """
    token_ids = indexes_from_sentence(lang, sentence)
    token_ids.append(EOS_token)
    column = Variable(torch.LongTensor(token_ids).view(-1, 1))
    return column.cuda() if USE_CUDA else column

def variables_from_pair(pair):
    """Encode pair[0] with input_lang and pair[1] with output_lang; return both as a tuple."""
    source_column = variable_from_sentence(input_lang, pair[0])
    target_column = variable_from_sentence(output_lang, pair[1])
    return source_column, target_column

Reading lines...
Read 651339 sentence pairs
Trimmed to 651293 sentence pairs
Indexing words...
[u'SOS \u4f60 \u7ed9 \u6211 \u7684 \u7231 \u4e5f\u8bb8 \u4e0d \u5b8c\u7f8e EOS c d a NOP ei NOE 3 NOR', u'SOS \u4f46 \u5374 \u6700\u7f8e EOS']
SOS 承受 之后 更 相畏 EOS v r v ul r r d v NOP ui NOE 8 NOR
SOS 合 我 选择 了 你 我 从不 后悔 EOS


# Attention models

In [9]:
class EncoderRNN(nn.Module):
    """Token embedding followed by a GRU that encodes a whole input sequence at once."""

    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()

        # Hyper-parameters kept for later reference
        self.input_size, self.hidden_size, self.n_layers = input_size, hidden_size, n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)

    def forward(self, word_inputs, hidden):
        """Run the GRU over the full sequence (batch size 1).

        Returns (output, hidden) exactly as produced by the GRU.
        """
        steps = len(word_inputs)
        # One time step per token: (steps, 1, hidden_size)
        gru_input = self.embedding(word_inputs).view(steps, 1, -1)
        return self.gru(gru_input, hidden)

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape (n_layers, 1, hidden_size)."""
        zeros = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))
        return zeros.cuda() if USE_CUDA else zeros
class Attn(nn.Module):
    """Attention over a sequence of encoder states (batch size 1).

    `method` selects how one decoder state is scored against one encoder state:
      'dot'     - dot product of the two vectors
      'general' - decoder state times a linear map of the encoder state
      'concat'  - learned vector `other` dotted with a linear map of the
                  concatenation [decoder state; encoder state]

    `max_length` is accepted for interface compatibility and is not used.
    """

    def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
        super(Attn, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(1, n) on its own is uninitialised memory, so
            # fill it explicitly with small uniform values.
            bound = 1.0 / (hidden_size ** 0.5)
            self.other = nn.Parameter(torch.FloatTensor(1, hidden_size).uniform_(-bound, bound))

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights of shape (1, 1, seq_len).

        `hidden` is the current decoder state and `encoder_outputs` is indexed
        along its first dimension, one entry per source position.
        """
        seq_len = len(encoder_outputs)

        # Energy buffer on the same device and dtype as the encoder outputs
        attn_energies = encoder_outputs.new_zeros(seq_len)

        # Calculate energies for each encoder output
        for i in range(seq_len):
            attn_energies[i] = self.score(hidden, encoder_outputs[i])

        # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
        return F.softmax(attn_energies, dim=-1).unsqueeze(0).unsqueeze(0)

    def score(self, hidden, encoder_output):
        """Return a one-element tensor with the energy of one encoder state.

        Raises ValueError when `self.method` is not 'dot', 'general' or 'concat'.
        """
        if self.method == 'dot':
            # torch.dot only accepts 1-D tensors, so flatten both operands.
            return hidden.reshape(-1).dot(encoder_output.reshape(-1))

        if self.method == 'general':
            energy = self.attn(encoder_output)
            return torch.mm(hidden, energy.transpose(0, 1))

        if self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            return self.other.reshape(-1).dot(energy.reshape(-1))

        raise ValueError("unknown attention method: %r" % (self.method,))
        
class AttnDecoderRNN(nn.Module):
    """GRU decoder that emits one token per call, attending over the encoder outputs.

    NOTE: `self.attn` is only created when attn_model != 'none', while
    forward always calls it.
    """

    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()

        # Remember the configuration
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p

        # The GRU consumes [token embedding; previous context] and the output
        # projection consumes [GRU output; context], hence the doubled width.
        doubled = hidden_size * 2
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(doubled, hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(doubled, output_size)

        # Attention scorer, built only when a scoring method is requested
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        """Decode a single step.

        Returns (log-probabilities over the vocabulary, new context vector,
        new GRU hidden state, attention weights).
        """
        # Embed the previous output token as a length-1 sequence with batch size 1
        embedded_token = self.embedding(word_input).view(1, 1, -1)

        # Feed the token together with the previous context through the GRU
        gru_in = torch.cat((embedded_token, last_context.unsqueeze(0)), 2)
        gru_out, hidden = self.gru(gru_in, last_hidden)
        gru_out = gru_out.squeeze(0)

        # Attention weights over the encoder outputs, then their weighted sum
        attn_weights = self.attn(gru_out, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)

        # Predict the next token from the GRU output and the context vector
        logits = self.out(torch.cat((gru_out, context), 1))
        return F.log_softmax(logits, dim=-1), context, hidden, attn_weights

def evaluate(sentence, max_length=MAX_LENGTH):
    """Greedily decode `sentence` with the global `encoder` and `decoder`.

    At every step the best-scoring of the top-3 candidates that is not
    UNK_token is chosen. Decoding stops at EOS_token or after `max_length`
    steps. Returns (decoded_words, None).
    """
    source = variable_from_sentence(input_lang, sentence)

    # Encode the whole source sequence
    encoder_outputs, encoder_hidden = encoder(source, encoder.init_hidden())

    # The decoder starts from SOS with an all-zero context vector ...
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
    if USE_CUDA:
        decoder_input, decoder_context = decoder_input.cuda(), decoder_context.cuda()
    # ... and the encoder's final hidden state
    decoder_hidden = encoder_hidden

    decoded_words = []
    for _ in range(max_length):
        decoder_output, decoder_context, decoder_hidden, _attention = decoder(
            decoder_input, decoder_context, decoder_hidden, encoder_outputs)

        # Walk the three best candidates and stop at the first one that is not <UNK>
        _, candidates = decoder_output.data.topk(3)
        for rank in range(3):
            ni = candidates[0][rank]
            if ni != UNK_token:
                break

        if ni == EOS_token:
            decoded_words.append('<EOS>')
            break
        decoded_words.append(output_lang.index2word[ni.item()])

        # The chosen token becomes the next decoder input
        decoder_input = Variable(torch.LongTensor([[ni]]))
        if USE_CUDA:
            decoder_input = decoder_input.cuda()

    return decoded_words, None
# Restore the saved encoder and decoder objects of the GRU attention model.
# NOTE(review): torch.load unpickles arbitrary objects - only load checkpoint
# files that come from a trusted source.
encoder = torch.load('./models/encoder-light.pl')
decoder = torch.load('./models/decoder-light.pl')
# Put both modules into evaluation mode before decoding.
encoder.eval()
decoder.eval()


AttnDecoderRNN(
  (embedding): Embedding(50000, 128)
  (gru): GRU(256, 128, dropout=0.05)
  (out): Linear(in_features=256, out_features=50000, bias=True)
  (attn): Attn(
    (attn): Linear(in_features=128, out_features=128, bias=True)
  )
)

# Attention Linear models

In [2]:
class EncoderRNN(nn.Module):
    """Encoder of the "linear attention" model.

    Token embeddings are summed with cached positional encodings and run
    through a GRU. forward returns the position-augmented embeddings (not the
    GRU outputs) together with the final GRU hidden state.

    NOTE(review): `positional_encoding(d_model, position)` is not defined in
    the visible part of this notebook; it must already exist in the kernel
    before this class is instantiated or the cache has to grow.
    """

    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.d_model = hidden_size
        self.init_pos_mat(MAX_LENGTH*2)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)

    def forward(self, word_inputs, hidden):
        """Return (embedded, hidden) for the whole input sequence (batch size 1)."""
        # Note: we run this all at once (over the whole input sequence)
        seq_len = len(word_inputs)
        embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
        embedded = embedded + self.get_pos_mat(seq_len).view(seq_len, 1, -1)
        # Only the final hidden state of the GRU is used; its outputs are dropped.
        _, hidden = self.gru(embedded, hidden)
        return embedded, hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape (n_layers, 1, hidden_size)."""
        hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))
        if USE_CUDA: hidden = hidden.cuda()
        return hidden

    def init_pos_mat(self, cache_length):
        """Pre-compute the positional encodings for positions 0..cache_length-1."""
        print('init postional matrix with length : %d ' % cache_length)
        matrix = torch.cat([positional_encoding(self.d_model, i) for i in range(0, cache_length)], dim=0)
        matrix.requires_grad = False
        # Follow the same device switch as init_hidden instead of always using the GPU.
        if USE_CUDA: matrix = matrix.cuda()
        self.positional_matrix = matrix

    def get_pos_mat(self, length):
        """Return the encodings of the first `length` positions, growing the cache if needed."""
        if length > self.positional_matrix.shape[0]:
            print('input sequence length reach positional matrix maximum length. %d ' % length)
            grown = torch.cat([positional_encoding(self.d_model, i) for i in range(length)], dim=0)
            grown.requires_grad = False
            print('Increase positional matrix maximum length. %d ' % length)
            # Keep the new cache on the device of the one it replaces and return
            # that same tensor (the CPU copy used to be returned by mistake).
            self.positional_matrix = grown.to(self.positional_matrix.device)
        return self.positional_matrix[:length]
class Attn(nn.Module):
    """Attention over a sequence of encoder states (batch size 1).

    `method` selects how one decoder state is scored against one encoder state:
      'dot'     - dot product of the two vectors
      'general' - decoder state times a linear map of the encoder state
      'concat'  - learned vector `other` dotted with a linear map of the
                  concatenation [decoder state; encoder state]

    `max_length` is accepted for interface compatibility and is not used.
    """

    def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
        super(Attn, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(1, n) on its own is uninitialised memory, so
            # fill it explicitly with small uniform values.
            bound = 1.0 / (hidden_size ** 0.5)
            self.other = nn.Parameter(torch.FloatTensor(1, hidden_size).uniform_(-bound, bound))

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights of shape (1, 1, seq_len).

        `hidden` is the current decoder state and `encoder_outputs` is indexed
        along its first dimension, one entry per attended position.
        """
        seq_len = len(encoder_outputs)

        # Energy buffer on the same device and dtype as the encoder outputs
        attn_energies = encoder_outputs.new_zeros(seq_len)

        # Calculate energies for each encoder output
        for i in range(seq_len):
            attn_energies[i] = self.score(hidden, encoder_outputs[i])

        # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
        return F.softmax(attn_energies, dim=-1).unsqueeze(0).unsqueeze(0)

    def score(self, hidden, encoder_output):
        """Return a one-element tensor with the energy of one encoder state.

        Raises ValueError when `self.method` is not 'dot', 'general' or 'concat'.
        """
        if self.method == 'dot':
            # torch.dot only accepts 1-D tensors, so flatten both operands.
            return hidden.reshape(-1).dot(encoder_output.reshape(-1))

        if self.method == 'general':
            energy = self.attn(encoder_output)
            return torch.mm(hidden, energy.transpose(0, 1))

        if self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            return self.other.reshape(-1).dot(energy.reshape(-1))

        raise ValueError("unknown attention method: %r" % (self.method,))

class LinearAttnDecoderRNN(nn.Module):
    """Decoder of the "linear attention" model.

    Each call attends over the encoder embeddings concatenated with the
    embeddings of all tokens decoded so far, and predicts the next token from
    the resulting context vector and the embedding of the latest token.

    NOTE(review): `positional_encoding(d_model, position)` is not defined in
    the visible part of this notebook; it must already exist in the kernel
    before this class is instantiated or the cache has to grow.
    NOTE: `self.attn` is only created when attn_model != 'none', while
    forward always calls it.
    """

    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(LinearAttnDecoderRNN, self).__init__()

        self.d_model = hidden_size
        self.init_pos_mat(MAX_LENGTH*2)
        # Keep parameters for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p

        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(hidden_size * 2, output_size)

        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, last_context, last_hidden, encoder_emb, decoder_seq):
        """Decode one step.

        `decoder_seq` is a 2-D tensor of all decoder tokens so far (one per
        row); its last row is the current input token.

        Returns (log-probabilities over the vocabulary, new context vector,
        new GRU hidden state, attention weights over encoder + decoder positions).
        """
        seq_len, _ = decoder_seq.size()
        # Embed every decoded token and add its positional encoding
        decoder_emb = self.embedding(decoder_seq).view(seq_len, 1, -1) + self.get_pos_mat(seq_len).view(seq_len, 1, -1)
        # The most recent token drives this step
        word_embedded = decoder_emb[-1, :].view(1, 1, -1)

        # Attention spans the encoder embeddings followed by the decoder embeddings
        seq_embedded = torch.cat([encoder_emb, decoder_emb], dim=0)
        rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), dim=2)
        rnn_output, hidden = self.gru(rnn_input, last_hidden)

        # Calculate attention from the current RNN state and apply it to the embeddings
        attn_weights = self.attn(rnn_output.squeeze(0), seq_embedded)
        context = attn_weights.bmm(seq_embedded.transpose(0, 1))  # B x 1 x N

        word_embedded = word_embedded.squeeze(0)  # S=1 x B x N -> B x N
        context = context.squeeze(1)              # B x S=1 x N -> B x N

        # Next-word prediction from the context vector and the current token embedding
        output = F.log_softmax(self.out(torch.cat([context, word_embedded], dim=-1)), dim=-1)

        return output, context, hidden, attn_weights

    # The positional encodings are cached in a matrix to avoid recomputing them.
    def init_pos_mat(self, cache_length):
        """Pre-compute the positional encodings for positions 0..cache_length-1."""
        print('init postional matrix with length : %d ' % cache_length)
        matrix = torch.cat([positional_encoding(self.d_model, i) for i in range(0, cache_length)], dim=0)
        matrix.requires_grad = False
        # Respect USE_CUDA instead of always moving the cache to the GPU.
        if USE_CUDA: matrix = matrix.cuda()
        self.positional_matrix = matrix

    def get_pos_mat(self, length):
        """Return the encodings of the first `length` positions, growing the cache if needed."""
        if length > self.positional_matrix.shape[0]:
            print('input sequence length reach positional matrix maximum length. %d ' % length)
            grown = torch.cat([positional_encoding(self.d_model, i) for i in range(length)], dim=0)
            grown.requires_grad = False
            print('Increase positional matrix maximum length. %d ' % length)
            # Keep the new cache on the device of the one it replaces and return
            # that same tensor (the CPU copy used to be returned by mistake).
            self.positional_matrix = grown.to(self.positional_matrix.device)
        return self.positional_matrix[:length]
def evaluate(sentence, max_length=MAX_LENGTH):
    """Greedily decode `sentence` with the global linear-attention `encoder`/`decoder`.

    At every step the best-scoring of the top-3 candidates that is not
    UNK_token is chosen; decoding stops at EOS_token or after `max_length`
    steps.

    Returns (decoded_words, attentions). Row i of `attentions` holds the
    attention weights of step i over the input positions followed by the
    decoder positions available at that step.
    """
    input_variable = variable_from_sentence(input_lang, sentence)
    input_length = input_variable.size()[0]

    # Run through encoder. variable_from_sentence and init_hidden already
    # honour USE_CUDA, so no unconditional .cuda() is needed here.
    encoder_hidden = encoder.init_hidden()
    encoder_embs, encoder_hidden = encoder(input_variable, encoder_hidden)

    # Create starting vectors for decoder
    decoder_input = decoder_seq = Variable(torch.LongTensor([[SOS_token]]))  # SOS
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
    if USE_CUDA:
        decoder_input = decoder_seq = decoder_input.cuda()
        decoder_context = decoder_context.cuda()

    decoder_hidden = encoder_hidden

    decoded_words = []
    # Step di attends over input_length + di + 1 positions, so the widest row
    # needs input_length + max_length columns (2 * max_length overflowed for
    # inputs longer than max_length tokens).
    decoder_attentions = torch.zeros(max(max_length, 0), input_length + max(max_length, 0))

    # Keeps the final slice well-defined when the loop body never runs.
    di = -1

    # Run through decoder
    for di in range(max_length):
        decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_context, decoder_hidden, encoder_embs, decoder_seq)
        decoder_attentions[di, :decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data

        # Choose the best candidate among the top 3 that is not <UNK>
        topv, topi = decoder_output.data.topk(3)
        for i in range(3):
            ni = topi[0][i]
            if ni != UNK_token:
                break
        if ni == EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(output_lang.index2word[ni.item()])

        # Next input is the chosen word, appended to the decoded sequence
        decoder_input = Variable(torch.LongTensor([[ni]]))
        if USE_CUDA: decoder_input = decoder_input.cuda()
        decoder_seq = torch.cat([decoder_seq, decoder_input], dim=0)

    return decoded_words, decoder_attentions[:di+1, :len(encoder_embs)+di+1]
# Restore the saved encoder and decoder objects of the linear-attention model.
# NOTE(review): torch.load unpickles arbitrary objects - only load checkpoint
# files that come from a trusted source.
encoder = torch.load('./models/encoder-linear.pl')
decoder = torch.load('./models/decoder-linear.pl')
# Put both modules into evaluation mode before decoding.
encoder.eval()
decoder.eval()

LinearAttnDecoderRNN(
  (embedding): Embedding(50000, 128)
  (gru): GRU(256, 128, num_layers=2, dropout=0.05)
  (out): Linear(in_features=256, out_features=50000, bias=True)
  (attn): Attn(
    (attn): Linear(in_features=128, out_features=128, bias=True)
  )
)

### Testing

In [13]:
from tqdm import tqdm

def testing(sentence):
        
    output_words = evaluate(sentence)[0]
    s = ''
    for y in output_words:
        s += y + ' '
    return s.replace('<EOS>','').replace('SOS','').replace('EOS','').strip()
import io  # unicode-aware open() on both Python 2 and 3

path = '../hw3_1/all/test.csv'
out_path = '../result_light.csv'

# Read every test sentence as UTF-8 text.
with io.open(path, 'r', encoding='utf8') as f:
    ls = f.readlines()

# Decode each sentence without tracking gradients; one result per output line.
with io.open(out_path, 'w', encoding='utf8') as f:
    with torch.no_grad():
        for l in tqdm(ls):
            l = l.strip()
            # u'\n' keeps the written value a text string on Python 2 as well.
            s = testing(l) + u'\n'
            f.write(s)
print('done')
s = testing(u"SOS 密密麻麻 是 我 的 自尊 EOS v m n NOP en NOE 3 NOR")


100%|██████████| 70000/70000 [17:27<00:00, 66.81it/s] 

done





In [9]:
# from tqdm import tqdm

# c = 0
# def testing(sentence):
        
#     while True:
#         try:
#             output_words = evaluate(sentence)
#             break
#         except KeyError,e:
#             global c
#             c += 1
#             sub = input_lang.index2word[np.random.randint(input_lang.n_words)]
#             sentence = sentence.replace(e.message,sub)
#     s = ''
#     for y in output_words:
#         s += y + ' '
#     return s.replace('<EOS>','').replace('SOS','').replace('EOS','').strip()
# path = '../hw3_1/all/test.csv'
# out_path = '../result_linear.csv'
# with open(path,'r') as f:
#     ls = f.readlines()

# with open(out_path,'w') as f:
#     with torch.no_grad():
    
#         with tqdm(total=len(ls)) as pbar:
#             for l in ls:
#                 l = l.decode('utf8').strip()
#                 s = testing(l)+'\n'
#                 f.write(s.encode('utf8'))
#                 pbar.update(1)
# print 'done', c
# s = testing(u"SOS 所以 我 好 愿意 EOS v r l NOP i NOE 3 NOR")


  0%|          | 0/70000 [00:00<?, ?it/s]


TypeError: can only concatenate list (not "str") to list