## Install torch

In [0]:
!pip install -q torch

tcmalloc: large alloc 1073750016 bytes == 0x59368000 @  0x7ff2844d22a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641


## Load Pacgages

In [0]:
import numpy as np
from collections import Counter
import pickle as pkl
import random
import pdb
import pandas as pd
import string
import re
import unicodedata
import os
import time
import math

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset
from torch.autograd import Variable
from torch.nn import functional

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
%matplotlib inline

#specify SOS() and EOS(end of sentence)
#specify maximum vocabulary size = 50000
PAD_IDX = 2
UNK_IDX = 3
SOS_token = 0
EOS_token = 1
MAX_VOCAB_SIZE = 800000
MAX_LENGTH = 50

train_en = 'data/train.tok.en'
train_zh = 'data/train.tok.zh'
val_en = 'data/dev.tok.en'
val_zh = 'data/dev.tok.zh'

In [2]:
#user GPU if possible
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  print("Currently using GPU")

Currently using GPU


In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Load Embedding

In [0]:
folder_path = os.getcwd() + '/gdrive/My Drive/NLP_Project/'

In [0]:
import unicodedata
import re

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {"<SOS>":0, "<EOS>":1, "<PAD>":2, "<UNK>":3}
        self.word2count = {}
        self.index2word = {0: "<SOS>", 1: "<EOS>", 2:"<PAD>", 3:"<UNK>"}
        self.n_words = 4  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines_lang1 = open(folder_path+'data/train.tok.{}'.format(lang1), encoding='utf-8').\
        read().strip().split('\n')
    lines_lang2 = open(folder_path+'data/train.tok.{}'.format(lang2), encoding='utf-8').\
        read().strip().split('\n')
    
    assert (len(lines_lang1)==len(lines_lang2))
    # Split every line into pairs and normalize
    pairs = [[lines_lang1[i], normalizeString(lines_lang2[i])] for i in range (len(lines_lang1))]
    #print (pairs[-1])
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [0]:
def load_emb_matrix(language):
    #load fasttext word vectors
    words_to_load = MAX_VOCAB_SIZE
    if language == 'english':
      file = 'wiki-news-300d-1M-subword.vec'
    if language == 'chinese':
      file = 'cc.zh.300.vec'
    

    with open(folder_path + 'data/' + file) as f:
        #remove the first line
        firstLine = f.readline()
        loaded_embeddings = np.zeros((words_to_load + 4, 300))
        words2id = {}
        idx2words = {}
        #ordered_words = []
        for i, line in enumerate(f):
            if i >= words_to_load: 
                break
            s = line.split()
            loaded_embeddings[i + 4 , :] = np.asarray(s[1:])
            words2id['<SOS>'] = SOS_token
            words2id['<EOS>'] = EOS_token
            words2id['<pad>'] = PAD_IDX
            words2id['<unk>'] = UNK_IDX
            words2id[s[0]] = i + 4
            
            idx2words[0] = '<SOS>'
            idx2words[1] = '<EOD>'
            idx2words[2] = '<pad>'
            idx2words[3] = '<unk>'
            
            idx2words[i + 4] = s[0]
   

    return words2id,idx2words,loaded_embeddings

In [0]:
def generate_weights_matrix(index2word_lang, word2index_lang, index2word_embed, word2index_embed, loaded_embeddings):
    emb_dim=300
    missing_count=0
    matrix_len = len(index2word_lang)
    weights_matrix = np.zeros((matrix_len, 300))
    
    for key in index2word_lang.keys():
        word=index2word_lang[key]
        if (word in word2index_embed.keys()):
          weights_matrix[key] = loaded_embeddings[word2index_embed[word]]
        else:
          missing_count=missing_count+1
          weights_matrix[key] = np.random.normal(scale=0.6, size=(emb_dim, ))
    print (missing_count)
    return weights_matrix

In [8]:
train_input_lang, train_output_lang, train_pairs = prepareData("zh", "en", reverse=False)

Reading lines...
Read 213376 sentence pairs
Trimmed to 193446 sentence pairs
Counting words...
Counted words:
zh 79368
en 45471


In [0]:
# words2id_eng,idx2words_eng,loaded_embeddings_eng = load_emb_matrix('english')
# words2id_zh,idx2words_zh,loaded_embeddings_zh = load_emb_matrix('chinese')

# pkl.dump(words2id_eng, open(folder_path + 'data/words2id_eng_1M.pkl', 'wb'))
# pkl.dump(idx2words_eng, open(folder_path +'data/idx2words_eng_1M.pkl', 'wb'))
# pkl.dump(loaded_embeddings_eng, open(folder_path +'data/embedding_matrix_eng_1M.pkl', 'wb'))

# pkl.dump(words2id_zh, open(folder_path + 'data/words2id_zh_1M.pkl', 'wb'))
# pkl.dump(idx2words_zh, open(folder_path + 'data/idx2words_zh_1M.pkl', 'wb'))
# pkl.dump(loaded_embeddings_zh, open(folder_path +'data/embedding_matrix_zh_1M.pkl', 'wb'))

words2id_eng=pkl.load(open(folder_path + 'data/words2id_eng_1M.pkl', 'rb'))
idx2words_eng=pkl.load(open(folder_path +'data/idx2words_eng_1M.pkl', 'rb'))
loaded_embeddings_eng=pkl.load(open(folder_path +'data/embedding_matrix_eng_1M.pkl', 'rb'))

words2id_zh=pkl.load(open(folder_path + 'data/words2id_zh_1M.pkl', 'rb'))
idx2words_zh=pkl.load(open(folder_path + 'data/idx2words_zh_1M.pkl', 'rb'))
loaded_embeddings_zh=pkl.load(open(folder_path +'data/embedding_matrix_zh_1M.pkl', 'rb'))

# #load embeding matrix
# words2id_eng = pkl.load(open(folder_path + 'data/words2id_eng.pkl', 'rb'))
# idx2words_eng = pkl.load(open(folder_path +'data/idx2words_eng.pkl', 'rb'))
# loaded_embeddings_eng= pkl.load(open(folder_path +'data/embedding_matrix_eng.pkl', 'rb'))


# words2id_zh = pkl.load(open(folder_path + 'data/words2id_zh.pkl', 'rb'))
# idx2words_zh = pkl.load(open(folder_path +'data/idx2words_zh.pkl', 'rb'))
# loaded_embeddings_zh= pkl.load(open(folder_path +'data/embedding_matrix_zh.pkl', 'rb'))

In [0]:
weights_matrix_eng=generate_weights_matrix(train_output_lang.index2word, train_output_lang.word2index, idx2words_eng, words2id_eng, loaded_embeddings_eng)
weights_matrix_eng = torch.from_numpy(weights_matrix_eng).to(device)


weights_matrix_zh=generate_weights_matrix(train_input_lang.index2word, train_input_lang.word2index, idx2words_zh, words2id_zh, loaded_embeddings_zh) 
weights_matrix_zh = torch.from_numpy(weights_matrix_zh).to(device)

6387
19084


In [0]:
# pkl.dump(weights_matrix_eng, open(folder_path +'data/weights_matrix_eng_1M.pkl', 'wb'))
# pkl.dump(weights_matrix_zh, open(folder_path +'data/weights_matrix_zh_1M.pkl', 'wb'))

weights_matrix_eng=pkl.load(open(folder_path +'data/weights_matrix_eng_1M.pkl', 'rb'))
weights_matrix_zh=pkl.load(open(folder_path +'data/weights_matrix_zh_1M.pkl', 'rb'))

In [0]:
words2id_eng["they"]

74

In [0]:
weights_matrix_eng[177]

tensor([ 0.0096,  0.0142, -0.0165, -0.0160,  0.0348, -0.0028, -0.0091, -0.0407,
         0.0240,  0.0095, -0.0168, -0.0538, -0.0003, -0.0560,  0.0284,  0.0146,
         0.0532,  0.0161,  0.0490, -0.0022, -0.0056, -0.0035,  0.0023,  0.0190,
        -0.0233, -0.0374,  0.0185, -0.0271,  0.0885, -0.0025, -0.0349, -0.0071,
        -0.0144,  0.0123, -0.0258, -0.0204, -0.0158, -0.0176, -0.0319, -0.0306,
        -0.0172, -0.0500, -0.0072,  0.0092,  0.0276, -0.0190,  0.0216, -0.0176,
        -0.0389, -0.0024,  0.0283, -0.0001, -0.0173, -0.0021, -0.0721,  0.0124,
        -0.0067, -0.0123, -0.0375, -0.0157,  0.0125, -0.0097,  0.0590, -0.0182,
         0.0177, -0.0003, -0.0033, -0.0108, -0.0134,  0.0026,  0.0035,  0.0074,
         0.0413,  0.0033,  0.0232,  0.0017,  0.0468, -0.0035, -0.0158, -0.0082,
        -0.0057,  0.0004,  0.0038,  0.0365, -0.0175, -0.0076,  0.0123, -0.0137,
        -0.0067, -0.0055, -0.0076,  0.0203, -0.0616, -0.0454,  0.0277, -0.0569,
         0.0300,  0.0189,  0.0033, -0.00

In [0]:
loaded_embeddings_eng[74]

array([ 9.600e-03,  1.420e-02, -1.650e-02, -1.600e-02,  3.480e-02,
       -2.800e-03, -9.100e-03, -4.070e-02,  2.400e-02,  9.500e-03,
       -1.680e-02, -5.380e-02, -3.000e-04, -5.600e-02,  2.840e-02,
        1.460e-02,  5.320e-02,  1.610e-02,  4.900e-02, -2.200e-03,
       -5.600e-03, -3.500e-03,  2.300e-03,  1.900e-02, -2.330e-02,
       -3.740e-02,  1.850e-02, -2.710e-02,  8.850e-02, -2.500e-03,
       -3.490e-02, -7.100e-03, -1.440e-02,  1.230e-02, -2.580e-02,
       -2.040e-02, -1.580e-02, -1.760e-02, -3.190e-02, -3.060e-02,
       -1.720e-02, -5.000e-02, -7.200e-03,  9.200e-03,  2.760e-02,
       -1.900e-02,  2.160e-02, -1.760e-02, -3.890e-02, -2.400e-03,
        2.830e-02, -1.000e-04, -1.730e-02, -2.100e-03, -7.210e-02,
        1.240e-02, -6.700e-03, -1.230e-02, -3.750e-02, -1.570e-02,
        1.250e-02, -9.700e-03,  5.900e-02, -1.820e-02,  1.770e-02,
       -3.000e-04, -3.300e-03, -1.080e-02, -1.340e-02,  2.600e-03,
        3.500e-03,  7.400e-03,  4.130e-02,  3.300e-03,  2.320e

In [0]:
# def generate_weights_matrix(idx2words,loaded_embeddings):
   
#     matrix_len = len(idx2words)
#     weights_matrix = np.zeros((matrix_len, 300))
    
#     for key in idx2words.keys():
        
#         try: 
#             weights_matrix[key]
#             loaded_embeddings[key]
#             weights_matrix[key] = loaded_embeddings[key]
#         except KeyError:
#             weights_matrix[key] = np.random.normal(scale=0.6, size=(emb_dim, ))
#     return weights_matrix

In [0]:
# words2id_eng,idx2words_eng,loaded_embeddings_eng = load_emb_matrix('english')
# words2id_zh,idx2words_zh,loaded_embeddings_zh = load_emb_matrix('chinese')


In [0]:
# weights_matrix_eng=generate_weights_matrix(idx2words,loaded_embeddings_eng)

35574


In [0]:
# weights_matrix_eng.shape

torch.Size([79366, 300])

In [0]:

# words2id_eng = output_lang.word2index
# idx2words_eng= output_lang.index2word
# loaded_embeddings_eng= pkl.load(open(folder_path +'data/embedding_matrix_eng.pkl', 'rb'))
# words2id_zh = input_lang.word2index
# idx2words_zh= input_lang.index2word
# loaded_embeddings_zh= pkl.load(open(folder_path +'data/embedding_matrix_zh.pkl', 'rb'))
# weights_matrix_eng = generate_weights_matrix(idx2words_eng,loaded_embeddings_eng)
# weights_matrix_zh = generate_weights_matrix(idx2words_zh,loaded_embeddings_zh)

In [0]:
# #weights_matrix_eng = generate_weights_matrix(idx2words_eng,loaded_embeddings_eng)
# #pkl.dump(weights_matrix_eng, open(folder_path + 'data/weights_matrix_eng.pkl', 'wb'))

# weights_matrix_eng=pkl.load(open(folder_path + 'data/weights_matrix_eng.pkl', 'rb'))
# weights_matrix_eng = torch.from_numpy(weights_matrix_eng).to(device)

# #weights_matrix_zh = generate_weights_matrix(idx2words_zh,loaded_embeddings_zh)
# #pkl.dump(weights_matrix_zh, open(folder_path + 'data/weights_matrix_zh.pkl', 'wb'))

# weights_matrix_zh=pkl.load(open(folder_path + 'data/weights_matrix_zh.pkl', 'rb'))
# weights_matrix_zh = torch.from_numpy(weights_matrix_zh).to(device)

In [0]:

# #define a class of language
# class Language:
#     def __init__(self, name,word2index,index2word):
#         self.name = name
#         self.word2index = word2index
#         #self.word2count = {}
#         self.index2word = index2word
#         self.n_words = len(word2index)

In [0]:
# # Turn a Unicode string to plain ASCII, thanks to
# # http://stackoverflow.com/a/518232/2809427
# def unicodeToAscii(s):
#     return ''.join(
#         c for c in unicodedata.normalize('NFD', s)
#         if unicodedata.category(c) != 'Mn'
#     )

# # Lowercase, trim, and remove non-letter characters

# def normalizeString(s):
#     s = s.replace(r"&quot;","")
#     s = s.replace(r"&apos;","'")
#     s = unicodeToAscii(s.strip())
#     s = re.sub(r"([.!?])", r" \1", s)
#     s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
#     return s
  
  
# def filterPair(p):
#     return len(p[0].split(' ')) < MAX_LENGTH and \
#         len(p[1].split(' ')) < MAX_LENGTH


# def filterPairs(pairs):
#     return [pair for pair in pairs if filterPair(pair)]

In [0]:
# #To read the data file we will split the file into lines, and then split lines into pairs. 

# def readLanguages_sample(input_lang,target_lang):
#     print("\nReading lines...")

#     # Read the file and split into lines
#     input_lines = open(folder_path + input_lang, encoding='utf-8').\
#         read().strip().split('\n')
#     target_lines = open(folder_path + target_lang, encoding='utf-8').\
#         read().strip().split('\n')

#     # Split every line and normalize
#     #for chinese input, strip the space at the begining and end of the sentence
#     #for english output, use normalizeString function
#     sample_input = input_lines[:10000]
#     sample_target = target_lines[:10000]
    
#     input_lines_norm = [l.strip() for l in sample_input]
#     target_lines_norm = [normalizeString(l) for l in sample_target]
    
#     #build pairs
#     #drop pair if both zh and en are empty strings
#     pairs = [[item[0],item[1]] for item in zip(input_lines_norm,target_lines_norm) if len(item[0])+len(item[1]) != 0]
    
#     input_lines = Language("zh")
#     target_lines = Language("en")

#     return input_lines, target_lines, pairs

In [0]:
# #To read the data file we will split the file into lines, and then split lines into pairs. 

# def readLanguages(input_lang,target_lang):
#     print("\nReading lines...")

#     # Read the file and split into lines
#     input_lines = open(folder_path + input_lang, encoding='utf-8').\
#         read().strip().split('\n')
#     target_lines = open(folder_path + target_lang, encoding='utf-8').\
#         read().strip().split('\n')

#     # Split every line and normalize
#     #for chinese input, strip the space at the begining and end of the sentence
#     #for english output, use normalizeString function
#     input_lines_norm = [l.strip() for l in input_lines]
#     target_lines_norm = [normalizeString(l) for l in target_lines]
    
#     #build pairs
#     #drop pair if both zh and en are empty strings
#     pairs = [[item[0],item[1]] for item in zip(input_lines_norm,target_lines_norm) if len(item[0])+len(item[1]) != 0]
    
#     input_lines = Language("zh",words2id_zh,idx2words_zh)
#     target_lines = Language("en",words2id_eng,idx2words_eng)

#     return input_lines, target_lines, pairs

In [0]:
# def prepareData(input_lang, target_lang):
#     input_lang, output_lang, pairs = readLanguages(input_lang, target_lang)
#     print("Read %s sentence pairs" % len(pairs))
#     pairs = filterPairs(pairs)
#     print("Trimmed to %s sentence pairs" % len(pairs))
#     print("Counting words...")
# #     for pair in pairs:
# #         input_lang.addSentence(pair[0])
# #         output_lang.addSentence(pair[1])
#     print("Counted words:")
#     print(input_lang.name, input_lang.n_words)
#     print(output_lang.name, output_lang.n_words)
    
#     return input_lang, output_lang, pairs

In [0]:
# train_input_lang, train_output_lang, train_pairs = prepareData(train_zh, train_en)
# print("print a random pair of training pairs:")
# print(random.choice(train_pairs))



# # val_input_lang, val_output_lang, val_pairs = prepareData(val_zh, val_en)
# # print("print a random pair of validation pairs:")
# # print(random.choice(val_pairs))


# # pkl.dump(train_input, open(folder_path +'data/train_input.pkl', 'wb'))
# # pkl.dump(train_output, open(folder_path +'data/train_output.pkl', 'wb'))
# # pkl.dump(train_pairs, open(folder_path +'data/train_pairs.pkl', 'wb'))
# # pkl.dump(val_input, open(folder_path +'data/val_input.pkl', 'wb'))
# # pkl.dump(val_output, open(folder_path +'data/val_output.pkl', 'wb'))
# # pkl.dump(val_pairs, open(folder_path +'data/val_pairs.pkl', 'wb'))


Reading lines...
Read 213237 sentence pairs
Trimmed to 195215 sentence pairs
Counting words...
Counted words:
zh 100004
en 100004
print a random pair of training pairs:
['科技 让 巨大 的 创造 创造力 释放 释放出 释放出来 放出 出来', 'And this has unleashed tremendous energy .']


## Data Loader

In [0]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] if word in lang.word2index else UNK_IDX for word in sentence.split(' ')] + [EOS_token]

In [0]:
BATCH_SIZE = 64

class VocabDataset(Dataset):
    """
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, pairs,input_language, output_language):
        """
        @param pairs: pairs of input and target sentences(raw text sentences)
        @param input_language: Class Lang of input languages (zh in this case)
        @param output_language: Class Lang of output languages (en in this case)

        """
        self.pairs = pairs
        self.inputs = [pair[0] for pair in pairs]
        self.input_lang = input_language
        self.output_lang = output_language
        self.outputs = [pair[1] for pair in pairs]
        
        
        #assert self.input_lang == self.target_lang
       
    def __len__(self):
         return len(self.pairs)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        #turn raw text sentecens into indices
        input_ = indexesFromSentence(self.input_lang, self.inputs[key])
        output = indexesFromSentence(self.output_lang, self.outputs[key])
        #print (output)
        #print both the length of the source sequence and the target sequence
        return [input_,len(input_),output,len(output)]
    
    
    def __gettext__(self,key):
      return [self.inputs[key],self.outputs[key]]

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    input_data_list = []
    output_data_list = []
   
    
    for datum in batch:
      input_data_list.append(datum[0])
      output_data_list.append(datum[2])
      
      
    # Zip into pairs, sort by length (descending), unzip
    seq_pairs = sorted(zip(input_data_list, output_data_list), key=lambda p: len(p[0]), reverse=True)
    input_seqs, output_seqs = zip(*seq_pairs)
    
    #store the length of the sequences 
    input_data_len = [len(p) for p in input_seqs]
    output_data_len = [len(p) for p in output_seqs]
    
    #padding
    padded_vec_input = [np.pad(np.array(p),
                                 pad_width=((0,MAX_LENGTH-len(p))),
                                 mode="constant", constant_values=PAD_IDX) for p in input_seqs]
        
    padded_vec_output = [np.pad(np.array(p),
                                 pad_width=((0,MAX_LENGTH-len(p))),
                                 mode="constant", constant_values=PAD_IDX) for p in output_seqs]      
    
    
    input_var = Variable(torch.LongTensor(padded_vec_input))
    output_var = Variable(torch.LongTensor(padded_vec_output))
    input_data_len = Variable(torch.LongTensor(input_data_len))
    output_data_len = Variable(torch.LongTensor(output_data_len))
    
    
    return [input_var,input_data_len,output_var,output_data_len]

In [0]:
# Build train and valid dataloaders

train_dataset = VocabDataset(train_pairs,train_input_lang, train_output_lang)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True,
                                           drop_last = True)


# val_dataset = VocabDataset(val_pairs,val_input_lang,val_output_lang)
# val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=True)

In [0]:
EOS_token

1

In [0]:
train_dataset[0]

[[4, 5, 6, 7, 8, 8, 9, 8, 8, 10, 8, 1], 12, [4, 5, 6, 7, 8, 1], 6]

In [0]:
train_pairs[0]

['深海 海中 的 生命   大卫   盖罗 ', 'life in the deep oceans']

In [0]:
for i in train_loader:
  [input_var,input_data_len,output_var,output_data_len]=i
  break

In [0]:
input_var[2]

tensor([ 1300,     6,    12,     8,     8, 17908,    58,  1228,   251,  1252,
          351,    15,   241,     6,  1104,    74,  1927,   344,  4744,     8,
            8,   117,     8,     8,    15,    85,   828, 36616,   946, 23088,
        16654,     8,     8,  3180, 10683,    15,   355,   773,   659,   260,
          326,     8,     1,     2,     2,     2,     2,     2,     2,     2])

In [0]:
train_input_lang.index2word[960]

'玩耍'

In [0]:
for x in input_var[0]:
  print (train_input_lang.index2word[x.data.tolist()])

但是
与此
与此同时
同时
可能
有


你
不能
预测
的
变化


比如



飞机
需要
更多
的
跑道
去
起飞



因为
高温


不
稠密
的
空气


不能
提供
足够
的
升力


<EOS>
<PAD>
<PAD>
<PAD>


In [0]:
output_var[0]

tensor([18940,   959,   148,   388,   108, 17072,   959,   220,    15,   221,
          166,    19,   148,    14,    60,    96, 18930,  1139,    38,    52,
            6,   111,   388,   108,  3753,   133,  8401,  1854, 18959,  9982,
          121,    46,  2396,    96,    75,    15,    39,   231, 15262,     5,
            6,   347,    13,     1,     2,     2,     2,     2,     2,     2])

In [0]:
for x in output_var[0]:
  print (train_output_lang.index2word[x.data.tolist()])

ao
who
also
was
a
pianist
who
couldn
apos
t
see
and
also
i
think
like
derek
thought
that
all
the
world
was
a
piano
so
whenever
art
tatum
plays
something
it
sounds
like
there
apos
s
three
pianos
in
the
room
.
<EOS>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>


## Encoder

In [0]:
class EncoderRNN(nn.Module):
    def __init__(self, weights_matrix, input_size, hidden_size,n_layers=1):
        super(EncoderRNN, self).__init__()
     
        
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.n_layers = n_layers
        self.batch_size = BATCH_SIZE
        self.num_embeddings, self.embedding_dim = weights_matrix.size()
        
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
        self.embedding.from_pretrained(weights_matrix, freeze=False, sparse=False)
        #self.embedding.weight.requires_grad = True

        
        self.gru = nn.GRU(self.embedding_dim, hidden_size, n_layers, bidirectional=True)
        

    def forward(self, input_seqs, input_len, hidden=None):

       
        embedded = self.embedding(input_seqs)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_len)
        output, hidden = self.gru(packed, hidden)

        output, output_len = torch.nn.utils.rnn.pad_packed_sequence(output)
        output = output[:, :, :self.hidden_size] + output[:, : ,self.hidden_size:]
        
        return output,hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

## Decoder

In [0]:
class DecoderRNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, output_size,n_layers=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.batch_size = BATCH_SIZE
        self.num_embeddings, self.embedding_dim = weights_matrix.size()
        
        #self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
#         self.embedding.weight.data.copy_(weights_matrix)
#         self.embedding.weight.requires_grad = True
        #self.embedding.from_pretrained(weights_matrix, freeze=True, sparse=False)
        
        self.gru1 = nn.GRU(self.embedding_dim, hidden_size,n_layers)
        self.gru2 = nn.GRU(hidden_size, hidden_size,n_layers)
        
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_seq, hidden):
        
        embedded = self.embedding(input_seq) # dim = Batch_Size x embedding_dim
        embedded = embedded.view(1, self.batch_size, self.embedding_dim) # S=1 x Batch_Size x embedding_dim
        
        rnn_output, hidden = self.gru1(embedded, hidden)
        output = F.relu(rnn_output)
        
        output, hidden = self.gru2(output, hidden)
        output = self.softmax(self.out(output[0]))
        
        return output,hidden



    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size).to(device)

In [0]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, output_size, n_layers=1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.max_length = max_length
        self.batch_size = BATCH_SIZE
        self.num_embeddings, self.embedding_dim = weights_matrix.size()

        #self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
#         self.embedding.weight.data.copy_(weights_matrix)
#         self.embedding.weight.requires_grad = True
        #self.embedding.from_pretrained(weights_matrix, freeze=True, sparse=False)
  
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
       
        self.gru1 = nn.GRU(self.embedding_dim, hidden_size,n_layers)
        self.gru2 = nn.GRU(hidden_size, hidden_size,n_layers)
        
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input_seq, hidden, encoder_outputs):
        embedded = self.embedding(input_seq).view(1, 1, -1)
 

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size).to(device)

In [0]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))
     
    def forward(self, hidden, encoder_outputs):
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)

        # Create variable to store attention energies
        attn_energies = Variable(torch.zeros(this_batch_size, max_len)).to(device) # Batch_Size x Seq_Length

        
#         # For each batch of encoder outputs
        for b in range(this_batch_size):
            # Calculate energy for each encoder output
            for i in range(max_len):
                attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))

        # Normalize energies to weights in range 0 to 1, resize to 1 x B x S
        return F.softmax(attn_energies,dim=1).unsqueeze(1)
      
    def score(self, hidden, encoder_output):
        
        if self.method == 'dot':
#             print (hidden.shape)
#             print (encoder_output.shape)
            energy = hidden[0].dot(encoder_output[0])
            return energy
        
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = torch.mm(hidden[0], energy.transpose(0,1))
            return energy
        
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = self.v.dot(energy)
            return energy

In [0]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, weights_matrix, hidden_size, output_size, n_layers=1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.batch_size = BATCH_SIZE
        self.num_embeddings, self.embedding_dim = weights_matrix.size()


        # Define layers
        #self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
#         self.embedding.weight.data.copy_(weights_matrix)
#         self.embedding.weight.requires_grad = True
        self.embedding.from_pretrained(weights_matrix, freeze=False, sparse=False)
        
        
        self.gru1 = nn.GRU(self.embedding_dim, hidden_size,n_layers)
        self.gru2 = nn.GRU(hidden_size, hidden_size,n_layers)
        
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        # Note: we run this one step at a time

        # Get the embedding of the current input word (last output word)

        embedded = self.embedding(input_seq) # dim = Batch_Size x embedding_dim
        embedded = embedded.view(1, self.batch_size, self.embedding_dim) # S=1 x Batch_Size x embedding_dim

        # Get current hidden state from input word and last hidden state
        # rnn_output : [1 x batch_size x hidden_size]
        # hidden: [layer x batch_size x hidden_size]
        rnn_output, hidden = self.gru1(embedded, last_hidden)
        
        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x S=1 x N

        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
        context = context.squeeze(1)       # B x S=1 x N -> B x N
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))

        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(concat_output)

        #Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights
        #return attn_weights

In [0]:
class BahdanauAttnDecoderRNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, output_size, n_layers=1):
        super(BahdanauAttnDecoderRNN, self).__init__()
        
        # Define parameters
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.max_length = max_length
        self.batch_size = BATCH_SIZE
        self.num_embeddings, self.embedding_dim = weights_matrix.size()
        
        
        # Define layers
        #self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
#         self.embedding.weight.data.copy_(weights_matrix)
#         self.embedding.weight.requires_grad = True
        
        self.attn = Attn('concat', hidden_size)
        self.gru1 = nn.GRU(self.embedding_dim, hidden_size,n_layers)
        self.gru2 = nn.GRU(hidden_size, hidden_size,n_layers)
        self.out = nn.Linear(hidden_size, output_size)
    
    def forward(self, word_input, last_hidden, encoder_outputs):
        # Note: we run this one step at a time
        # TODO: FIX BATCHING
        
        # Get the embedding of the current input word (last output word)
        word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N
        
        # Calculate attention weights and apply to encoder outputs
        attn_weights = self.attn(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N
        context = context.transpose(0, 1) # 1 x B x N
        
        # Combine embedded input word and attended context, run through RNN
        rnn_input = torch.cat((word_embedded, context), 2)
        output, hidden = self.gru(rnn_input, last_hidden)
        
        # Final output layer
        output = output.squeeze(0) # B x N
        output = F.log_softmax(self.out(torch.cat((output, context), 1)),dim=1)
        
        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights

## Training Function

In [0]:
#record the run time
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [0]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

### Loss function

In [0]:
def sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = BATCH_SIZE
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = Variable(seq_range_expand)
    
    seq_range_expand = seq_range_expand.to(device)
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def masked_cross_entropy(logits, target, length):
    length = Variable(torch.LongTensor(length)).to(device)

    """
    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.
    Returns:
        loss: An average loss value masked by the length.
    """
    
    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = F.log_softmax(logits_flat,dim=1)
    
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum()
    return loss


In [0]:
#the train function is now taking a batch at a time
def train(input_batch, input_lengths, output_batch, output_lengths, encoder, decoder, encoder_optimizer, 
          decoder_optimizer, criterion, max_length=MAX_LENGTH, if_attention = True):
    
    encoder_outputs, encoder_hidden = encoder(input_batch, input_lengths, None)
  

    # Prepare decoder input and outputs
    decoder_input = Variable(torch.LongTensor([SOS_token] * BATCH_SIZE)).to(device)
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    all_decoder_outputs = Variable(torch.zeros(max_length, BATCH_SIZE, decoder.output_size)).to(device)
    
    # Run through decoder one time step at a time
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    # Teacher forcing: Feed the target as the next input
    if use_teacher_forcing:
        # Run through decoder one time step at a time
        for di in range(max_length):
            if if_attention == True:
                decoder_output, decoder_hidden, decoder_attn = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
            else:

                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
                
            all_decoder_outputs[di] = decoder_output # Store this step's outputs
            decoder_input = output_batch[di] # Next input is current target

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(max_length):
            if if_attention == True:
                decoder_output, decoder_hidden, decoder_attn = decoder(
                  decoder_input, decoder_hidden, encoder_outputs)
            else:

                decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
                
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            all_decoder_outputs[di] = decoder_output

    
    # Loss calculation and backpropagation
    loss = masked_cross_entropy(
            all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
            output_batch.transpose(0, 1).contiguous(), # -> batch x seq
            output_lengths)    

    loss.backward()
    ec = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
    
    encoder_optimizer.step()
    decoder_optimizer.step()
#     ec=0
#     dc=0


    return loss.item(), ec, dc

In [0]:
def trainIters(iters, criterion,  encoder, decoder, encoder_optimizer, decoder_optimizer, n_iters, loss_list, print_every=1000, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    loss_list=[]
    loss_avg=[]
#     encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
#     decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
    #iters = 0
  
    while iters <= n_iters:
      
      
      
      for i, (input_var,input_data_len,output_var,output_data_len) in enumerate(train_loader):
        print("Iteration:", iters)
        iters += 1
        input_batch = input_var.transpose(0,1).to(device)
        output_batch = output_var.transpose(0,1).to(device)
        
        loss, _, _ = train(input_batch,input_data_len,output_batch,output_data_len, encoder,
                       decoder, encoder_optimizer, decoder_optimizer, criterion)
        
        
        # Keep track of loss
        print_loss_total += loss
        plot_loss_total += loss
        
          
        print('Loss: %s (%d %d%%) %.4f' % (timeSince(start, iters / n_iters),
                                           iters, iters / n_iters * 100, loss))
        loss_list.append(loss)
        
        if iters % print_every == 0:
          
          
          
          print_loss_avg = print_loss_total/print_every
          print_loss_total = 0
          loss_avg.append(print_loss_avg)
          ##Learning Rate Decay
          if (len(loss_avg)!=1):
            loss_change  = loss_avg[-2]-loss_avg[-1]
            print ("loss_change: ", loss_change)
            if (loss_change < 0.05):
              
              print("Learning Rate Decays:")
              for param_group in encoder_optimizer.param_groups:
                
                param_group['lr'] = param_group['lr']*0.5
                print ("Current Encoder Learning Rate: {}". format (param_group['lr']))
              for param_group in decoder_optimizer.param_groups:
                
                param_group['lr'] = param_group['lr']*0.5
                print ("Current Decoder Learning Rate: {}". format (param_group['lr']))
                
              
          print('Average Loss: %s (%d %d%%) %.4f' % (timeSince(start, iters / n_iters),
                                           iters, iters / n_iters * 100, print_loss_avg))
        
          state = {'epoch': iters + 1, 'encoder_state_dict': encoder.state_dict(), 'decoder_state_dict': decoder.state_dict(),
             'encoder_optimizer': encoder_optimizer.state_dict(), 'decoder_optimizer': decoder_optimizer.state_dict(), "loss_list": loss_list, "loss_avg": loss_avg}
          
          torch.save(state, folder_path+"model_saved/Dec_3_state_{}.pt".format(iters))
        if iters % plot_every == 0:
         
          plot_loss_avg = plot_loss_total / plot_every
          plot_losses.append(plot_loss_avg)
          plot_loss_total = 0

    showPlot(plot_losses)

In [0]:
# Configure models
attn_model = 'dot'
hidden_size = 300
layers = 2
dropout = 0.1
batch_size = 64

# Configure training/optimization
clip = 50.0
start_epoch=0
teacher_forcing_ratio = 0.8
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iters = 100000
#def __init__(self, attn_model, weights_matrix, hidden_size, output_size, n_layers=1):
# def __init__(self, weights_matrix, input_size, hidden_size,n_layers=1):
# Initialize models
encoder = EncoderRNN(weights_matrix_zh, train_input_lang.n_words, hidden_size, n_layers = layers).to(device)
#decoder = DecoderRNN(weights_matrix_eng, hidden_size, train_output_lang.n_words, n_layers = layers).to(device)
decoder= LuongAttnDecoderRNN(attn_model, weights_matrix_eng, hidden_size, train_output_lang.n_words,n_layers = layers).to(device)
# Initialize optimizers and criterion
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
criterion = nn.CrossEntropyLoss()


## Load pretrained model

In [0]:
def load_checkpoint(encoder, decoder, encoder_optimizer, decoder_optimizer, iteration_num):
    # Note: Input model & optimizer should be pre-defined.  This routine only updates their states.
    folder_path = os.getcwd() + '/gdrive/My Drive/NLP_Project/'
    start_epoch = 0
    filename=folder_path+"model_saved/Dec_3_state_{}.pt".format(iteration_num)
    loss_list=[]
    if os.path.isfile(filename):
        print("=> loading checkpoint '{}'".format(iteration_num))
        checkpoint = torch.load(filename, map_location=device)
        start_epoch = checkpoint['epoch']
        #model.load_state_dict(checkpoint['state_dict'])
        encoder.load_state_dict(checkpoint["encoder_state_dict"])
        decoder.load_state_dict(checkpoint["decoder_state_dict"])
        encoder_optimizer.load_state_dict(checkpoint["encoder_optimizer"])
        decoder_optimizer.load_state_dict(checkpoint["decoder_optimizer"])
        loss_list=checkpoint["loss_list"]
        #optimizer.load_state_dict(checkpoint['optimizer'])
        #losslogger = checkpoint['losslogger']
        print("=> loaded checkpoint '{}' (epoch {})"
                  .format(filename, checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(filename))

    return start_epoch, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_list

In [0]:
start_epoch, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_list=\
load_checkpoint(encoder, decoder, encoder_optimizer, decoder_optimizer, 30)

=> loading checkpoint '30'
=> loaded checkpoint '/content/gdrive/My Drive/NLP_Project/model_saved/Dec_3_state_30.pt' (epoch 31)


In [0]:
loss_list

[10.577936172485352,
 10.52320671081543,
 10.361044883728027,
 10.199304580688477,
 10.046380996704102,
 9.688304901123047,
 9.563981056213379,
 9.311548233032227,
 8.995036125183105,
 8.709113121032715,
 8.350419998168945,
 8.1212158203125,
 7.880978584289551,
 7.6278791427612305,
 7.702040195465088,
 7.477540969848633,
 7.219671726226807,
 7.276883602142334,
 7.116346836090088,
 6.888298034667969,
 6.862778663635254,
 6.7902445793151855,
 6.750735282897949,
 6.8259453773498535,
 6.785239219665527,
 6.698254585266113,
 6.700711727142334,
 6.77982759475708,
 6.759353160858154,
 6.729983329772949]

## Start Training

In [0]:
learning_rate = 0.0001
decoder_learning_ratio = 5.0
loss_list=[]
#learning_rate=learning_rate*0.5
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
trainIters(start_epoch, criterion, encoder, decoder, encoder_optimizer, decoder_optimizer, n_iters, loss_list, print_every=5, plot_every=1000000)

Iteration: 0
Loss: 3m 59s (- 399765m 12s) (1 0%) 10.6917
Iteration: 1
Loss: 7m 49s (- 391577m 57s) (2 0%) 10.6401
Iteration: 2
Loss: 11m 29s (- 382973m 1s) (3 0%) 10.5777
Iteration: 3
Loss: 15m 22s (- 384411m 42s) (4 0%) 10.5019
Iteration: 4
Loss: 19m 15s (- 384985m 50s) (5 0%) 10.4159
Average Loss: 19m 15s (- 384985m 56s) (5 0%) 10.5655
Iteration: 5
Loss: 22m 59s (- 383071m 22s) (6 0%) 10.2324
Iteration: 6
Loss: 26m 45s (- 382258m 24s) (7 0%) 10.0173
Iteration: 7
Loss: 30m 43s (- 384013m 38s) (8 0%) 9.8241
Iteration: 8
Loss: 34m 31s (- 383526m 0s) (9 0%) 9.4007
Iteration: 9
Loss: 38m 23s (- 383915m 39s) (10 0%) 9.2575
loss_change:  0.8191112518310533
Average Loss: 38m 23s (- 383915m 52s) (10 0%) 9.7464
Iteration: 10
Loss: 42m 16s (- 384347m 18s) (11 0%) 9.0042
Iteration: 11
Loss: 46m 9s (- 384661m 20s) (12 0%) 8.7215
Iteration: 12
Loss: 49m 59s (- 384550m 20s) (13 0%) 8.3480
Iteration: 13
Loss: 53m 28s (- 381894m 35s) (14 0%) 8.1373
Iteration: 14
Loss: 57m 25s (- 382751m 38s) (15 0%) 

## Evalutation

In [0]:
def load_checkpoint(encoder, decoder, encoder_optimizer, decoder_optimizer, iteration_num):
    # Note: Input model & optimizer should be pre-defined.  This routine only updates their states.
    folder_path = os.getcwd() + '/gdrive/My Drive/NLP_Project/'
    start_epoch = 0
    filename=folder_path+"model_saved/state_{}.pt".format(iteration_num)
    if os.path.isfile(filename):
        print("=> loading checkpoint '{}'".format(iteration_num))
        checkpoint = torch.load(filename, map_location=device)
        start_epoch = checkpoint['epoch']
        #model.load_state_dict(checkpoint['state_dict'])
        encoder.load_state_dict(checkpoint["encoder_state_dict"])
        decoder.load_state_dict(checkpoint["decoder_state_dict"])
        encoder_optimizer.load_state_dict(checkpoint["encoder_optimizer"])
        decoder_optimizer.load_state_dict(checkpoint["decoder_optimizer"])
        #optimizer.load_state_dict(checkpoint['optimizer'])
        #losslogger = checkpoint['losslogger']
        print("=> loaded checkpoint '{}' (epoch {})"
                  .format(filename, checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(filename))

    return start_epoch, encoder, decoder, encoder_optimizer, decoder_optimizer

In [0]:
start_epoch, encoder, decoder, encoder_optimizer, decoder_optimizer=\
load_checkpoint(encoder, decoder, encoder_optimizer, decoder_optimizer, 190)

NameError: ignored

In [0]:
def evaluate_randomly(pairs, input_lang, output_lang):
    [input_sentence, target_sentence] = random.choice(pairs)
    evaluate_and_show_attention(input_sentence, input_lang, output_lang, target_sentence)

In [0]:
def show_attention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') + ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    show_plot_visdom()
    plt.show()
    plt.close()

In [0]:
def evaluate_and_show_attention(input_sentence, input_lang, output_lang, target_sentence=None):
    output_words, attentions = evaluate(input_sentence, input_lang, output_lang)
    output_sentence = ' '.join(output_words)
    print('>', input_sentence)
    if target_sentence is not None:
        print('=', target_sentence)
    print('<', output_sentence)
    
#     show_attention(input_sentence, output_words, attentions)
    
#     # Show input, target, output text in visdom
#     win = 'evaluted (%s)' % hostname
#     text = '<p>&gt; %s</p><p>= %s</p><p>&lt; %s</p>' % (input_sentence, target_sentence, output_sentence)
#     vis.text(text, win=win, opts={'title': win})

In [0]:
def indexes_from_sentence(lang, sentence):
    index_list = []
    for word in sentence.split(' '):
      if (word in lang.word2index.keys()):
        index = lang.word2index[word] 
      else:
        index = UNK_IDX
      index_list.append(index)
    
    return index_list + [EOS_token]

In [0]:
evaluate_randomly(train_pairs, train_input_lang, train_output_lang)

那 是 妳 的 一部 一部分 部分
tensor([[ 110],
        [   9],
        [8050],
        [   5],
        [   3],
        [6929],
        [ 286],
        [   1]])
[17]


  """


RuntimeError: ignored

In [0]:
MAX_LENGTH

30

In [0]:
def evaluate(input_seq, input_lang, output_lang, max_length=MAX_LENGTH):

    
    
    input_seqs = [indexesFromSentence(input_lang, input_seq)]
    
    input_lengths = [len(input_seq.split())]
    input_batches = Variable(torch.LongTensor(input_seqs), volatile=True).transpose(0, 1)
    
    input_batches = input_batches.to(device)

    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)
    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)
   
    

    # Create starting vectors for decoder
    decoder_input = Variable(torch.LongTensor([SOS_token]), volatile=True) # SOS
    print (encoder_hidden.shape)
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    
    decoder_input = decoder_input.to(device)

    # Store output words and attention states
    decoded_words = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)
    #return decoder_input, decoder_attentions
    # Run through decoder
#     print(decoder_input)
#     print(decoder_hidden.shape)
    
    for di in range(max_length):
      
        print(decoder_input)
        print(decoder_hidden.shape)
        
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        decoder_attentions[di,:decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data

        # Choose top word from output
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        print(ni.data.tolist())
        if ni == EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(output_lang.index2word[ni.data.tolist()])
            
        # Next input is chosen word
        decoder_input = Variable(torch.LongTensor([ni]))
        decoder_input = decoder_input.to(device)
    # Set back to training mode
    encoder.train(True)
    decoder.train(True)
    
    return decoded_words, decoder_attentions[:di+1, :len(encoder_outputs)]

In [0]:
for i in range(64):
  [input_sentence, target_sentence] = random.choice(train_pairs)
  input_sentence

In [0]:
#[input_sentence, target_sentence] = random.choice(train_pairs)
input_sentence="我 爱 你"
input_batches, input_lengths=evaluate(input_sentence, train_input_lang, train_output_lang, beam_size=3, max_length=MAX_LENGTH)

TypeError: ignored

In [0]:
def old_evaluate(input_seq, input_lang, output_lang, max_length=MAX_LENGTH):
    print (input_seq)
    input_lengths = [len(input_seq)]
    input_seqs = [indexes_from_sentence(input_lang, input_seq)]
    input_batches = Variable(torch.LongTensor(input_seqs), volatile=True).transpose(0, 1)
    
    input_batches = input_batches.to(device)
        
    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)
    print (input_batches)
    print (input_lengths)
    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)

    # Create starting vectors for decoder
    decoder_input = Variable(torch.LongTensor([SOS_token]), volatile=True) # SOS
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    
    decoder_input = decoder_input.to(device)

    # Store output words and attention states
    decoded_words = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)
    
    # Run through decoder
    for di in range(max_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        decoder_attentions[di,:decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data

        # Choose top word from output
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(output_lang.index2word[ni])
            
        # Next input is chosen word
        decoder_input = Variable(torch.LongTensor([ni]))
        if USE_CUDA: decoder_input = decoder_input.cuda()

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)
    
    return decoded_words, decoder_attentions[:di+1, :len(encoder_outputs)]

In [0]:
# input_seq = "你"
# input_lang = train_input_lang 
# output_lang = train_output_lang 
# max_length = 1
# beam_size = 10

def to_output_lang(output_list):
  result=[]
  for token_index in output_list:
    token=output_lang.index2word[token_index]
    result.append(token)
  return result

def evaluate_beam_search(encoder, decoder, input_seq, input_lang, output_lang, max_length=MAX_LENGTH, beam_size = 10):
  with torch.no_grad():

    input_seqs = [indexesFromSentence(input_lang, input_seq)]
    #print (input_seqs)
    input_lengths = [len(input_seq.split())]
    input_batches = Variable(torch.LongTensor(input_seqs)).transpose(0, 1)

    input_batches = input_batches.to(device)

    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)
    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)



    # Create starting vectors for decoder
    #decoder_input = Variable(torch.LongTensor([SOS_token])) # SOS
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder

    sequences=[[[SOS_token], 1.0]]
    decoder_attentions = torch.zeros(max_length, max_length)
    for di in range(max_length):

      for sequence in sequences:
        sequence_list, score = sequence
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        for word in sequence_list:
          #print ("word:", word)
          word = Variable(torch.LongTensor([word])).to(device)
          decoder_output, decoder_hidden, decoder_attention = decoder(
          word, decoder_hidden, encoder_outputs
        )
        
        #decoder_attentions[di,:decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data

        #print ("decoder_output.shape", decoder_output.shape)
        output_prob=F.softmax(decoder_output.data)
        topv, topi = output_prob.topk(30000)
        candidates=[]
        for i in range (30000):
          prob = topv[0][i].data.tolist()
          toekn = [topi[0][i].data.tolist()]
          candidates.append([sequence_list+toekn, prob*score])

        sequences=sorted(candidates, key=lambda tup: tup[1], reverse=True)[0:beam_size]
        sequences_word=[[to_output_lang(x[0]), x[1]] for x in sequences]
  return sequences_word
      

In [0]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] if word in lang.word2index else UNK_IDX for word in sentence.split(' ')] + [EOS_token]

In [0]:
input_seq = "我 讨厌 你"
input_lang = train_input_lang 
output_lang = train_output_lang 
max_length = 7
beam_size = 3
encoder.batch_size=1
decoder.batch_size=1
evaluate_beam_search(encoder, decoder, input_seq, input_lang, output_lang, max_length=5, beam_size = 3)



[[['<EOS>', 'they', 'they', 'they', 'they', 'a'], 1.3731330188136913e-05],
 [['<EOS>', 'they', 'they', 'they', 'they', '<PAD>'], 4.5850020912157005e-06],
 [['<EOS>', 'they', 'they', 'they', 'they', 'they'], 4.5684426458255714e-06]]

In [0]:
def evaluate_and_show_attention(input_sentence, input_lang, output_lang, target_sentence=None):
    print('>', input_sentence)
    
    if target_sentence is not None:
        print('=', target_sentence)
    
    sequences_word = evaluate_beam_search(encoder, decoder, input_sentence, input_lang, output_lang, max_length=10, beam_size = 4)
    
    output_sentence = ' '.join(sequences_word[0][0])

    print('<', output_sentence)
    
#     show_attention(input_sentence, output_words, attentions)
    
#     # Show input, target, output text in visdom
#     win = 'evaluted (%s)' % hostname
#     text = '<p>&gt; %s</p><p>= %s</p><p>&lt; %s</p>' % (input_sentence, target_sentence, output_sentence)
#     vis.text(text, win=win, opts={'title': win})

In [0]:
def evaluate_randomly(pairs, input_lang, output_lang):
    [input_sentence, target_sentence] = random.choice(pairs)
    evaluate_and_show_attention(input_sentence, input_lang, output_lang, target_sentence)

In [0]:
evaluate_randomly(train_pairs, train_input_lang, train_output_lang)

> 他们 会 变成 这样 的 消费 消费者    和 你 我 一样   一无所知 无所 所知 的 消费 消费者
= They d go back to being consumers clueless consumers like we are most of the time .




< <SOS> of of of of of of of of of a


In [0]:
import sacrebleu
from detok import detok
import numpy as np

def bleu(itos, translation_output, reference):
    '''
    Args:
        arg.vocab.itos: a list the match indices to string.
        translation_output: 2D tensor of tranlation output. shape: N x B
        reference: 1D list of reference sentences (words, not indices). len(reference) = B
    '''
    EN_ind2word = np.array(itos)
    detok_translation = detok(translation_output, EN_ind2word)
    bleu_score = sacrebleu.raw_corpus_bleu(detok_translation, [reference], .01).score
    

    return bleu_score

def bleu_epoch(itos, translation_outputs, reference):
    '''
    Args:
        trg.vocab.itos: a list the match indices to string.
        translation_output: 2D tensor of tranlation output. shape: N x B
        reference: 1D list of reference sentences (words, not indices). len(reference) = B
    '''
    EN_ind2word = np.array(itos)
    detok_translation = []
    for translation_output in translation_outputs:
        detok_translation.extend(detok(translation_output, EN_ind2word))
    bleu_score = sacrebleu.raw_corpus_bleu(detok_translation, [reference], .01).score
    

    return bleu_score


In [0]:
def tokenize_13a(line):
    """
    Tokenizes an input line using a relatively minimal tokenization that is however equivalent to mteval-v13a, used by WMT.
    :param line: a segment to tokenize
    :return: the tokenized line
    """

    norm = line

    # language-independent part:
    norm = norm.replace('<skipped>', '')
    norm = norm.replace('-\n', '')
    norm = norm.replace('\n', ' ')
    norm = norm.replace('&quot;', '"')
    norm = norm.replace('&amp;', '&')
    norm = norm.replace('&lt;', '<')
    norm = norm.replace('&gt;', '>')

    # language-dependent part (assuming Western languages):
    norm = " {} ".format(norm)
    norm = re.sub(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', ' \\1 ', norm)
    norm = re.sub(r'([^0-9])([\.,])', '\\1 \\2 ', norm)  # tokenize period and comma unless preceded by a digit
    norm = re.sub(r'([\.,])([^0-9])', ' \\1 \\2', norm)  # tokenize period and comma unless followed by a digit
    norm = re.sub(r'([0-9])(-)', '\\1 \\2 ', norm)  # tokenize dash when preceded by a digit
    norm = re.sub(r'\s+', ' ', norm)  # one space only between words
    norm = re.sub(r'^\s+', '', norm)  # no leading space
    norm = re.sub(r'\s+$', '', norm)  # no trailing space

    return norm
TOKENIZERS = {
    '13a': tokenize_13a,

}
DEFAULT_TOKENIZER = '13a'

def corpus_bleu(sys_stream, ref_streams, smooth='exp', smooth_floor=0.0, force=False, lowercase=False,
                tokenize=DEFAULT_TOKENIZER, use_effective_order=False) -> BLEU:
    """Produces BLEU scores along with its sufficient statistics from a source against one or more references.
    :param sys_stream: The system stream (a sequence of segments)
    :param ref_streams: A list of one or more reference streams (each a sequence of segments)
    :param smooth: The smoothing method to use
    :param smooth_floor: For 'floor' smoothing, the floor to use
    :param force: Ignore data that looks already tokenized
    :param lowercase: Lowercase the data
    :param tokenize: The tokenizer to use
    :return: a BLEU object containing everything you'd want
    """

    # Add some robustness to the input arguments
    if isinstance(sys_stream, str):
        sys_stream = [sys_stream]
    if isinstance(ref_streams, str):
        ref_streams = [[ref_streams]]

    sys_len = 0
    ref_len = 0

    correct = [0 for n in range(NGRAM_ORDER)]
    total = [0 for n in range(NGRAM_ORDER)]

    # look for already-tokenized sentences
    tokenized_count = 0

    fhs = [sys_stream] + ref_streams
    for lines in zip_longest(*fhs):
        if None in lines:
            raise EOFError("Source and reference streams have different lengths!")

        if lowercase:
            lines = [x.lower() for x in lines]

        if not (force or tokenize == 'none') and lines[0].rstrip().endswith(' .'):
            tokenized_count += 1

            if tokenized_count == 100:
                logging.warning('That\'s 100 lines that end in a tokenized period (\'.\')')
                logging.warning('It looks like you forgot to detokenize your test data, which may hurt your score.')
                logging.warning('If you insist your data is detokenized, or don\'t care, you can suppress this message with \'--force\'.')

        output, *refs = [TOKENIZERS[tokenize](x.rstrip()) for x in lines]

        ref_ngrams, closest_diff, closest_len = ref_stats(output, refs)

        sys_len += len(output.split())
        ref_len += closest_len

        sys_ngrams = extract_ngrams(output)
        for ngram in sys_ngrams.keys():
            n = len(ngram.split())
            correct[n-1] += min(sys_ngrams[ngram], ref_ngrams.get(ngram, 0))
            total[n-1] += sys_ngrams[ngram]

    return compute_bleu(correct, total, sys_len, ref_len, smooth, smooth_floor, use_effective_order)

NameError: ignored

In [0]:

#The BLEU score consists of two parts, modified precision and brevity penalty. Details can be seen in the paper. You can use the nltk.align.bleu_score module inside the NLTK. One code example can be seen as below:

import nltk

hypothesis = ['It', 'is', 'a', 'cat', 'at', 'room']
reference = ['It', 'is', 'a', 'cat', 'inside', 'the', 'room']
#there may be several references
BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis)
print (BLEUscore)

0.4548019047027907
0.816496580927726


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
