## Embedding

### Sparse representation

In [1]:
import pandas as pd
class2 = pd.read_csv('../chap10/data/class2.csv')

from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
onehot_encoder = preprocessing.OneHotEncoder()

train_x = label_encoder.fit_transform(class2['class2'])
train_x

array([2, 2, 1, 0, 1, 0])

### Counter vector

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['This is last chance.',
         'and if you do not have this chance.',
         'you will never get any chance.',
         'will you do get this one?',
         'please, get this chance'
         ]

vect = CountVectorizer()
vect.fit(corpus)
vect.vocabulary_

{'this': 13,
 'is': 7,
 'last': 8,
 'chance': 2,
 'and': 0,
 'if': 6,
 'you': 15,
 'do': 3,
 'not': 10,
 'have': 5,
 'will': 14,
 'never': 9,
 'get': 4,
 'any': 1,
 'one': 11,
 'please': 12}

In [3]:
vect.transform(['you will never get any chance']).toarray()

array([[0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [4]:
vect = CountVectorizer(stop_words=["and","is","please","this"]).fit(corpus)
vect.vocabulary_

{'last': 6,
 'chance': 1,
 'if': 5,
 'you': 11,
 'do': 2,
 'not': 8,
 'have': 4,
 'will': 10,
 'never': 7,
 'get': 3,
 'any': 0,
 'one': 9}

### TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

doc = ['I like machine learning', 'I love deep learning', 'I run everything']
tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(doc)
doc_distance = (tfidf_matrix*tfidf_matrix.T)
print('유사도를 위한', str(doc_distance.get_shape()[0]), 'x', str(doc_distance.get_shape()[1]), '행렬을 만들었습니다')
print(doc_distance.toarray())

유사도를 위한 3 x 3 행렬을 만들었습니다
[[1.       0.224325 0.      ]
 [0.224325 1.       0.      ]
 [0.       0.       1.      ]]


### Word2Vec

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings(action='ignore')
import gensim
from gensim.models import Word2Vec

sample = open('../chap10/data/peter.txt','r', encoding='UTF8')
s = sample.read()

f =s.replace('\n', '')
data = []

for i in sent_tokenize(f):
    temp = []
    
    for j in word_tokenize(i):
        temp.append(j.lower())
        
    data.append(temp)
    
data

[['once',
  'upon',
  'a',
  'time',
  'in',
  'london',
  ',',
  'the',
  'darlings',
  'went',
  'out',
  'to',
  'a',
  'dinner',
  'party',
  'leaving',
  'their',
  'three',
  'children',
  'wendy',
  ',',
  'jhon',
  ',',
  'and',
  'michael',
  'at',
  'home',
  '.'],
 ['after',
  'wendy',
  'had',
  'tucked',
  'her',
  'younger',
  'brothers',
  'jhon',
  'and',
  'michael',
  'to',
  'bed',
  ',',
  'she',
  'went',
  'to',
  'read',
  'a',
  'book',
  '.'],
 ['she', 'heard', 'a', 'boy', 'sobbing', 'outside', 'her', 'window', '.'],
 ['he', 'was', 'flying', '.'],
 ['there', 'was', 'little', 'fairy', 'fluttering', 'around', 'him', '.'],
 ['wendy',
  'opened',
  'the',
  'window',
  'to',
  'talk',
  'to',
  'him.',
  '“',
  'hello',
  '!'],
 ['who', 'are', 'you', '?'],
 ['why', 'are', 'you', 'crying', '”', ',', 'wendy', 'asked', 'him', '.'],
 ['“', 'my', 'name', 'is', 'peter', 'pan', '.'],
 ['my',
  'shadow',
  'wouldn',
  '’',
  't',
  'stock',
  'to',
  'me.',
  '”',
  ',',
 

### CBOW

In [7]:
model1 = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=0)
print("Cosine similarity between 'peter' " + "'wendy' - CBOW: ", model1.wv.similarity('peter', 'wendy'))

Cosine similarity between 'peter' 'wendy' - CBOW:  -0.09961489


In [8]:
print("Cosine similarity between 'peter' " + "'hook' - CBOW: ", model1.wv.similarity('peter', 'hook'))

Cosine similarity between 'peter' 'hook' - CBOW:  0.029662758


### Skip-gram

In [9]:
model2 = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)
print("Cosine similarity between 'peter' " + "'wendy' - CBOW: ", model2.wv.similarity('peter', 'wendy'))

Cosine similarity between 'peter' 'wendy' - CBOW:  0.2726038


In [10]:
print("Cosine similarity between 'peter' " + "'hook' - CBOW: ", model2.wv.similarity('peter', 'hook'))

Cosine similarity between 'peter' 'hook' - CBOW:  0.4720227


### FastText

In [11]:
from gensim.test.utils import common_texts
from gensim.models import FastText

model = FastText('../chap10/data/peter.txt', vector_size=4, window=3, min_count=1, epochs=10)

In [12]:
sim_score = model.wv.similarity('peter', 'wendy')
print(sim_score)

0.4592452


In [13]:
sim_score = model.wv.similarity('peter', 'hook')
print(sim_score)

0.043825716


In [14]:
from __future__ import print_function
from gensim.models import KeyedVectors

model_kr = KeyedVectors.load_word2vec_format('../chap10/data/wiki.ko.vec')

In [15]:
find_similar_to = '노력'

for similar_word in model_kr.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(similar_word[0],similar_word[1]))

Word: 노력함, Similarity: 0.80
Word: 노력중, Similarity: 0.75
Word: 노력만, Similarity: 0.72
Word: 노력과, Similarity: 0.71
Word: 노력의, Similarity: 0.69
Word: 노력가, Similarity: 0.69
Word: 노력이나, Similarity: 0.69
Word: 노력없이, Similarity: 0.68
Word: 노력맨, Similarity: 0.68
Word: 노력보다는, Similarity: 0.68


In [16]:
similarities = model_kr.most_similar(positive=['동물', '육식동물'], negative=['사람'])
print(similarities)

[('초식동물', 0.7804121971130371), ('거대동물', 0.7547270059585571), ('육식동물의', 0.7547166347503662), ('유두동물', 0.753511369228363), ('반추동물', 0.7470757961273193), ('독동물', 0.7466291785240173), ('육상동물', 0.7460315823554993), ('유즐동물', 0.7450904250144958), ('극피동물', 0.7449344396591187), ('복모동물', 0.742434561252594)]


### Glove

In [17]:
# import numpy as np
# %matplotlib notebook
# import matplotlib.pyplot as plt
# plt.style.use('ggplot')
# from sklearn.decomposition import PCA
# from gensim.test.utils import datapath, get_tmpfile
# from gensim.models import KeyedVectors
# from gensim.scripts.glove2word2vec import glove2word2vec

# glove_file = datapath('../chap10/data/glove.6B.100d.txt')
# word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
# glove2word2vec(glove_file, word2vec_glove_file)

In [18]:
# model = KeyedVectors.load_word2vec_format(word2vec_glove_file)
# model.most_similar('bill')

In [19]:
# model.most_similar('cherry') 

In [20]:
# model.most_similar(negative='cherry')

In [21]:
# result = model.most_similar(positive=['woman', 'king'], negative=['man'])
# print("{}: {:.4f}".format(*result[0]))

In [22]:
# def analogy(x1, x2, y1):
#     result = model.most_similar(positive=[y1, x2], negative=[x1])
#     return result[0][0]
# analogy('australia', 'beer', 'france')

In [23]:
# analogy('tall', 'tallest', 'long')

In [24]:
# print(model.doesnt_match("breakfast cereal dinner lunch".split()))

## Transformer Attention

### seq2seq

In [25]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd

import os
import re
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 20

class Lang:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1:"EOS"}
        self.n_words = 2
        
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
            
    def addWord(self,word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words +=1
        else:
            self.word2count[word] += 1

In [27]:
def normalizeString(df, lang):
    sentence = df[lang].str.lower()
    sentence = sentence.str.replace('[^A-Za-z\s]+', ' ')
    sentence = sentence.str.normalize('NFD')
    sentence = sentence.str.encode('ascii', errors='ignore').str.decode('utf-8')
    return sentence

def read_sentence(df, lang1, lang2):
    sentence1 = normalizeString(df, lang1)
    sentence2 = normalizeString(df, lang2)
    return sentence1, sentence2

def read_file(loc, lang1, lang2):
    df = pd.read_csv(loc, delimiter='\t', header=None, names=[lang1,lang2])
    return df

def process_data(lang1, lang2):
    df = read_file('../chap10/data/%s-%s.txt' % (lang1, lang2), lang1, lang2)
    sentence1, sentence2 = read_sentence(df, lang1, lang2)
    
    input_lang = Lang()
    output_lang = Lang()
    pairs = []
    
    for i in range(len(df)):
        if len(sentence1[i].split(' ')) < MAX_LENGTH and len(sentence2[i].split(' ')) < MAX_LENGTH :
            full = [sentence1[i], sentence2[i]]
            input_lang.addSentence(sentence1[i])
            output_lang.addSentence(sentence2[i])
            pairs.append(full)
    
    return input_lang, output_lang, pairs

In [28]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(input_lang, output_lang, pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [29]:
class Encoder(nn.Module):
    def __init__(self,input_dim, hidden_dim, embedded_dim, num_layers):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.embedded_dim = embedded_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_dim, self.embedded_dim)
        self.gru = nn.GRU(self.embedded_dim, self.hidden_dim, num_layers=self.num_layers)
        
    def forward(self,src):
        embedded = self.embedding(src).view(1,1,-1)
        outputs, hidden = self.gru(embedded)
        return outputs, hidden

In [30]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, embedded_dim, num_layers):
        super(Decoder, self).__init__()
        
        self.embedded_dim = embedded_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(output_dim, self.embedded_dim)
        self.gru = nn.GRU(self.embedded_dim, self.hidden_dim, num_layers=self.num_layers)
        self.out = nn.Linear(self.hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        input = input.view(1,-1)
        embedded = F.relu(self.embedding(input))
        output, hidden = self.gru(embedded, hidden)
        prediction = self.softmax(self.out(output[0]))
        return prediction, hidden

In [31]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, MAX_LENGTH=MAX_LENGTH):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, input_lang, output_lang, teacher_forcing_ratio=0.5):
        input_length = input_lang.size(0)
        batch_size = output_lang.shape[1]
        target_length = output_lang.shape[0]
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device)
        
        for i in range(input_length):
            encoder_output, encoder_hidden = self.encoder(input_lang[i])
            
        decoder_hidden = encoder_hidden.to(device)
        decoder_input = torch.tensor([SOS_token], device=device)
        
        for t in range(target_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            outputs[t] = decoder_output
            teacher_force = random.random() < teacher_forcing_ratio
            topv, topi = decoder_output.topk(1)
            input = (output_lang[t] if teacher_force else topi)
            if (teacher_force==False and input.item() == EOS_token):
                break
        return outputs

In [32]:
teacher_forcing_ratio = 0.5

def Model(model, input_tensor, target_tensor, model_optimizer, criterion):
    model_optimizer.zero_grad()
    input_length = input_tensor.size(0)
    loss = 0
    epoch_loss = 0
    output = model(input_tensor, target_tensor)
    num_iter = output.size(0)
    
    for ot in range(num_iter):
        loss += criterion(output[ot], target_tensor[ot])
        
    loss.backward()
    model_optimizer.step()
    epoch_loss = loss.item()/num_iter
    return epoch_loss

In [33]:
def trainModel(model, input_lang, output_lang, pairs, num_iteration=20000):
    model.train()
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = nn.NLLLoss()
    total_loss_iterations = 0
    
    training_pairs = [tensorsFromPair(input_lang, output_lang, random.choice(pairs)) for i in range(num_iteration)]
    
    for iter in range(1, num_iteration+1):
        training_pair = training_pairs[iter-1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = Model(model, input_tensor, target_tensor, optimizer, criterion)
        total_loss_iterations += loss
        
        if iter % 5000 == 0 :
            average_loss = total_loss_iterations / 5000
            total_loss_iterations = 0
            print('%d %.4f' % (iter, average_loss))
            
    torch.save(model.state_dict(), '../chap10/data/mytraining.pt')
    return model

In [34]:
def evaluate(model, input_lang, output_lang, sentences, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentences[0])
        output_tensor = tensorFromSentence(output_lang, sentences[1])
        decoded_words = []
        output = model(input_tensor, output_tensor)
        
        for ot in range(output.size(0)):
            topv, topi = output[ot].topk(1)
            
            if topi[0].item() == EOS_token:
                decoded_words.append('<EOS>')
                break
                
            else:
                decoded_words.append(output_lang.index2word[topi[0].item()])
                
        return decoded_words
    
def evaluateRandomly(model, input_lang, output_lang, pairs, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('input {}'.format(pair[0]))
        print('output {}'.format(pair[1]))
        output_words = evaluate(model, input_lang, output_lang, pair)
        output_sentence = ' '.join(output_words)
        print('predicted {}'.format(output_sentence))

In [35]:
lang1 = 'eng'
lang2 = 'fra'
input_lang, output_lang, pairs = process_data(lang1, lang2)

randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

input_size = input_lang.n_words
output_size = output_lang.n_words
print('Input: {} Output: {}'.format(input_size, output_size))

embed_size = 256
hidden_size = 512
num_layers = 1
num_iteration = 75000

encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)
model = Seq2Seq(encoder, decoder, device).to(device)

print(encoder)
print(decoder)

model = trainModel(model, input_lang, output_lang, pairs, num_iteration)

random sentence ["what's your favorite toothpaste?", 'quel est ton dentifrice prefere?']
Input: 23191 Output: 39387
Encoder(
  (embedding): Embedding(23191, 256)
  (gru): GRU(256, 512)
)
Decoder(
  (embedding): Embedding(39387, 256)
  (gru): GRU(256, 512)
  (out): Linear(in_features=512, out_features=39387, bias=True)
  (softmax): LogSoftmax(dim=1)
)
5000 5.0229
10000 4.7813
15000 4.7174
20000 4.6468
25000 4.6773
30000 4.6822
35000 4.6521
40000 4.6321
45000 4.6376
50000 4.5727
55000 4.5468
60000 4.5109
65000 4.5607
70000 4.5636
75000 4.4995


In [36]:
evaluateRandomly(model, input_lang, output_lang, pairs)

input tom takes everything too seriously.
output tom prend tout trop au serieux.
predicted je ne pas pas <EOS>
input you're not upset, are you?
output vous n'etes pas contrariee, si ?
predicted je ne pas pas <EOS>
input he seems to think so.
output il semble penser cela.
predicted je ne pas pas <EOS>
input you screwed up.
output vous avez merde.
predicted je ne pas pas
input i saw you there.
output je vous y ai vu.
predicted je ne pas pas <EOS>
input do you love your country?
output aimes-tu ton pays ?
predicted je ne pas pas <EOS>
input i love this time of year.
output j'adore cette periode de l'annee.
predicted je ne pas pas <EOS>
input he still hasn't returned the book he borrowed from the library.
output il n'a toujours pas ramene le livre qu'il avait emprunte a la bibliotheque.
predicted je ne pas pas <EOS>
input we're not together anymore.
output nous ne sommes plus ensemble.
predicted je ne pas pas <EOS>
input i'm impressed with your french.
output je suis impressionne par votre

In [37]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size*2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size*2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1,1,-1)
        embedded = self.dropout(embedded)
        
        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]),1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        
        output = torch.cat((embedded[0], attn_applied[0]),1)
        output = self.attn_combine(output).unsqueeze(0)
        
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

In [38]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.1):
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0
    
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(input_lang, output_lang, random.choice(pairs)) for i in range(n_iters)]
    criterion = nn.NLLLoss()
    
    for iter in range(1,n_iters+1):
        training_pair = training_pairs[iter-1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = Model(model,input_tensor,target_tensor,decoder_optimizer,criterion)
        print_loss_total += loss
        plot_loss_total += loss
        
        if iter % 5000 == 0 :
            print_loss_avg = print_loss_total/5000
            print_loss_total = 0
            print('%d, %.4f' % (iter, print_loss_avg))

In [39]:
import time

embed_size = 256
hidden_size = 512
num_layers = 1
input_size = input_lang.n_words
output_size = output_lang.n_words

encoder1 = Encoder(input_size, hidden_size, embed_size, num_layers)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_size, dropout_p=0.1).to(device)

print(encoder1)
print(attn_decoder1)

attn_model = trainIters(encoder1, attn_decoder1, 75000, print_every=5000, plot_every=100, learning_rate=0.01)

Encoder(
  (embedding): Embedding(23191, 256)
  (gru): GRU(256, 512)
)
AttnDecoderRNN(
  (embedding): Embedding(39387, 512)
  (attn): Linear(in_features=1024, out_features=20, bias=True)
  (attn_combine): Linear(in_features=1024, out_features=512, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(512, 512)
  (out): Linear(in_features=512, out_features=39387, bias=True)
)
5000, 4.9056
10000, 4.9521
15000, 4.9512
20000, 4.9380
25000, 4.9142
30000, 4.9488
35000, 4.9594
40000, 4.9212
45000, 4.9606
50000, 4.9800
55000, 4.9668
60000, 4.9376
65000, 4.9260
70000, 4.9326
75000, 4.9193


### Bert

In [40]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [41]:
train_df = pd.read_csv('../chap10/data/training.txt', sep='\t')
valid_df = pd.read_csv('../chap10/data/validing.txt', sep='\t')
test_df = pd.read_csv('../chap10/data/testing.txt', sep='\t')

In [42]:
train_df = train_df.sample(frac=0.1, random_state=500)
valid_df = valid_df.sample(frac=0.1, random_state=500)
test_df = test_df.sample(frac=0.1, random_state=500)

In [43]:
class Datasets(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 1]
        label = self.df.iloc[idx, 2]
        return text, label

In [44]:
train_dataset = Datasets(train_df)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, num_workers=0)

valid_dataset = Datasets(valid_df)
valid_loader = DataLoader(valid_dataset, batch_size=2, shuffle=True, num_workers=0)

test_dataset = Datasets(test_df)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=True, num_workers=0)

In [45]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [46]:
def save_checkpoint(save_path, model, valid_loss):
    if save_path == None:
        return    
    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):    
    if load_path==None:
        return    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']

def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    if save_path == None:
        return    
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_metrics(load_path):
    if load_path==None:
        return    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')    
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [47]:
def train(model,optimizer, criterion=nn.BCELoss(), num_epochs=5, eval_every=len(train_loader)//2, best_valid_loss=float("Inf")):
    total_correct = 0.0
    total_len = 0.0
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []
    
    model.train()
    
    for epoch in range(num_epochs):
        for text, label in train_loader:
            optimizer.zero_grad()
            encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
            padded_list = [e + [0]*(512-len(e)) for e in encoded_list]
            
            sample = torch.tensor(padded_list)
            sample, label = sample.to(device), label.to(device)
            labels = torch.tensor(label)
            outputs = model(sample, labels=labels)
            loss, logits = outputs
            
            pred = torch.argmax(F.softmax(logits), dim=1)
            correct = pred.eq(labels)
            total_correct += correct.sum().item()
            total_len += len(labels)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()
            global_step += 1
            
            if global_step % eval_every == 0 :
                model.eval()
                with torch.no_grad():
                    for text, label in valid_loader:
                        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
                        padded_list = [e + [0]*(512-len(e)) for e in encoded_list]
                        sample = torch.tensor(padded_list)
                        sample, label = sample.to(device), label.to(device)
                        labels = torch.tensor(label)
                        outputs = model(sample, labels=labels)
                        loss, logits = outputs
                        valid_running_loss += loss.item()
                    
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)
                
                running_loss = 0.0
                valid_running_loss = 0.0
                model.train()
                
                print('Epoch [{}/{}], Step[{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'.format\
                      (epoch+1, num_epochs, global_step, num_epochs*len(train_loader), average_train_loss, average_valid_loss))
                
                if best_valid_loss > average_valid_loss :
                    best_valid_loss = average_valid_loss
                    save_checkpoint('../chap10/data/model.pt', model, best_valid_loss)
                    save_metrics('../chap10/data/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    save_metrics('../chap10/data/metrics.pt', train_loss_list, valid_loss+list, global_steps_list)
    print('훈련 종료!')

In [48]:
optimizer = optim.Adam(model.parameters(), lr = 2e-5)
train(model=model, optimizer=optimizer)

Epoch [1/5], Step[510/5100], Train Loss: 0.6999, Valid Loss: 0.6929
Model saved to ==> ../chap10/data/model.pt
Model saved to ==> ../chap10/data/metrics.pt
Epoch [1/5], Step[1020/5100], Train Loss: 0.7054, Valid Loss: 0.6996
Epoch [2/5], Step[1530/5100], Train Loss: 0.7043, Valid Loss: 0.7592
Epoch [2/5], Step[2040/5100], Train Loss: 0.7077, Valid Loss: 0.6938
Epoch [3/5], Step[2550/5100], Train Loss: 0.7072, Valid Loss: 0.6942
Epoch [3/5], Step[3060/5100], Train Loss: 0.7116, Valid Loss: 0.6987
Epoch [4/5], Step[3570/5100], Train Loss: 0.7095, Valid Loss: 0.7048
Epoch [4/5], Step[4080/5100], Train Loss: 0.7001, Valid Loss: 0.6926
Model saved to ==> ../chap10/data/model.pt
Model saved to ==> ../chap10/data/metrics.pt
Epoch [5/5], Step[4590/5100], Train Loss: 0.7000, Valid Loss: 0.6965
Epoch [5/5], Step[5100/5100], Train Loss: 0.7049, Valid Loss: 0.6926


NameError: name 'valid_loss' is not defined

In [None]:
train_loss_list, valid_loss_list, global_steps_list = load_metrics('../chap10/data/metrics.pt')
plt.plot(global_steps_list, train_loss_list, label='Train')
plt.plot(global_steps_list, valid_loss_list, label='Valid')
plt.xlabel('Global Steps')
plt.ylabel('Loss')
plt.legend()
plt.show() 

In [None]:
def evaluate(model, test_loader):
    y_pred = []
    y_true = []
    
    model.eval()
    with torch.no_grad():
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        padded_list = [e+[0]*(512-len(e)) for e in encoded_list]
        sample = torch.tensor(padded_list)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        output = model(sample, labels=labels)
        _, output = output
        y_pred.extend(torch.argmax(output,1).tolist())
        y_ture.extend(labels.tolist())
        
    print('Classification 결과:')
    print(classification_report(y_true, y_pred, labels=[1,0],digits=4))
    
    cm = confusion_matrix(y_true, y_pred, labels=[1,0], digits=4)
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt='d')
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    ax.xaxis.set_ticklabels(['0', '1'])
    ax.yaxis.set_ticklabels(['0', '1'])

In [None]:
best_model = model.to(device)
load_checkpoint('../chap10/data/model.pt', best_model)
evaluate(best_model, test_loader)

## Korean embedding

In [None]:
import torch
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [None]:
text = "나는 파이토치를 이용한 딥러닝을 학습 중이다."
marked_text = "[CLS]" + text + "[SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

In [None]:
text = "과수원에 사과가 많았다." \
       "친구가 나에게 사과했다."\
       "백설공주는 독이 든 사과를 먹었다."

marked_text = "[CLS] " + text + " [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

In [None]:
segments_ids = [1]*len(tokenized_text)
print(segments_ids)

In [None]:
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [None]:
model = BertModel.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True)
model.eval()

In [None]:
with torch.no_grad():
    outputs = model(tokens_tensor, segments_tensors)
    hidden_states = outputs[2]

In [None]:
print ("계층 수:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
layer_i = 0

print ("배치 수:", len(hidden_states[layer_i]))
batch_i = 0

print ("토큰 수:", len(hidden_states[layer_i][batch_i]))
token_i = 0

print ("은닉층 유닛 수:", len(hidden_states[layer_i][batch_i][token_i]))

In [None]:
print('은닉 상태의 유형: ', type(hidden_states))
print('각 계층에서의 텐서 형태: ', hidden_states[0].size())

In [None]:
token_embeddings = torch.stack(hidden_states, dim=0)
token_embeddings.size()

In [None]:
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings.size()

In [None]:
token_embeddings = token_embeddings.permute(1,0,2)
token_embeddings.size()

In [None]:
token_vecs_cat = []
for token in token_embeddings:
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    token_vecs_cat.append(cat_vec)
print ('형태는: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))

In [None]:
token_vecs_sum = []
for token in token_embeddings:
    sum_vec = torch.sum(token[-4:], dim=0)
    token_vecs_sum.append(sum_vec)
print ('형태는: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

In [None]:
token_vecs = hidden_states[-2][0]
sentence_embedding = torch.mean(token_vecs, dim=0)
print ("최종 임베딩 벡터의 형태:", sentence_embedding.size())

In [None]:
for i, token_str in enumerate(tokenized_text):
    print (i, token_str)

In [None]:
print("사과가 많았다", str(token_vecs_sum[6][:5]))
print("나에게 사과했다", str(token_vecs_sum[10][:5]))
print("사과를 먹었다", str(token_vecs_sum[19][:5]))

In [None]:
from scipy.spatial.distance import cosine
diff_apple = 1 - cosine(token_vecs_sum[5], token_vecs_sum[27])
same_apple = 1 - cosine(token_vecs_sum[5], token_vecs_sum[16])
print('*유사한* 의미에 대한 벡터 유사성:  %.2f' % same_apple)
print('*다른* 의미에 대한 벡터 유사성:  %.2f' % diff_apple)