In [1]:
from __future__ import unicode_literals, print_function, division

import glob  # file-path pattern matching; supports `*` wildcard expansion
import os  # portable path handling (basename / splitext)
from io import open  # open() that accepts an `encoding` argument; used to read the data files

def findFiles(path):
    """Return the paths matching the glob pattern ``path``, sorted.

    ``glob.glob`` yields matches in arbitrary, filesystem-dependent order.
    Sorting makes the result stable across runs and machines, so anything
    indexed by position in this list (such as the category list built from
    it) is reproducible.

    Args:
        path: A glob pattern such as ``'data/names/*.txt'``.

    Returns:
        A lexicographically sorted list of matching path strings; an empty
        list when nothing matches.
    """
    return sorted(glob.glob(path))

# Show which data files match the pattern (one .txt file per category).
print(findFiles('data/names/*.txt'))


import unicodedata #編碼處理
import string

# Alphabet used for one-hot indexing: the 52 ASCII letters (a-z, A-Z) plus the
# five characters " .,;'", i.e. 57 symbols in total.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


def unicodeToAscii(s):
    """Reduce a Unicode string to the characters contained in ``all_letters``.

    The string is NFD-normalised so that accented letters split into a base
    letter followed by combining marks. Combining marks (category ``'Mn'``)
    are discarded, and so is every remaining character that is not part of
    ``all_letters``.

    Args:
        s: The Unicode string to convert.

    Returns:
        The filtered string; it may be shorter than ``s`` or empty.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # Combining accent left over from decomposition: drop it.
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)


# Containers for the training data; they start empty and are filled while the
# data files are read.
category_lines = {} # category name -> list of lines (e.g. names) in that category
all_categories = [] # category names (e.g. one entry per language)

def readLines(filename):
    """Read a UTF-8 text file and return its lines as ASCII-only strings.

    Every line is passed through ``unicodeToAscii`` so that it contains only
    characters from ``all_letters``. Without this step, characters outside
    the alphabet make ``all_letters.find`` return -1, which silently selects
    the last column of the one-hot encoding. Lines that are empty after
    conversion are dropped because a zero-length line cannot be fed through
    the network.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of non-empty strings, one per line of the file.
    """
    # The context manager closes the file handle even if reading fails.
    with open(filename, encoding='utf-8') as names_file:
        raw_lines = names_file.read().strip().split('\n')

    ascii_lines = (unicodeToAscii(raw_line) for raw_line in raw_lines)
    return [ascii_line for ascii_line in ascii_lines if ascii_line]

for filename in findFiles('data/names/*.txt'):
    # The category name is the file name without directory or extension.
    # os.path handles both '/' and '\\' separators and keeps any dots that
    # are part of the name itself.
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)  # every line stored in this category's file
    # category -> list of lines: each category holds the N items to classify.
    category_lines[category] = lines

n_categories = len(all_categories)  # total number of categories

['data/names/Arabic.txt', 'data/names/Chinese.txt', 'data/names/Czech.txt', 'data/names/Dutch.txt', 'data/names/English.txt', 'data/names/French.txt', 'data/names/German.txt', 'data/names/Greek.txt', 'data/names/Irish.txt', 'data/names/Italian.txt', 'data/names/Japanese.txt', 'data/names/Korean.txt', 'data/names/Polish.txt', 'data/names/Portuguese.txt', 'data/names/Russian.txt', 'data/names/Scottish.txt', 'data/names/Spanish.txt', 'data/names/Vietnamese.txt']


In [2]:
import torch

def letterToIndex(letter):
    """Return the position of ``letter`` within ``all_letters``.

    The lookup uses ``str.find``, so a letter that is not part of
    ``all_letters`` yields -1 rather than raising.
    """
    position = all_letters.find(letter)
    return position

def letterToTensor(letter):
    """Encode a single letter as a one-hot tensor of shape ``(1, n_letters)``.

    The column chosen is ``letterToIndex(letter)``; all other entries are 0.
    """
    one_hot = torch.zeros(1, n_letters)
    column = letterToIndex(letter)
    one_hot[0][column] = 1
    return one_hot

def lineToTensor(line):
    """Encode a line as a tensor of shape ``(len(line), 1, n_letters)``.

    Slice ``k`` along the first dimension is the one-hot encoding of the
    ``k``-th letter of ``line``; the middle dimension always has size 1.
    """
    n_chars = len(line)
    encoded = torch.zeros(n_chars, 1, n_letters)
    for position in range(n_chars):
        column = letterToIndex(line[position])
        encoded[position][0][column] = 1
    return encoded

print(letterToTensor('b')) # one-hot row vector for the letter 'b'
print(lineToTensor('Jones').size()) # 5 = number of letters, 1 = singleton middle dimension, 57 = n_letters

tensor([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])
torch.Size([5, 1, 57])


In [3]:
import torch.nn as nn

 
class RNN(nn.Module):
    """Minimal recurrent cell built from two linear layers.

    At every step the input and the previous hidden state are concatenated
    along dimension 1. One linear layer maps that vector to the next hidden
    state and a second one maps it to the output scores, which are turned
    into log-probabilities by ``LogSoftmax``. No non-linearity is applied
    to the hidden state.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Width of one input vector.
            hidden_size: Width of the hidden state.
            output_size: Number of output classes.
        """
        super(RNN, self).__init__()
        self.hidden_size = hidden_size

        # Both layers consume the concatenation [input, hidden].
        combined_size = input_size + hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)  # -> next hidden state
        self.i2o = nn.Linear(combined_size, output_size)  # -> class scores
        self.softmax = nn.LogSoftmax(dim=1)  # normalise scores to log-probabilities

    def forward(self, input, hidden):
        """Advance the cell by one step.

        Args:
            input: Tensor concatenated with ``hidden`` along dimension 1.
            hidden: Hidden state from the previous step.

        Returns:
            A tuple ``(output, hidden)``: the log-probabilities over the
            output classes and the new hidden state.
        """
        combined = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(combined)
        log_probs = self.softmax(self.i2o(combined))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, hidden_size)``."""
        initial_state = torch.zeros(1, self.hidden_size)
        return initial_state

n_hidden = 128 # width of the hidden state
rnn = RNN(n_letters, n_hidden, n_categories) # input width = alphabet size, output width = number of categories

In [4]:
def categoryFromOutput(output):
    """Map a network output to its highest-scoring category.

    Args:
        output: Tensor of category scores as returned by the network.

    Returns:
        A tuple ``(category_name, category_index)`` for the top-1 entry.
    """
    # topk(1) returns (values, indices); only the index is needed here.
    _, best_indices = output.topk(1)
    best = best_indices[0].item()
    return all_categories[best], best

#print(categoryFromOutput(output))

In [5]:
import random

def randomChoice(l):
    """Return one element of the sequence ``l`` chosen uniformly at random."""
    last_index = len(l) - 1
    picked = random.randint(0, last_index)  # randint is inclusive on both ends
    return l[picked]

def randomTrainingExample():
    """Draw one random training example.

    A category is picked at random first, then a line from that category.

    Returns:
        A tuple ``(category, line, category_tensor, line_tensor)``:
        the category name, the chosen line, a ``torch.long`` tensor holding
        the category's index in ``all_categories``, and the one-hot encoding
        of the line produced by ``lineToTensor``.
    """
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])

    category_index = all_categories.index(category)
    category_tensor = torch.tensor([category_index], dtype=torch.long)

    return category, line, category_tensor, lineToTensor(line)

# Print a handful of random examples as a sanity check of the data pipeline.
for i in range(5):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category = %s / line = %s' % (category, line))

category = Russian / line = Zhorin
category = French / line = Sarkozy
category = English / line = Rolfe
category = Korean / line = Koo
category = Korean / line = Mo


In [6]:
# Loss function: negative log-likelihood for multi-class classification.
# NLLLoss expects log-probabilities, which the model's LogSoftmax layer emits.
criterion = nn.NLLLoss()

In [7]:
learning_rate = 0.005 # step size applied to every gradient in train()

def train(category_tensor, line_tensor):
    """Run one gradient-descent step on a single example.

    The line is fed through ``rnn`` one letter at a time, the loss is taken
    on the output of the final step, and every parameter is then moved
    against its gradient, scaled by ``learning_rate``.

    Args:
        category_tensor: Target category index tensor passed to ``criterion``.
        line_tensor: Encoded line; it is iterated along its first dimension.

    Returns:
        A tuple ``(output, loss)``: the network output after the last letter
        and the loss as a Python number.

    Raises:
        ValueError: If ``line_tensor`` has no steps, since there would be no
            output to compute a loss on.
    """
    n_steps = line_tensor.size(0)
    if n_steps == 0:
        raise ValueError('line_tensor is empty: cannot train on a zero-length line')

    hidden = rnn.initHidden()  # fresh hidden state for every example

    rnn.zero_grad()  # clear gradients accumulated by the previous step

    for i in range(n_steps):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Manual SGD update: p <- p - learning_rate * grad. The keyword form
    # add_(other, alpha=...) replaces the deprecated add_(scalar, tensor).
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()

In [None]:
import time
import math

n_iters = 100000 # number of training iterations (one random example each)
print_every = 5000 # print a progress line every `print_every` iterations
plot_every = 1000 # record an averaged loss every `plot_every` iterations


# Keep track of losses for plotting
current_loss = 0 # running sum of losses since the last recorded average
all_losses = [] 

def timeSince(since):
    """Format the time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    Args:
        since: A start timestamp as returned by ``time.time()``.

    Returns:
        A string such as ``'2m 5s'``; fractional seconds are truncated by
        the ``%d`` format.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60  # remainder after whole minutes
    return '%dm %ds' % (minutes, seconds)

start = time.time()

# The counter is named `iteration` rather than `iter` so that the builtin
# iter() stays usable in later cells. The range runs from 1 to n_iters
# inclusive, which makes the modulo checks fire on the last iteration too.
for iteration in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()  # draw a random example
    output, loss = train(category_tensor, line_tensor)  # one training step
    current_loss += loss  # accumulate for the running average

    # Print iteration number, progress, elapsed time, loss, name and guess
    if iteration % print_every == 0:
        guess, guess_i = categoryFromOutput(output)  # top-1 category for this example
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (
            iteration, iteration / n_iters * 100, timeSince(start),
            loss, line, guess, correct))

    # Add the average loss of the last `plot_every` iterations to the list
    if iteration % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

5000 5% (0m 5s) 2.2582 Lac / Vietnamese ✓
10000 10% (0m 10s) 3.0083 Neil / Korean ✗ (Irish)


In [12]:
def evaluate(line_tensor):
    """Feed an encoded line through ``rnn`` and return the final output.

    The hidden state starts from ``rnn.initHidden()`` and is carried from one
    step to the next. Gradient tracking is not disabled here; callers that
    only need predictions can wrap the call in ``torch.no_grad()``.

    Args:
        line_tensor: Encoded line; it is iterated along its first dimension.

    Returns:
        The network output produced by the last step.

    Raises:
        ValueError: If ``line_tensor`` has no steps, since no output would
            be produced.
    """
    n_steps = line_tensor.size(0)
    if n_steps == 0:
        raise ValueError('line_tensor is empty: nothing to evaluate')

    hidden = rnn.initHidden()
    for i in range(n_steps):
        output, hidden = rnn(line_tensor[i], hidden)
    return output


def predict(input_line, n_predictions=5):
    """Print and return the most likely categories for ``input_line``.

    Args:
        input_line: The text to classify.
        n_predictions: Maximum number of categories to report. It is capped
            at the number of scores the network outputs, because ``topk``
            raises when asked for more entries than exist.

    Returns:
        A list of ``[value, category_name]`` pairs ordered from most to
        least likely, where ``value`` is the network's output score for that
        category (a log-probability, given the model's LogSoftmax output).
    """
    print('\n> %s' % input_line)
    with torch.no_grad():  # inference only: no gradients needed
        output = evaluate(lineToTensor(input_line))

        # Take the top N categories along dimension 1.
        n_top = min(n_predictions, output.size(1))
        topv, topi = output.topk(n_top, 1)  # topv: scores, topi: category indices

        predictions = []
        for i in range(n_top):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
            print(all_categories[category_index])
            predictions.append([value, all_categories[category_index]])

    return predictions

# Try the trained model on a couple of sample names.
for sample_name in ('Bill', 'Jerry'):
    predict(sample_name)


> Bill
English
Irish
German
Czech
Scottish

> Jerry
English
Czech
Scottish
French
German
