## Name Based Country Classification

#### 방법 2: 국적을 랜덤으로 선택한 뒤 그 국적 내 이름을 샘플링하여 학습함

클래스 불균형 문제를 완화하기 위해 국적을 랜덤하게 선택한 후, 국적 내 이름 데이터를 샘플링하여 학습합니다. <br>
(이를 위해 데이터를 국적별로 미리 분류하면 구현이 용이) 
<br>

### One Hot Encoding 을 위해 사용된 문자셋을 얻음

In [1]:
import pandas as pd
from collections import Counter

# Load the (name, country) pairs from the CSV file.
df = pd.read_csv('./name_country.csv')
text_data = df['Name'].tolist()
label_data = df['Country'].tolist()

# Distinct countries in sorted order; a country's position in this list
# serves as its class index.
country_list = sorted(set(label_data))
country_count = len(country_list)

# Group the names by country: key = country, value = list of its names.
data_dict = {}
for person_name, nationality in zip(text_data, label_data):
    data_dict.setdefault(nationality, []).append(person_name)

FileNotFoundError: [Errno 2] No such file or directory: './deepLearning/rnn/name_country.csv'

### Name Character Sets

In [None]:

# Gather every distinct character that occurs in any name, in sorted order.
unique_chars = sorted({char for name in df['Name'] for char in name})

# The character set as a single string; a character's position in this
# string is its index.
all_letters = ''.join(unique_chars)
print(all_letters)

###  Name to One-Hot Encoded Tensor


In [36]:

import torch
# Number of distinct characters in all_letters.
n_letters = len(all_letters)

def nameToTensor(name):
    """Encode a name as a one-hot tensor, one row per character.

    Args:
        name: The string to encode. Every character must occur in
            ``all_letters``.

    Returns:
        A ``(len(name), n_letters)`` tensor of zeros in which row ``i`` has
        a single 1 at the ``all_letters`` position of ``name[i]``.

    Raises:
        AssertionError: If a character of ``name`` is not in ``all_letters``.
    """
    one_hot = torch.zeros(len(name), n_letters)
    for row, char in enumerate(name):
        column = all_letters.find(char)
        # str.find returns -1 for an unknown character.
        assert column != -1, f"char is {name}, {char}"
        one_hot[row, column] = 1
    return one_hot

### Create a RNN Model

In [37]:
from xd_rnn import XD_RNN

# Hidden size (presumably the width of the RNN hidden state; confirm against XD_RNN)
n_hidden = 32
# Arguments: input size, hidden size, output size (one output per country)
rnn_model = XD_RNN(n_letters, n_hidden, country_count)

# Learning rate
learning_rate = 0.001
# Number of training iterations
iter_count = 100000

# Interval (in iterations) at which training status is printed
print_iter_count = 5000

### Model Training

In [None]:
import random
from torch import nn
from torch.optim import Adam, SGD

# Training loop. Each step picks a country uniformly at random and then a
# random name from that country, so every class is drawn equally often no
# matter how many names it has in the dataset.
loss_fn = nn.CrossEntropyLoss()

# learning_rate, iter_count and print_iter_count come from the configuration
# cell rather than being repeated here as literals.
# NOTE(review): the original line carried the remark "Adam 1.2647557258605",
# presumably the loss reached in an earlier run with Adam; confirm.
optimizer = Adam(rnn_model.parameters(), lr=learning_rate)

# Loop invariants, built once instead of on every iteration.
countries = list(data_dict)
country_to_index = {country: idx for idx, country in enumerate(country_list)}

# Running totals for the current reporting window.
crnt_loss = 0.
correct_predictions = 0

rnn_model.train()
for iter_idx in range(iter_count):
    random_country = random.choice(countries)
    random_name = random.choice(data_dict[random_country])

    name_tensor = nameToTensor(random_name)
    country_tensor = torch.tensor([country_to_index[random_country]], dtype=torch.long)
    hidden = rnn_model.get_hidden()
    rnn_model.zero_grad()

    # Feed the name one character at a time; only the output after the last
    # character is used for the loss.
    for char_tensor in name_tensor:
        output, hidden = rnn_model(char_tensor[None, :], hidden)

    loss = loss_fn(output, country_tensor)
    loss.backward()
    optimizer.step()

    crnt_loss += loss.item()
    predicted_index = torch.argmax(output, 1)
    correct_predictions += (predicted_index == country_tensor).sum().item()

    # Report after every print_iter_count completed iterations, so each
    # window holds exactly print_iter_count samples and the final window is
    # reported too.
    completed = iter_idx + 1
    if completed % print_iter_count == 0:
        average_loss = crnt_loss / print_iter_count
        accuracy = 100 * correct_predictions / print_iter_count
        print(f'Iter idx {completed}, Loss: {average_loss:.4f}, Accuracy: {accuracy:.2f}%')
        crnt_loss = 0.
        correct_predictions = 0
    


### Testing

In [None]:

# Classify a single name with the trained model.
test_name = 'jinping'
test_name_tensor = nameToTensor(test_name)

rnn_model.eval()
# Start from a fresh hidden state. The original assigned it to a misspelled
# name ("hiddne"), so the loop reused the hidden state left over from training.
hidden = rnn_model.get_hidden()
# No gradients are needed for inference.
with torch.no_grad():
    for char_tensor in test_name_tensor:
        output, hidden = rnn_model(char_tensor[None, :], hidden)
predicted_index = torch.argmax(output, 1).item()
print(country_list[predicted_index])
