## Name Based Country Classification

### One Hot Encoding 을 위해 사용된 문자셋을 얻음

In [1]:
import pandas as pd
from collections import Counter

df = pd.read_csv('name_country.csv')

# 이름 리스트를 얻어옴
name_data = df['Name'].to_list()
# 국적 데이터를 얻어옴 
country_data = df['Country'].to_list()

# 국적 리스트를 구성 : set 을 이용하여 중복 제거 후 정렬
country_list = sorted(set(country_data))
country_count = len(country_list)
print(f"Country Count: {country_count}, Countries={country_list}")

# 국적 to 인덱스로 변환
country_to_index = {country: i for i, country in enumerate(country_list)}
print(f"Country Index: {country_to_index}")

# collections.Counter를 사용하여 국적별 데이터 수 계산
country_counts = Counter(country_data)
print(sorted(country_counts.items()))

Country Count: 18, Countries=['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese']
Country Index: {'Arabic': 0, 'Chinese': 1, 'Czech': 2, 'Dutch': 3, 'English': 4, 'French': 5, 'German': 6, 'Greek': 7, 'Irish': 8, 'Italian': 9, 'Japanese': 10, 'Korean': 11, 'Polish': 12, 'Portuguese': 13, 'Russian': 14, 'Scottish': 15, 'Spanish': 16, 'Vietnamese': 17}
[('Arabic', 2000), ('Chinese', 268), ('Czech', 519), ('Dutch', 297), ('English', 3668), ('French', 277), ('German', 724), ('Greek', 203), ('Irish', 232), ('Italian', 709), ('Japanese', 991), ('Korean', 94), ('Polish', 139), ('Portuguese', 74), ('Russian', 9408), ('Scottish', 100), ('Spanish', 298), ('Vietnamese', 73)]


### Name Character Sets

In [2]:
# one hot encoding 을 위한 문자 집합 생성
unique_chars = set()

# set 집합에 문자열을 추가하면 해당 문자열을 낱개로 쪼개어 각각의 문자들을 하나의 인자로 인식하여 집합에 추가
# 중복된 문자는 추가되지 않음.!!!
for name in name_data:
    unique_chars.update(name)
    if ',' in name:
        print(f"쉼표가 포함된 이름 발견: {name}")

# 문자 집합을 정렬  
unique_chars = sorted(list(unique_chars))
unique_chars = ''.join(unique_chars)
print(f"character count: {len(unique_chars)}, characters={unique_chars}" )

character count: 28, characters= 'abcdefghijklmnopqrstuvwxyz


###  Name to One-Hot Encoded Tensor


In [7]:
import torch

n_letters = len(unique_chars)

def name_to_tensor(name):
    tensor = torch.zeros(len(name), n_letters)
    for i, letter in enumerate(name):
        letter_index = unique_chars.find(letter)
        assert letter_index != -1, "letter not found: " + letter
        tensor[i][letter_index] = 1
    return tensor

### Create a RNN Model

In [4]:
from xd_rnn import XD_RNN

# 은닉층 수
n_hidden = 32
# 입력층 수, 은닉층 수, 출력층 수
rnn_model = XD_RNN(n_letters, n_hidden, country_count)

# 학습률
learning_rate = 0.001
# 학습 횟수
epochs = 200

### Model Trainning

In [5]:
import torch.nn as nn
from torch.optim import Adam, SGD

# 최적화 알고리즘
optimizer = Adam(rnn_model.parameters(), lr=learning_rate)

# 손실 함수
loss_fn = nn.CrossEntropyLoss()

# 모델 학습 설정
rnn_model.train()


# 학습 횟수만큼 반복
for epoch in range(epochs):
    # 데이터 셔플 - reference : https://blog.naver.com/frogsom1120/222127699322
    shuffled_df = df.sample(frac=1).reset_index(drop=True)

    # 데이터 분할은 하지 않음.
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    # 데이터 (rows) 학습 
    for index, row in shuffled_df.iterrows():
        # 이름을 텐서로 변환 (one-hot encoding)
        input_tensor = name_to_tensor(row['Name'])
        # 국적을 텐서로 변환
        target_tensor = torch.tensor([country_list.index(row['Country'])], dtype=torch.long)

        # 모델 은닉층(상태)를 얻어옴
        hidden = rnn_model.get_hidden()

        # 모델 그레디언트 초기화
        rnn_model.zero_grad()

        # rnn 학습
        for char_index in range(input_tensor.size(0)):
            # char tensor 추출 : 2차원 텐서 (1, 28)
            char_tensor = input_tensor[char_index]
            # name char 학습 : 1차원 텐서 (28)
            output, hidden = rnn_model(char_tensor[None, :], hidden)


        # 손실 계산
        loss = loss_fn(output, target_tensor)
        # 손실 역전파
        loss.backward()
        # 최적화 실행
        optimizer.step()

        # 손실 합계 계산
        total_loss += loss.item()

        # 예측 결과 계산
        predicted_index = torch.argmax(output, dim=1)

        # 예측 결과 확인
        correct_predictions += (predicted_index == target_tensor).sum().item()
        total_predictions += 1


    # 평균 손실 계산
    avg_loss = total_loss / total_predictions
    
    # 정확도 계산
    accuracy = 100 * correct_predictions / total_predictions

    # 학습 횟수 출력
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:2f}%")
  
    


Epoch 1/200, Loss: 1.1350, Accuracy: 66.444157%
Epoch 2/200, Loss: 0.8976, Accuracy: 73.288831%
Epoch 3/200, Loss: 0.8155, Accuracy: 75.460795%
Epoch 4/200, Loss: 0.7601, Accuracy: 76.950284%
Epoch 5/200, Loss: 0.7229, Accuracy: 77.946598%
Epoch 6/200, Loss: 0.6948, Accuracy: 78.703796%
Epoch 7/200, Loss: 0.6757, Accuracy: 78.853243%
Epoch 8/200, Loss: 0.6613, Accuracy: 79.505828%
Epoch 9/200, Loss: 0.6461, Accuracy: 79.944206%
Epoch 10/200, Loss: 0.6400, Accuracy: 80.098635%
Epoch 11/200, Loss: 0.6323, Accuracy: 80.342732%
Epoch 12/200, Loss: 0.6230, Accuracy: 80.571884%
Epoch 13/200, Loss: 0.6189, Accuracy: 80.855833%
Epoch 14/200, Loss: 0.6105, Accuracy: 81.005280%
Epoch 15/200, Loss: 0.6101, Accuracy: 81.084986%
Epoch 16/200, Loss: 0.6084, Accuracy: 81.025207%
Epoch 17/200, Loss: 0.6009, Accuracy: 81.194580%
Epoch 18/200, Loss: 0.5990, Accuracy: 81.368935%
Epoch 19/200, Loss: 0.5993, Accuracy: 81.398824%
Epoch 20/200, Loss: 0.5961, Accuracy: 81.533327%
Epoch 21/200, Loss: 0.5927, A

### Testing

In [16]:
test_name = 'jinping'
test_tensor = name_to_tensor(test_name)

rnn_model.eval()

hidden = rnn_model.get_hidden()

for char_index in range(test_tensor.size(0)):
    char_tensor = test_tensor[char_index]
    output, hidden = rnn_model(char_tensor[None, :], hidden)


print (f"Output : {output.item()}")
# 예측 결과 확인
predicted_index = torch.argmax(output, dim=1)
print(country_list[predicted_index.item()])


RuntimeError: a Tensor with 18 elements cannot be converted to Scalar