## Gender Classification

### One-Hot Encoding 을 위한 문자셋을 얻음

In [1]:
import pandas as pd

# Load the name/gender dataset.
df = pd.read_csv('name_gender_filtered.csv')

# Build the character set used for one-hot encoding.
# Iterating a name yields its individual characters; collecting them in a set
# drops duplicates, and sorting gives every character a stable position.
unique_chars = ''.join(sorted({ch for name in df['Name'] for ch in name}))
print(unique_chars)

abcdefghijklmnopqrstuvwxyz


###  Name to One-Hot Encoded Tensor


In [2]:
import torch

# Width of every one-hot vector: one slot per character in the alphabet.
n_letters = len(unique_chars)

def name_to_tensor(name):
    """One-hot encode a name, one row per character.

    Args:
        name: String whose characters must all occur in ``unique_chars``.

    Returns:
        A ``torch`` tensor of shape ``(len(name), n_letters)`` filled with
        zeros except for a single 1 per row, placed at the column given by
        the character's position in ``unique_chars``.

    Raises:
        AssertionError: If a character is not present in ``unique_chars``.
    """
    encoded = torch.zeros(len(name), n_letters)
    for position, char in enumerate(name):
        column = unique_chars.find(char)
        assert column != -1, "letter not found: " + char
        encoded[position, column] = 1
    return encoded


# Gender label -> class index.
gen2num = {'F': 0, 'M': 1}
# Class index -> gender label (the inverse of gen2num).
num2gen = {class_index: label for label, class_index in gen2num.items()}


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/hyunious/opt/anaconda3/envs/py312/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/hyunious/opt/anaconda3/envs/py312/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/hyunious/opt/anaconda3/envs/py312/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739,

### Create an RNN Model

In [3]:
from xd_rnn import XD_RNN

# Hyperparameters.
n_hidden = 128          # hidden size handed to XD_RNN
learning_rate = 1e-4    # learning rate passed to the optimizer
epochs = 200            # number of passes over the dataset

# Build the model. Per the original note the arguments are the input size,
# hidden size and output size; the output size 2 presumably corresponds to the
# two gender classes -- confirm against XD_RNN.
rnn_model = XD_RNN(n_letters, n_hidden, 2)

### Model Training

In [4]:
import torch.nn as nn
from torch.optim import Adam, SGD

# Optimizer: Adam over all model parameters.
optimizer = Adam(rnn_model.parameters(), lr=learning_rate)

# Loss function: cross-entropy between the model output and the class index.
loss_fn = nn.CrossEntropyLoss()

# Put the model into training mode.
rnn_model.train()


# Repeat for the configured number of epochs.
for epoch in range(epochs):
    # Reshuffle the rows every epoch.
    # reference : https://blog.naver.com/frogsom1120/222127699322
    shuffled_df = df.sample(frac=1).reset_index(drop=True)

    # No train/validation split: metrics below are measured on training data.
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    # One optimizer step per name. Zipping the two columns avoids building a
    # Series for every row, which is what iterrows() does.
    for name, gender in zip(shuffled_df['Name'], shuffled_df['Gender']):
        # One-hot encode the name: shape (len(name), n_letters).
        input_tensor = name_to_tensor(name)

        # An empty name never enters the character loop, so `output` would be
        # undefined (or stale from the previous name); skip such rows.
        if input_tensor.size(0) == 0:
            continue

        # Class index of the gender label, as a 1-element long tensor.
        target_tensor = torch.tensor([gen2num[gender]], dtype=torch.long)

        # Hidden state for this name -- presumably a fresh initial state;
        # confirm against XD_RNN.get_hidden.
        hidden = rnn_model.get_hidden()

        # Clear gradients left over from the previous step.
        rnn_model.zero_grad()

        # Feed the name through the RNN one character at a time.
        for char_index in range(input_tensor.size(0)):
            # Single character: 1-D tensor of shape (n_letters,).
            char_tensor = input_tensor[char_index]
            # Add a leading dimension so the model receives (1, n_letters).
            output, hidden = rnn_model(char_tensor[None, :], hidden)

        # Loss is computed on the output produced after the last character.
        loss = loss_fn(output, target_tensor)
        # Backpropagate.
        loss.backward()
        # Update the parameters.
        optimizer.step()

        # Accumulate the loss for the epoch average.
        total_loss += loss.item()

        # Predicted class = index of the largest output value.
        predicted_index = torch.argmax(output, dim=1)

        # Count correct predictions.
        correct_predictions += (predicted_index == target_tensor).sum().item()
        total_predictions += 1

    # Mean loss over the names processed this epoch.
    avg_loss = total_loss / total_predictions

    # Accuracy in percent.
    accuracy = 100 * correct_predictions / total_predictions

    # Report progress. `.2f` gives two decimals; the previous `2f` was a
    # minimum width of 2 with the default six decimals.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")
  
    


Epoch 1/500, Loss: 0.4253, Accuracy: 80.269083%
Epoch 2/500, Loss: 0.3805, Accuracy: 83.225736%
Epoch 3/500, Loss: 0.3764, Accuracy: 83.551240%
Epoch 4/500, Loss: 0.3752, Accuracy: 83.670591%
Epoch 5/500, Loss: 0.3735, Accuracy: 83.572940%
Epoch 6/500, Loss: 0.3732, Accuracy: 83.735691%
Epoch 7/500, Loss: 0.3721, Accuracy: 83.632615%
Epoch 8/500, Loss: 0.3719, Accuracy: 83.936418%
Epoch 9/500, Loss: 0.3719, Accuracy: 83.811642%
Epoch 10/500, Loss: 0.3706, Accuracy: 84.061195%
Epoch 11/500, Loss: 0.3716, Accuracy: 83.817067%
Epoch 12/500, Loss: 0.3704, Accuracy: 83.898443%
Epoch 13/500, Loss: 0.3704, Accuracy: 84.120870%
Epoch 14/500, Loss: 0.3704, Accuracy: 83.827917%
Epoch 15/500, Loss: 0.3702, Accuracy: 83.860468%
Epoch 16/500, Loss: 0.3688, Accuracy: 84.050344%
Epoch 17/500, Loss: 0.3690, Accuracy: 84.115445%
Epoch 18/500, Loss: 0.3686, Accuracy: 83.952694%
Epoch 19/500, Loss: 0.3681, Accuracy: 83.979819%
Epoch 20/500, Loss: 0.3674, Accuracy: 83.887593%
Epoch 21/500, Loss: 0.3664, A

KeyboardInterrupt: 

### Testing

In [7]:
# Name to classify; it is one-hot encoded the same way as the training data.
test_name = 'elsa'
test_tensor = name_to_tensor(test_name)

# Switch the model to evaluation mode.
rnn_model.eval()

hidden = rnn_model.get_hidden()

# Inference does not need gradients; no_grad avoids building the autograd
# graph for every character step.
with torch.no_grad():
    for char_index in range(test_tensor.size(0)):
        char_tensor = test_tensor[char_index]
        # Add a leading dimension so the model receives (1, n_letters).
        output, hidden = rnn_model(char_tensor[None, :], hidden)

# Predicted class = index of the largest value in the final output.
predicted_index = torch.argmax(output, dim=1)
print(num2gen[predicted_index.item()])


F
