## Gender Classification Based on Transformer

#### Torch Device 설정

In [2]:
import torch
print(torch.__version__)

# Pick the compute device in priority order: Apple MPS first, then CUDA,
# and fall back to the CPU when neither backend reports itself available.
my_device = torch.device(
  'mps' if torch.backends.mps.is_available()
  else 'cuda' if torch.cuda.is_available()
  else 'cpu'
)

print(my_device)

2.7.0+cu126
cuda


#### Data Loading

In [4]:
import pandas as pd

df = pd.read_csv('../rnn/name_gender_filtered.csv')

# Gather every distinct character that occurs in the 'Name' column.
unique_chars = set()

for name in df['Name']:
  unique_chars |= set(name)

# Sort the characters so the vocabulary order is reproducible.
sorted_chars = sorted(unique_chars)
print(sorted_chars)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


#### stoi, itos dict

In [13]:
# Character -> integer index, numbered in the order of sorted_chars.
stoi = dict(zip(sorted_chars, range(len(sorted_chars))))
# The padding token takes the next free index after the real characters.
stoi['<P>'] = len(stoi)

# Integer index -> character: the inverse of stoi, padding token included.
itos = dict(zip(stoi.values(), stoi.keys()))

print(stoi)
print(itos)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, '<P>': 26}
{0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm', 13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y', 25: 'z', 26: '<P>'}


#### encode, decode function

1. encode function : string to indices list : 최대 문자 길이를 맞추기 위해 padding<br>
2. decode function : indices list to string (padding tokens removed)

In [16]:
# Maximum number of characters per name; encode_name pads shorter names
# with the '<P>' token up to this length.
char_length = 16

def encode_name(name):
  """Convert a name string into a list of character indices padded with '<P>'.

  Each character is looked up in ``stoi``; a character missing from ``stoi``
  raises ``KeyError``. The result is right-padded with the ``'<P>'`` index
  until it holds ``char_length`` entries. A name longer than ``char_length``
  is returned at its full length: the negative repeat count produces an
  empty padding list and nothing is truncated.

  Args:
    name: The name to encode, as a string.

  Returns:
    A list of integer indices.
  """
  indices = list(map(stoi.__getitem__, name))
  padding = [stoi['<P>']] * (char_length - len(name))
  return indices + padding

def decode_name(name):
  """Convert a list of character indices back into a name string.

  Every index is mapped through ``itos`` and entries that map to the
  ``'<P>'`` padding token are dropped before joining.

  Args:
    name: An iterable of integer indices, e.g. the output of encode_name.

  Returns:
    The decoded string without padding tokens.
  """
  symbols = (itos[index] for index in name)
  return ''.join(symbol for symbol in symbols if symbol != '<P>')


# Round-trip check: encode a sample name, then decode it back.
print(encode_name('nocope'))
print(decode_name(encode_name('nocope')))


# Lookup tables between the gender label and its class index.
gen2num = dict(F=0, M=1)
num2gen = {index: label for label, index in gen2num.items()}

[13, 14, 2, 14, 15, 4, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26]
nocope


#### Transformer for gender classification

In [19]:
# Model hyperparameters.
# NOTE(review): n_name_max_length presumably has to match char_length used by
# encode_name (both are 16) - confirm before changing either one.
n_name_max_length = 16
n_embed_dim = 32
n_multi_heads = 4
n_layers = 4

# Character dictionary size (every entry of stoi, including '<P>'); the same
# value is used as the number of embeddings.
n_embeddings = n_char_dict_length = len(stoi)

print(f"character dict length: {n_char_dict_length}")

character dict length: 27


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from xd_selfattention import XD_TransformerBlock

class XD_GenderTransformer(nn.Module):
  """Character-level Transformer model for classifying names by gender.

  The constructor builds a character embedding table, a learned positional
  embedding table, a stack of ``XD_TransformerBlock`` modules, a final
  ``LayerNorm`` and a linear classifier head.

  Args:
    name_max_length: Number of rows in the positional embedding table, i.e.
      the longest sequence for which a position vector exists.
    embed_dim: Size of each embedding vector; also the input width of the
      final LayerNorm and of the classifier.
    num_heads: Passed as the second positional argument to every
      ``XD_TransformerBlock`` (presumably the number of attention heads -
      confirm against xd_selfattention).
    num_layers: Number of ``XD_TransformerBlock`` modules to stack.
    num_embeddings: Number of distinct indices the character embedding can
      look up (e.g. the vocabulary size, including any padding token).
    num_classes: Number of output features of the classifier. Defaults to 2.
  """

  def __init__(self, name_max_length, embed_dim, num_heads, num_layers,
               num_embeddings, num_classes=2):
    super().__init__()

    self.embed_dim = embed_dim

    # Embedding: maps integer indices to continuous embedding vectors.
    # num_embeddings : number of distinct indices to embed (e.g. vocabulary size)
    # embed_dim : dimension of the embedding vector for each index
    self.char_embedding = nn.Embedding(num_embeddings, embed_dim)

    # Positional encoding: one learned vector per character position.
    self.positional_encoding = nn.Embedding(name_max_length, embed_dim)

    # Transformer blocks: create num_layers blocks from the constructor
    # arguments (not notebook globals). nn.Sequential takes modules as
    # separate positional arguments, so the sequence is unpacked.
    self.transformer_blocks = nn.Sequential(
      *(XD_TransformerBlock(embed_dim, num_heads) for _ in range(num_layers))
    )

    # Last layer normalization.
    self.last_lnorm = nn.LayerNorm(embed_dim)

    # Classifier: embedding dim -> class count.
    self.classifier = nn.Linear(embed_dim, num_classes)

  def forward(self, x):
    # Character embedding vectors (one per character).
    # char_embeddings = [batch_size, seq_length, embed_dim]
    # (assumes x is [batch_size, seq_length] integer indices - TODO confirm)
    char_embeddings = self.char_embedding(x)

    # Position index of each character.
    # char_positions = [1, seq_length]
    # Built on the input's own device so it never mismatches x.
    char_positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)

    # NOTE(review): as far as visible here, char_positions is not yet consumed
    # and nothing is returned - confirm/finish the rest of the forward pass.








