<a href="https://colab.research.google.com/github/junieberry/NLP-withPyTorch/blob/main/04_MLP_surname/04_MLP_surname_Classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from argparse import Namespace
from collections import Counter
import json
import os
import string

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm

## 4.2 예제: MLP로 성씨 분류하기

- Vocabulary, Vectorizer, Dataloader 클래스를 사용해 성씨 문자열을 벡터의 미니배치로 변환
- surnameclassifier 모델 설계

### 4.2.1 surname dataset

- 18개 국적의 성씨 10,000개
- 불균형함 (최상위 클래스 3개가 전체 데이터의 60% 차지)
- 출신 국가와 성씨 맞춤법 사이의 관계

따라서 전처리 해줌
1. 불균형 줄임

  러시아 성씨를 서브샘플링
2. train, val, test (70, 15, 15)


In [2]:

class SurnameDataset(Dataset):
    def __init__(self, surname_df, vectorizer):
        self.surname_df = surname_df
        self._vectorizer = vectorizer

        self.train_df = self.surname_df[self.surname_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.surname_df[self.surname_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.surname_df[self.surname_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')
        
        # 클래스 가중치
        class_counts = surname_df.nationality.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.nationality_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

    @classmethod
    def load_dataset_and_make_vectorizer(cls, surname_csv):
        surname_df = pd.read_csv(surname_csv)
        train_surname_df = surname_df[surname_df.split=='train']
        return cls(surname_df, SurnameVectorizer.from_dataframe(train_surname_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, surname_csv, vectorizer_filepath):
        surname_df = pd.read_csv(surname_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(surname_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return SurnameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):

        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]

        surname_vector = \
            self._vectorizer.vectorize(row.surname)

        nationality_index = \
            self._vectorizer.nationality_vocab.lookup_token(row.nationality)

        return {'x_surname': surname_vector,
                'y_nationality': nationality_index}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 

    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

### 4.2.2 Vocabulary, Vectorizer, DataLoader

**Vocabulary**

개별 토큰을 정수로 바꿔줌

Vocabulary 클래스 내에는 파이선 딕셔너리 두개

1. 문자 to 인덱스
2. 인덱스 to 문자

`add_token` = Vocabulary에 새 토큰 추가
`lookup_token` = 주어진 토큰에 해당하는 인덱스 반환
`lookup_index` = 주어진 인덱스에 해당하는 토큰 반환

In [3]:
class Vocabulary(object):

  ##token_to_idx (dict): 기존 토큰-인덱스 매핑 딕셔너리
  ## add_unk (bool): UNK 토큰을 추가할지 지정하는 플래그
  ## unk_token (str): Vocabulary에 추가할 UNK 토큰
  def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
    if token_to_idx is None:
      token_to_idx = {}
    
    self._token_to_idx = token_to_idx

    self._idx_to_token = {idx: token 
                          for token, idx in self._token_to_idx.items()}
    
    self._add_unk = add_unk
    self._unk_token = unk_token

    self.unk_index = -1
    if add_unk:
      self.unk_index = self.add_token(unk_token)
  
  ## 직렬화할 수 있는 딕셔너리를 반환
  def to_serializable(self):
    return { 'token_to_idx': self._token_to_idx,
            'add_unk': self._add_unk,
            'unk_token': self._unk_token
    }
  
  @classmethod
  ## 직렬화된 딕셔너리에서 Vocabulary 객체를 만든다.
  def from_serializable(cls, contents):
    return cls(**contents)
  
  ## 토큰 추가하고 매핑 딕셔너리를 업데이트
  ## token (str) == Vocabulary에 추가할 토큰
  ## return == 토큰의 index
  def add_token(self, token):
    if token in self._token_to_idx:
      index = self._token_to_idx[token]
    else:
      index = len(self._token_to_idx)
      self._token_to_idx[token] = index
      self._idx_to_token[index] = token
    return index


  ## 토큰들을 추가하고 매핑 딕셔너리를 업데이트
  ## tokens (list) == 문자열 토큰 리스트
  ## return == 토큰들의 indices
  def add_many(self, tokens):
    return [self.add_token(token) for token in tokens]


  ## 토큰에 대응하는 인덱스 추출, 토큰이 없으면 UNK 인덱스 반환
  ## token (str) == 찾을 토큰
  ## index (int) == 토큰에 해당하는 인덱스
  def lookup_token(self, token):
        
        ## ??????
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

  ## 인덱스에 해당하는 토큰 반환
  ## index (int) == 찾을 인덱스
  ## return = 인덱스의 토큰
  def lookup_index(self, index):
    if index not in self._idx_to_token:
      raise KeyError("Vocabulary에 인덱스(%d)가 없습니다." % index)
    return self._idx_to_token[index]

  def __str__(self):
    return "<Vocabulary(size=%d)>" % len(self)

  def __len__(self):
    return len(self._token_to_idx)



**SurnameVectorizer**

Vocabulary를 적용해 문자열을 벡터로 바꿔줌

근데 공백으로 문자열을 나누지 않고

In [4]:

class SurnameVectorizer(object):
  
  ## surname_vocab (Vocabulary) == 성을 정수에 매핑하는 Vocabulary
  ## nationality_vocab (Vocabulary) == 국가 레이블을 정수에 매핑하는 Vocabulary
    def __init__(self, surname_vocab, nationality_vocab):
 
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab

    ## 매개변수로 받은 성씨를 원핫 벡터로 변환시킴
    ## review (str) == 리뷰
    ## return = 변환된 원핫 벡터
    def vectorize(self, surname):
  
        vocab = self.surname_vocab
        one_hot = np.zeros(len(vocab), dtype=np.float32)
        for token in surname:
            one_hot[vocab.lookup_token(token)] = 1

        return one_hot


    ## 데이터셋 데이터프레임에서 Vectorizer 객체를 만든다
    ## surname_df (pandas.DataFrame) == 성씨 데이터셋
    ## cutoff (int) == 빈도 기반 필터링 설정값
    ## return == SurnameVectorizer 객체
    @classmethod
    def from_dataframe(cls, surname_df):

        surname_vocab = Vocabulary(unk_token="@")
        nationality_vocab = Vocabulary(add_unk=False)

        for index, row in surname_df.iterrows():
            for letter in row.surname:
                surname_vocab.add_token(letter)
            nationality_vocab.add_token(row.nationality)

        return cls(surname_vocab, nationality_vocab)

    @classmethod
    def from_serializable(cls, contents):
        surname_vocab = Vocabulary.from_serializable(contents['surname_vocab'])
        nationality_vocab =  Vocabulary.from_serializable(contents['nationality_vocab'])
        return cls(surname_vocab=surname_vocab, nationality_vocab=nationality_vocab)

    def to_serializable(self):
        return {'surname_vocab': self.surname_vocab.to_serializable(),
                'nationality_vocab': self.nationality_vocab.to_serializable()}

### 4.2.3 SurnameClassifier 모델

1. Linear
2. ReLU
3. Linear
4. Softmax

In [5]:
class SurnameClassifier(nn.Module):

  def __init__(self, input_dim, hidden_dim, output_dim):
    super(SurnameClassifier, self).__init__()
    self.fc1 = nn.Linear(input_dim, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, output_dim)
  
  def forward(self, x_in, apply_softmax = False):

    middle = F.relu(self.fc1(x_in))
    predict = self.fc2(middle)

    if apply_softmax:
      predict = F.softmax(predict, dim=1)
    
    return predict

### 4.2.4 모델 훈련

모델의 출력 종류와 사용하는 손실 함수가 다르다!

이번 예제는 확률로 변환할 수 있는 다중 클래스에 대한 예측 백터 -> CrossEntropyLoss()

In [6]:
cd /content/drive/MyDrive/nlp-with-pytorch/chapter_4/4_2_mlp_surnames

/content/drive/MyDrive/nlp-with-pytorch/chapter_4/4_2_mlp_surnames


In [7]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [8]:

args = Namespace(
    surname_csv="data/surnames/surnames_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="data/model_storage/ch4/surname_mlp",


    # 모델 하이퍼파라미터
    hidden_dim = 300,


    # 훈련 하이퍼파라미터
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,



    # 실행 옵션
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("파일 경로: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# CUDA 체크
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("CUDA 사용여부: {}".format(args.cuda))

# 재현성을 위해 시드 설정
set_seed_everywhere(args.seed, args.cuda)

# 디렉토리 처리
handle_dirs(args.save_dir)

파일 경로: 
	data/model_storage/ch4/surname_mlp/vectorizer.json
	data/model_storage/ch4/surname_mlp/model.pth
CUDA 사용여부: False


In [9]:
## 데이터셋, 모델, 손실, 옵티마이저, 훈련 상태 딕셔너리 만들기


def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

## 훈련 state update

## early stopping
## Model checkpoint

## args (Namespace) == 매개변수
## model (nn.Module) == 훈련할 모델
## train_state (dic) == tratin state의 dictionary

def update_train_state(args, model, train_state):

  ## 초기 상태 == 초기 모델 저장하고 early stopping 값 False로 설정해줌
  if train_state['epoch_index'] == 0:
    torch.save(model.state_dict(), train_state['model_filename'])
    train_state['stop_early'] = False

  ## 훈련중 ...
  elif train_state['epoch_index'] >= 1:
    loss_tm1, loss_t = train_state['val_loss'][-2:]

    ## 손실이 나빠지면 early stopping ++
    if loss_t >= train_state['early_stopping_best_val']:
      train_state['early_stopping_step'] += 1
    ## 손실 좋아지면 모델 저장하고 early stopping = 0
    else:
      if loss_t < train_state['early_stopping_best_val']:
        torch.save(model.state_dict(), train_state['model_filename'])
      
      train_state['early_stopping_step'] = 0
    

    ## early stopping 할거야?
    train_state['stop_early'] = train_state['early_stopping_step'] >= args.early_stopping_criteria
  
  return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [10]:
if args.reload_from_files:
    # 체크포인트에서 훈련을 다시 시작
    print("로드해서 만들기")
    dataset = SurnameDataset.load_dataset_and_load_vectorizer(args.surname_csv,
                                                              args.vectorizer_file)
else:
    # 데이터셋과 Vectorizer 만들기
    print("새로 만들기")
    dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.surname_csv)
    dataset.save_vectorizer(args.vectorizer_file)
    
vectorizer = dataset.get_vectorizer()
classifier = SurnameClassifier(input_dim=len(vectorizer.surname_vocab), 
                               hidden_dim=args.hidden_dim, 
                               output_dim=len(vectorizer.nationality_vocab))
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)

    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)

train_state = make_train_state(args)

새로 만들기


훈련 시작

In [11]:


# ## 진행바를 위한 함수함수함수들~~~
# epoch_bar = tqdm.notebook.tqdm(desc='training routine', 
#                           total=args.num_epochs,
#                           position=0)

# dataset.set_split('train')
# train_bar = tqdm.notebook.tqdm(desc='split=train',
#                           total=dataset.get_num_batches(args.batch_size), 
#                           position=1, 
#                           leave=True)
# dataset.set_split('val')
# val_bar = tqdm.notebook.tqdm(desc='split=val',
#                         total=dataset.get_num_batches(args.batch_size), 
#                         position=1, 
#                         leave=True)


# ## 에포크 횟수만큼 반복!
# print(args.num_epochs)
# for epoch_index in range(args.num_epochs):
#   train_state['epoch_index'] = epoch_index

#   ## train data의 배치 생성
#   dataset.set_split('train')
#   batch_generator = generate_batches(dataset=dataset,
#                                      batch_size = args.batch_size,
#                                      device=args.device)

#   ## loss와 accuracy 초기화
#   running_loss = 0.0
#   running_acc = 0.0
#   ## 모델이 train 중에 있다 == 파라미터를 바꿀 수 있다
#   classifier.train()

#   ## train data 돌기
#   for batch_index, batch_dict in enumerate(batch_generator):


#     ## 1. Gradient 0으로 초기화
#     optimizer.zero_grad()

#     ## 2. 출력 계산
#     y_pred = classifier(x_in=batch_dict['x_surname'])

#     ## 3. 손실 계산

#     loss = loss_func(y_pred, batch_dict['y_nationality'])
#     loss_t = loss.item()

#     ## ???????????
#     running_loss += (loss_t - running_loss) / (batch_index + 1)

#     ## 4. 손실 기반으로 역전파
#     loss.backward()

#     ## 5. 옵티마이저로 업데이트
#     optimizer.step()

    

#     ## 정확도 계산
#     acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
#     running_acc += (acc_t - running_acc) / (batch_index + 1)

#     # 진행 바 업데이트
#     train_bar.set_postfix(loss=running_loss,
#                           acc=running_acc, 
#                           epoch=epoch_index)
#     train_bar.update()
  

#   train_state['train_loss'].append(running_loss)
#   train_state['train_acc'].append(running_acc)



#   ## ------------------------------------------------------------------




#   dataset.set_split('val')
#   batch_generator = generate_batches(dataset,
#                                      batch_size = args.batch_size,
#                                      device = args.device)
#   running_loss = 0.
#   running_acc = 0.
#   ## train()과 반대로 파라미터 업데이트 못하게 하고 드롭아웃도 없앰
#   ## 손실 계산 안하고 그래디언트 전파 안함?? 아하아하
#   classifier.eval()

#   for batch_index, batch_dict in enumerate(batch_generator):

#     ## 1. 출력 계산
#     y_pred = classifier(x_in=batch_dict['x_surname'])
#     ## 2. 손실 계산
#     loss = loss_func(y_pred, batch_dict['y_nationality'])
#     loss_t = loss.item()
#     running_loss += (loss_t - running_loss) / (batch_index + 1)
#     ## 3. 정확도 계산
#     acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
#     running_acc += (acc_t - running_acc) / (batch_index + 1)

#     ## 진행바 업데이트
#     val_bar.set_postfix(loss=running_loss,
#                         acc=running_acc,
#                         epoch=epoch_index)
#     val_bar.update()

#   train_state['val_loss'].append(running_loss)
#   train_state['val_acc'].append(running_acc)

#   train_state = update_train_state(args=args, model=classifier,
#                                    train_state=train_state)
  
#   scheduler.step(train_state['val_loss'][-1])

#   train_bar.n = 0
#   val_bar.n = 0
#   epoch_bar.update()

#   if train_state['stop_early']:
#     break

#   train_bar.n = 0
#   val_bar.n = 0
#   epoch_bar.update()



In [12]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)

    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm.notebook.tqdm(desc='training routine', 
                               total=args.num_epochs,
                               position=0)

dataset.set_split('train')
train_bar = tqdm.notebook.tqdm(desc='split=train',
                               total=dataset.get_num_batches(args.batch_size), 
                               position=1, 
                               leave=True)
dataset.set_split('val')
val_bar = tqdm.notebook.tqdm(desc='split=val',
                             total=dataset.get_num_batches(args.batch_size), 
                             position=1, 
                             leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # 훈련 세트에 대한 순회

        # 훈련 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # 훈련 과정은 5단계로 이루어집니다

            # --------------------------------------
            # 단계 1. 그레이디언트를 0으로 초기화합니다
            optimizer.zero_grad()

            # 단계 2. 출력을 계산합니다
            y_pred = classifier(batch_dict['x_surname'])

            # 단계 3. 손실을 계산합니다
            loss = loss_func(y_pred, batch_dict['y_nationality'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # 단계 4. 손실을 사용해 그레이디언트를 계산합니다
            loss.backward()

            # 단계 5. 옵티마이저로 가중치를 업데이트합니다
            optimizer.step()
            # -----------------------------------------

            # 정확도를 계산합니다
            acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # 진행 바 업데이트
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # 검증 세트에 대한 순회

        # 검증 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # 단계 1. 출력을 계산합니다
            y_pred =  classifier(batch_dict['x_surname'])

            # 단계 2. 손실을 계산합니다
            loss = loss_func(y_pred, batch_dict['y_nationality'])
            loss_t = loss.to("cpu").item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # 단계 3. 정확도를 계산합니다
            acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/60 [00:00<?, ?it/s]

split=val:   0%|          | 0/12 [00:00<?, ?it/s]

### 4.2.5 모델 평가와 예측



1. 테스트 데이터로 평가하기

In [13]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산
classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # 출력을 계산합니다
    y_pred =  classifier(batch_dict['x_surname'])
    
    # 손실을 계산합니다
    loss = loss_func(y_pred, batch_dict['y_nationality'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # 정확도를 계산합니다
    acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("테스트 손실: {};".format(train_state['test_loss']))
print("테스트 정확도: {}".format(train_state['test_acc']))

테스트 손실: 1.8054201006889343;
테스트 정확도: 44.85677083333333


2. 새로운 성씨 예측하기

In [14]:
def predict_nationality(surname, classifier, vectorizer):

    vectorized_surname = vectorizer.vectorize(surname)
    vectorized_surname = torch.tensor(vectorized_surname).view(1, -1)
    result = classifier(vectorized_surname, apply_softmax=True)

    probability_values, indices = result.max(dim=1)
    index = indices.item()

    predicted_nationality = vectorizer.nationality_vocab.lookup_index(index)
    probability_value = probability_values.item()

    return {'nationality': predicted_nationality, 'probability': probability_value}

In [15]:

# new_surname = input("분류하려는 성씨를 입력하세요: ")
new_surname = "McMahan"
classifier = classifier.to("cpu")
prediction = predict_nationality(new_surname, classifier, vectorizer)
print("{} -> {} (p={:0.2f})".format(new_surname,
                                    prediction['nationality'],
                                    prediction['probability']))

McMahan -> Irish (p=0.37)


3. 최상위 K개 예측

In [16]:
def predict_topk_nationality(name, classifier, vectorizer, k=5):
    """새로운 성씨에 대한 최상위 K개 국적을 예측합니다
    
    매개변수:
        surname (str): 분류하려는 성씨
        classifier (SurnameClassifer): 분류기 객체
        vectorizer (SurnameVectorizer): SurnameVectorizer 객체
        k (int): the number of top nationalities to return
    반환값:
        딕셔너리 리스트, 각 딕셔너리는 국적과 확률로 구성됩니다.
    """
    vectorized_name = vectorizer.vectorize(name)
    vectorized_name = torch.tensor(vectorized_name).view(1, -1)
    prediction_vector = classifier(vectorized_name, apply_softmax=True)
    probability_values, indices = torch.topk(prediction_vector, k=k)
    
    # 반환되는 크기는 (1,k)입니다
    probability_values = probability_values.detach().numpy()[0]
    indices = indices.detach().numpy()[0]
    
    results = []
    for prob_value, index in zip(probability_values, indices):
        nationality = vectorizer.nationality_vocab.lookup_index(index)
        results.append({'nationality': nationality, 
                        'probability': prob_value})
    
    return results


new_surname = input("분류하려는 성씨를 입력하세요: ")
classifier = classifier.to("cpu")

k = int(input("얼마나 많은 예측을 보고 싶나요? "))
if k > len(vectorizer.nationality_vocab):
    print("앗! 전체 국적 개수보다 큰 값을 입력했습니다. 모든 국적에 대한 예측을 반환합니다. :)")
    k = len(vectorizer.nationality_vocab)
    
predictions = predict_topk_nationality(new_surname, classifier, vectorizer, k=k)

print("최상위 {}개 예측:".format(k))
print("===================")
for prediction in predictions:
    print("{} -> {} (p={:0.2f})".format(new_surname,
                                        prediction['nationality'],
                                        prediction['probability']))

분류하려는 성씨를 입력하세요: hihi
얼마나 많은 예측을 보고 싶나요? 3
최상위 3개 예측:
hihi -> Chinese (p=0.32)
hihi -> Vietnamese (p=0.25)
hihi -> Korean (p=0.19)
