<a href="https://colab.research.google.com/github/jejae3372/Colab_AI/blob/main/%08Surname_Classifying.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from argparse import Namespace
from collections import Counter
import json
import os
import string
import re
import collections

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm

In [45]:
# Data-preparation settings: input/output CSV paths, split proportions and RNG seed.
args = Namespace(**{
    "raw_dataset_csv": "/content/surnames.csv",
    "proportion_subset_of_train": 0.1,
    "train_proportion": 0.7,
    "val_proportion": 0.15,
    "test_proportion": 0.15,
    "output_munged_csv": "/content/surnames_with_splits.csv",
    "seed": 1337,
})

In [46]:
# Read the raw surname table; the first CSV row supplies the column names.
surnames = pd.read_csv(filepath_or_buffer=args.raw_dataset_csv, header=0)

In [47]:
# Preview the first five rows of the raw table.
surnames.head(n=5)

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [48]:
# Distinct nationality labels present in the raw table.
set(surnames["nationality"])

{'Arabic',
 'Chinese',
 'Czech',
 'Dutch',
 'English',
 'French',
 'German',
 'Greek',
 'Irish',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Scottish',
 'Spanish',
 'Vietnamese'}

In [49]:
# Bucket every row (converted to a plain dict) under its nationality label.
by_nationality = collections.defaultdict(list)
for _, row in surnames.iterrows():
  nationality = row["nationality"]
  by_nationality[nationality].append(row.to_dict())

In [50]:
# Stratified split: for each nationality, shuffle its surnames and label them
# train / val / test according to the configured proportions.
final_list = []
np.random.seed(args.seed)
for _, item_list in sorted(by_nationality.items()):
  # Shuffle a copy so by_nationality keeps its original order. Shuffling in
  # place made this cell non-idempotent: a second run reshuffled the already
  # shuffled lists and produced a different split despite the fixed seed.
  shuffled = list(item_list)
  np.random.shuffle(shuffled)
  n = len(shuffled)
  n_train = int(args.train_proportion*n)
  n_val = int(args.val_proportion*n)
  # No explicit test count: everything after train+val becomes test, so rows
  # left over from the int() truncation are not dropped.

  for position, item in enumerate(shuffled):
    if position < n_train:
      split = 'train'
    elif position < n_train + n_val:
      split = 'val'
    else:
      split = 'test'
    # Emit a new dict rather than tagging the shared one in by_nationality.
    final_list.append({**item, 'split': split})

In [51]:
# One row per surname record, now including the assigned 'split' column.
final_surnames = pd.DataFrame(data=final_list)

In [52]:
# Number of rows assigned to each split.
final_surnames["split"].value_counts()

train    7680
test     1660
val      1640
Name: split, dtype: int64

In [53]:
# Preview the first five rows of the split-annotated table.
final_surnames.head(n=5)

Unnamed: 0,surname,nationality,split
0,Totah,Arabic,train
1,Abboud,Arabic,train
2,Fakhoury,Arabic,train
3,Srour,Arabic,train
4,Sayegh,Arabic,train


In [55]:
# Persist the split-annotated table without writing the DataFrame index.
final_surnames.to_csv(path_or_buf=args.output_munged_csv, index=False)

# Vocabulary

In [None]:
class Vocabulary(object):
  """Two-way mapping between tokens and consecutive integer indices.

  New tokens receive the next free index (the current vocabulary size). When
  `add_unk` is true, `unk_token` is registered at construction time and
  `lookup_token` falls back to its index for tokens that were never added.
  """

  def __init__(self, token_to_idx = None, add_unk = True, unk_token = "<UNK>"):
    """Create a vocabulary, optionally seeded from an existing mapping.

    Args:
      token_to_idx (dict): existing token -> index mapping; empty when None.
      add_unk (bool): whether to register `unk_token`.
      unk_token (str): token used as the unknown-token entry.
    """
    if token_to_idx is None:
      token_to_idx = {}
    # Copy the mapping so the caller's dict (for example the one returned by
    # to_serializable of another instance) is not mutated by add_token.
    self._token_to_idx = dict(token_to_idx)
    self._idx_to_token = {idx: token
                          for token, idx in self._token_to_idx.items()}
    self._add_unk = add_unk
    self._unk_token = unk_token

    # -1 means "no unknown token"; lookup_token tests for unk_index >= 0.
    self.unk_index = -1
    if add_unk:
      self.unk_index = self.add_token(unk_token)

  def to_serializable(self):
    """Return a dict of the constructor arguments describing this vocabulary."""
    return {'token_to_idx': self._token_to_idx,
            'add_unk':self._add_unk,
            'unk_token':self._unk_token}

  @classmethod
  def from_serializable(cls, contents):
    """Rebuild a vocabulary from the dict produced by to_serializable."""
    return cls(**contents)

  def add_token(self, token):
    """Register `token` if it is new and return its index either way."""
    try:
      index = self._token_to_idx[token]
    except KeyError:
      index = len(self._token_to_idx)
      self._token_to_idx[token] = index
      self._idx_to_token[index] = token
    return index

  def add_many(self, tokens):
    """Register every token in `tokens`; return their indices in order."""
    return [self.add_token(token) for token in tokens]

  def lookup_token(self, token):
    """Return the index of `token`.

    Unknown tokens map to `unk_index` when an unknown token was registered;
    otherwise a KeyError is raised.
    """
    if self.unk_index >= 0:
      return self._token_to_idx.get(token, self.unk_index)
    else:
      return self._token_to_idx[token]

  def lookup_index(self, index):
    """Return the token stored at `index`; raise KeyError if there is none."""
    if index not in self._idx_to_token:
      raise KeyError("Vocabulary에 인덱스(%d)가 없습니다." % index)
    return self._idx_to_token[index]

  def __str__(self):
    return "<Vocabulary(size = %d)>" % len(self)

  def __len__(self):
    return len(self._token_to_idx)

# Vectorizer

In [None]:
class SurnameVectorizer(object):
  """Pairs a character vocabulary with a nationality vocabulary and turns
  surnames into fixed-length vectors."""

  def __init__(self, surname_vocab, nationality_vocab):
    """
    Args:
      surname_vocab: vocabulary mapping surname characters to integers.
      nationality_vocab: vocabulary mapping class labels to integers.
    """
    self.surname_vocab = surname_vocab
    self.nationality_vocab = nationality_vocab

  def vectorize(self, surname):
    """Encode `surname` as a collapsed one-hot vector.

    Args:
      surname: iterable of tokens (characters of the surname).

    Returns:
      float32 array of length len(surname_vocab) holding 1 at the index that
      surname_vocab.lookup_token returns for each token, 0 elsewhere. Token
      order and repetition are not encoded.
    """
    one_hot = np.zeros(len(self.surname_vocab), dtype=np.float32)

    for token in surname:
      one_hot[self.surname_vocab.lookup_token(token)] = 1

    return one_hot

  @classmethod
  def from_dataframe(cls, surname_df):
    """Build both vocabularies from the rows of `surname_df`.

    Every character of each row's `surname` is added to the surname
    vocabulary (which uses "@" as its unknown token) and each row's
    `nationality` is added to the nationality vocabulary (no unknown token).
    """
    surname_vocab = Vocabulary(unk_token = "@")
    nationality_vocab = Vocabulary(add_unk = False)

    for _, row in surname_df.iterrows():
      for letter in row.surname:
        surname_vocab.add_token(letter)
      nationality_vocab.add_token(row.nationality)

    return cls(surname_vocab, nationality_vocab)

  @classmethod
  def from_serializable(cls, contents):
    """Rebuild a vectorizer from the dict produced by to_serializable."""
    surname_vocab = Vocabulary.from_serializable(contents['surname_vocab'])
    nationality_vocab = Vocabulary.from_serializable(contents['nationality_vocab'])
    return cls(surname_vocab = surname_vocab, nationality_vocab = nationality_vocab)

  def to_serializable(self):
    """Return a dict holding the serialized form of both vocabularies."""
    return {'surname_vocab' : self.surname_vocab.to_serializable(),
            'nationality_vocab' : self.nationality_vocab.to_serializable()}

# Dataset

In [None]:
class SurnameDataset(Dataset):
  """Dataset over a surname table that has 'surname', 'nationality' and
  'split' columns.

  One split ('train', 'val' or 'test') is active at a time; `len()` and
  indexing operate on the active split. Select it with `set_split`.
  """

  def __init__(self, surname_df, vectorizer):
    """
    Args:
      surname_df (pandas.DataFrame): the full table, all splits included.
      vectorizer: object exposing `vectorize(surname)` and a
        `nationality_vocab` with `lookup_token`.
    """
    self.surname_df = surname_df
    self._vectorizer = vectorizer

    self.train_df = self.surname_df[self.surname_df["split"] == 'train']
    self.train_size = len(self.train_df)

    self.val_df = self.surname_df[self.surname_df["split"] == 'val']
    self.val_size = len(self.val_df)

    self.test_df = self.surname_df[self.surname_df["split"] == 'test']
    self.test_size = len(self.test_df)

    self._lookup_dict = {'train': (self.train_df, self.train_size),
                         'val': (self.val_df, self.val_size),
                         'test' : (self.test_df, self.test_size)}
    self.set_split('train')

    # Class weights: inverse frequency of each nationality over the whole
    # table, ordered by the nationality's index in the vocabulary.
    class_counts = surname_df["nationality"].value_counts().to_dict()
    def sort_key(item):
      return self._vectorizer.nationality_vocab.lookup_token(item[0])
    sorted_counts = sorted(class_counts.items(), key = sort_key)
    frequencies = [count for _, count in sorted_counts]
    self.class_weights = 1.0 / torch.tensor(frequencies, dtype = torch.float32)

  @classmethod
  def load_dataset_and_make_vectorizer(cls, surname_csv):
    """Read `surname_csv` and build a vectorizer from its 'train' rows only.

    Args:
      surname_csv (str): path of the CSV that carries a 'split' column.

    Returns:
      SurnameDataset wrapping the full table and the new vectorizer.
    """
    surname_df = pd.read_csv(surname_csv)
    train_surname_df = surname_df[surname_df["split"] == 'train']
    return cls(surname_df, SurnameVectorizer.from_dataframe(train_surname_df))

  def get_vectorizer(self):
    """Return the vectorizer used by this dataset."""
    return self._vectorizer

  def set_split(self, split = "train"):
    """Make `split` ('train', 'val' or 'test') the active split."""
    self._target_split = split
    self._target_df, self._target_size = self._lookup_dict[split]

  def __len__(self):
    """Number of rows in the active split."""
    return self._target_size

  def __getitem__(self, index):
    """Return the vectorized row at position `index` of the active split.

    Returns:
      dict with 'x_data' (the vectorized surname) and 'y_target' (the
      nationality's index in the nationality vocabulary).
    """
    row = self._target_df.iloc[index]
    surname_vector = \
      self._vectorizer.vectorize(row.surname)
    nationality_index = \
      self._vectorizer.nationality_vocab.lookup_token(row.nationality)

    return {'x_data' : surname_vector,
            'y_target' : nationality_index}

  def get_num_batches(self, batch_size):
    """Number of full batches of `batch_size` in the active split."""
    return len(self) // batch_size

# Classifier

In [None]:
class SurnameClassifier(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim):
    super(SurnameClassifier, self).__init__()
    self.fc1 = nn.Linear