In [111]:
from argparse import Namespace
from collections import Counter
import json
import os
import string

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from tqdm.notebook import tqdm

In [112]:
class Vocabulary(object):

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):

        if token_to_idx is None:
            token_to_idx = {}

        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx:token for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token

        self.unk_index = -1

        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        cls(**contents)

    def add_token(self, token):

        try:
            index = self._token_to_idx(token)
        except:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):

        return {self.add_token(token) for token in tokens}

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)







In [113]:
class SurnameVectorizer(object):

    def __init__(self, surname_vocab, nationality_vocab):

        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab

    def vectorize(self, surname:str):
        vocab = self.surname_vocab
        one_hot = np.zeros(len(vocab), dtype=np.float32)
        for token in surname:
           one_hot[vocab.lookup_token(token)] = 1

        return one_hot

    @classmethod
    def from_dataframe(cls, surname_df):
        surname_vocab = Vocabulary(unk_token="@")
        nationality_vocab = Vocabulary(add_unk=False)

        for index, row in surname_df.iterrows():
            for letter in row.surname:
                surname_vocab.add_token(letter)
            nationality_vocab.add_token(row.nationality)

        return cls(surname_vocab, nationality_vocab)

    @classmethod
    def from_serializable(cls, contents):
        surname_vocab = Vocabulary.from_serializable(contents['surname_vocab'])
        nationality_vocab = Vocabulary.from_serializable(contents['nationality_vocab'])
        return cls(surname_vocab, nationality_vocab)


    def to_serializable(self):
        return {'surname_vocab': self.surname_vocab.to_serializable(),
                'nationality_vocab': self.nationality_vocab.to_serializable()}



In [114]:
class SurnameDataset(Dataset):
    def __init__(self, surname_df, vectorizer):

        self.surname_df = surname_df
        self._vectorizer = vectorizer

        self.train_df = self.surname_df[self.surname_df.split == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.surname_df[self.surname_df.split == 'val']
        self.validation_size = len(self.val_df)

        self.test_df = self.surname_df[self.surname_df.split == 'test']

        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

        class_counts = surname_df.nationality.value_counts().to_dict() #每个国家的count

        def sort_key(item): #sorted by vocabulary index
            return self._vectorizer.nationality_vocab.lookup_token(item[0]) #return index

        self.sorted_class = sorted(class_counts.items(), key=sort_key) #sort 每个国家的count 根据vocabulary 的index
        frequencies = [count for _, count in self.sorted_class]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32) #每个国家的占比

    @classmethod
    def load_dataset_and_make_vectorizer(cls, surname_csv):
        surname_df = pd.read_csv(surname_csv)
        train_surname_df = surname_df[surname_df.split == 'train']
        return cls(surname_df, SurnameVectorizer.from_dataframe(train_surname_df))
    @classmethod
    def load_dataset_and_load_vectorizer(cls, surname_csv, vectorizer_filepath):
        surname_df = pd.read_csv(surname_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(surname_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):

        with open(vectorizer_filepath) as fp:
            return SurnameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):

        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(),fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split='train'):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):

        row = self._target_df.iloc[index]

        surname_vector = self._vectorizer.vectorize(row.surname)
        nationality_index = self._vectorizer.nationality_vocab.lookup_token(row.nationality)

        return {'x_surname':surname_vector, 'y_nationality': nationality_index}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):

    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict



The Model

In [115]:
class SurnameClassifier(nn.Module):

    def __init__(self, input_dim, hidden_dim, out_dim):
        super(SurnameClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, x_in, apply_softmax=False):
        intermediate_vector = self.fc1(x_in)
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector



### Training Loop

#### Helper function

In [116]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step':0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    elif train_state['epoch_index'] >= 1:
        """
        loss变差early stoping stop 增加
        loss减少， 如果低于early stopping best val 保存模型 重置early stopping step
        loss变差一定次数就停止
        """

        loss_tm1, loss_t = train_state['val_loss'][-2:] #最后两个

        if loss_t >= train_state['early_stopping_best_val']: #if loss  worsened
            #update step
            train_state['early_stopping_step'] += 1
        else: #loss decrease
            if loss_t <= train_state['early_stopping_best_val']: #save the best model
                torch.save(model.state_dict(), train_state['model_filename'])

            train_state['early_stopping_step'] = 0 #reset early stopping step

        #stop early? 如果变差一定次数就停止
        train_state['stop_early'] = train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):

    _, y_pred_indices = y_pred.max(dim=1) #第一个是value 第二个是index
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100






##### general utility

In [117]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)


In [118]:
args =Namespace(
    surname_csv="surnames_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="/home/hc/TORCH_TUTOR/surname",
    hidden_dim=300,
    seed=1337,
    num_epochs=100,
    early_stopping_criteria=5,
    learning_rate=0.001,
    batch_size=64,
    cuda=False,
    reload_from_file=False,
    expand_filepaths_to_save_dir=True,
)
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir, args.model_state_file)

    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

#check cuda
if torch.cuda.is_available():
    args.cuda = True


args.device = torch.device("cuda" if args.cuda else "cpu" )
handle_dirs(args.save_dir)

set_seed_everywhere(args.seed, args.cuda)

Expanded filepaths: 
	/home/hc/TORCH_TUTOR/surname/vectorizer.json
	/home/hc/TORCH_TUTOR/surname/model.pth


### Initializations

In [119]:
if args.reload_from_file:
    print("Reloading")
    dataset = SurnameDataset.load_dataset_and_load_vectorizer(args.surname_csv, args.vectorizer_file)
else:
    print("Creating fresh")
    dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.surname_csv)
    dataset.save_vectorizer(args.vectorizer_file)

vectorizer = dataset.get_vectorizer()
classifier = SurnameClassifier(input_dim=len(vectorizer.surname_vocab), hidden_dim=args.hidden_dim, out_dim=len(vectorizer.nationality_vocab))

Creating fresh


### Training Loop


In [125]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)


print(dataset.class_weights)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', total=args.num_epochs,position=0)

dataset.set_split('train')

train_bar

[('Arabic', 1603), ('Chinese', 220), ('Czech', 414), ('Dutch', 236), ('English', 2972), ('French', 229), ('German', 576), ('Greek', 156), ('Irish', 183), ('Italian', 600), ('Japanese', 775), ('Korean', 77), ('Polish', 120), ('Portuguese', 55), ('Russian', 2373), ('Scottish', 75), ('Spanish', 258), ('Vietnamese', 58)]
tensor([0.0006, 0.0045, 0.0024, 0.0042, 0.0003, 0.0044, 0.0017, 0.0064, 0.0055,
        0.0017, 0.0013, 0.0130, 0.0083, 0.0182, 0.0004, 0.0133, 0.0039, 0.0172],
       device='cuda:0')


In [85]:

data = pd.read_csv("surnames_with_splits.csv")
class_count = data.nationality.value_counts().to_dict()
vectorizer = SurnameVectorizer.from_dataframe(data)
vectorizer.nationality_vocab
def sort_key(item):
    return vectorizer.nationality_vocab.lookup_token(item[0])
sort_count = sorted(class_count.items(),key=sort_key)

frequenies = [count for _, count in sort_count]
1/ torch.tensor(frequenies, dtype=torch.float32)

tensor([0.0006, 0.0045, 0.0024, 0.0042, 0.0003, 0.0044, 0.0017, 0.0064, 0.0055,
        0.0017, 0.0013, 0.0130, 0.0083, 0.0182, 0.0004, 0.0133, 0.0039, 0.0172])

In [49]:
vectorizer.nationality_vocab._idx_to_token.items()

dict_items([(0, 'Arabic'), (1, 'Chinese'), (2, 'Czech'), (3, 'Dutch'), (4, 'English'), (5, 'French'), (6, 'German'), (7, 'Greek'), (8, 'Irish'), (9, 'Italian'), (10, 'Japanese'), (11, 'Korean'), (12, 'Polish'), (13, 'Portuguese'), (14, 'Russian'), (15, 'Scottish'), (16, 'Spanish'), (17, 'Vietnamese'), (18, 'Vietnamese')])

In [42]:
def sort_key(item):
    return vectorizer.nationality_vocab.lookup_token(item[0])
sorted(class_count.items(),key=sort_key)


[('Arabic', 1603),
 ('Chinese', 220),
 ('Czech', 414),
 ('Dutch', 236),
 ('English', 2972),
 ('French', 229),
 ('German', 576),
 ('Greek', 156),
 ('Irish', 183),
 ('Italian', 600),
 ('Japanese', 775),
 ('Korean', 77),
 ('Polish', 120),
 ('Portuguese', 55),
 ('Russian', 2373),
 ('Scottish', 75),
 ('Spanish', 258),
 ('Vietnamese', 58)]

In [126]:
batch_generator = generate_batches(dataset=dataset, batch_size=4,shuffle=True,drop_last=True)
for batch_index, batch_dict in enumerate(batch_generator):
    print(batch_index)

IndexError: index 77 is out of bounds for axis 0 with size 77