In [6]:
from argparse import Namespace
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

from vocabulary import Vocabulary

%matplotlib inline

# Plot appearance applied to every figure drawn after this cell runs.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (14, 6)

# Characters handed to the surname Vocabulary as its start-of-sequence and
# end-of-sequence markers (see SurnamesVectorizer.fit).
START_TOKEN = "^"
END_TOKEN = "_"

In [2]:
# Notebook-wide settings. Only `surname_csv` is read by the cells visible here;
# `cuda` and `num_epochs` are presumably consumed by later training cells -- TODO confirm.
args = Namespace(
    surname_csv="../data/surnames.csv",
    cuda=False,
    num_epochs=100
)

In [4]:
class RawSurnames(object):
    """Thin wrapper around the raw surnames table read from a delimited text file.

    Attributes:
        data: the full ``pandas.DataFrame`` returned by ``pd.read_csv``.
    """

    def __init__(self, data_path, delimiter=","):
        """Read the table found at ``data_path``.

        Args:
            data_path: anything ``pd.read_csv`` accepts (path or file-like object).
            delimiter: column separator; a comma by default.
        """
        self.data = pd.read_csv(data_path, delimiter=delimiter)

    def get_data(self, filter_to_nationality=None):
        """Return the loaded rows, optionally restricted to some nationalities.

        Args:
            filter_to_nationality: ``None`` for no filtering, a single
                nationality string, or a list-like of nationalities to keep.

        Returns:
            ``self.data`` itself when no filter is given; otherwise the rows
            whose ``nationality`` column matches one of the requested values.
        """
        if filter_to_nationality is None:
            return self.data

        if isinstance(filter_to_nationality, str):
            # Series.isin rejects a bare string with a TypeError, so a single
            # nationality is treated as a one-item filter.
            filter_to_nationality = [filter_to_nationality]

        keep_rows = self.data["nationality"].isin(filter_to_nationality)
        return self.data[keep_rows]

# vectorizer

class SurnamesVectorizer(object):
    """Turns surname/nationality rows into fixed-width integer arrays.

    Holds a vocabulary used to index the elements of each surname, a
    vocabulary used to index nationality labels, and the row width that every
    vectorized surname is padded to.
    """

    def __init__(self, surname_vocab, nationality_vocab, max_seq_length):
        """
        Args:
            surname_vocab: vocabulary used to map a surname to indices.
            nationality_vocab: vocabulary used to map a nationality to an index.
            max_seq_length: width of every vectorized surname row.
        """
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab
        self.max_seq_length = max_seq_length

    def save(self, filename):
        """Write both vocabularies and ``max_seq_length`` to ``filename`` as JSON."""
        vec_dict = {"surname_vocab": self.surname_vocab.get_serializable_contents(),
                    "nationality_vocab": self.nationality_vocab.get_serializable_contents(),
                    'max_seq_length': self.max_seq_length}

        with open(filename, "w") as fp:
            json.dump(vec_dict, fp)

    @classmethod
    def load(cls, filename):
        """Rebuild a vectorizer from a JSON file in the layout written by ``save``."""
        with open(filename, "r") as fp:
            vec_dict = json.load(fp)

        vec_dict["surname_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["surname_vocab"])
        vec_dict["nationality_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["nationality_vocab"])
        return cls(**vec_dict)

    @classmethod
    def fit(cls, surname_df):
        """Build both vocabularies and the row width from ``surname_df``.

        Every row is used, whatever its ``split`` value.

        Args:
            surname_df: frame with ``surname`` and ``nationality`` columns.

        Returns:
            A new vectorizer whose ``max_seq_length`` is the longest surname
            length plus two (room for the start and end tokens that
            ``transform`` asks the vocabulary to include).
        """
        surname_vocab = Vocabulary(use_unks=False,
                                   use_mask=True,
                                   use_start_end=True,
                                   start_token=START_TOKEN,
                                   end_token=END_TOKEN)

        nationality_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)

        longest_surname = 0
        for surname, nationality in zip(surname_df["surname"], surname_df["nationality"]):
            surname_vocab.add_many(surname)
            nationality_vocab.add(nationality)
            longest_surname = max(longest_surname, len(surname))
        max_seq_length = longest_surname + 2

        return cls(surname_vocab, nationality_vocab, max_seq_length)

    @classmethod
    def fit_transform(cls, surname_df, split='train'):
        """Fit on all of ``surname_df`` and vectorize the rows of one split.

        Returns:
            A ``(vectorizer, dataset)`` pair.
        """
        vectorizer = cls.fit(surname_df)
        return vectorizer, vectorizer.transform(surname_df, split)

    def transform(self, surname_df, split='train'):
        """Vectorize the rows of ``surname_df`` whose ``split`` column equals ``split``.

        Args:
            surname_df: frame with ``surname``, ``nationality`` and ``split`` columns.
            split: value of the ``split`` column to select.

        Returns:
            A ``VectorizedSurnames`` built from an int64 array of shape
            ``(n_rows, max_seq_length)`` and an int64 array of ``n_rows``
            nationality indices. Positions past the end of a surname stay 0.

        Raises:
            ValueError: if a vectorized surname is longer than ``max_seq_length``.
        """
        # Rows are addressed by position, so the frame's own index (and any
        # column that happens to be called "index") is never touched.
        selected = surname_df[surname_df["split"] == split]
        n_data = len(selected)

        x_surnames = np.zeros((n_data, self.max_seq_length), dtype=np.int64)
        y_nationalities = np.zeros(n_data, dtype=np.int64)

        pairs = zip(selected["surname"], selected["nationality"])
        for position, (surname, nationality) in enumerate(pairs):
            vectorized_surname = list(self.surname_vocab.map(surname,
                                                             include_start_end=True))
            if len(vectorized_surname) > self.max_seq_length:
                raise ValueError(
                    "surname {!r} needs {} positions but max_seq_length is {}".format(
                        surname, len(vectorized_surname), self.max_seq_length))
            x_surnames[position, :len(vectorized_surname)] = vectorized_surname
            y_nationalities[position] = self.nationality_vocab[nationality]

        return VectorizedSurnames(x_surnames, y_nationalities)

# vec data


class VectorizedSurnames(Dataset):
    """Dataset pairing vectorized surnames with their nationality indices."""

    def __init__(self, x_surnames, y_nationalities):
        """
        Args:
            x_surnames: indexable collection of vectorized surname rows.
            y_nationalities: indexable collection of nationality indices,
                aligned with ``x_surnames``.
        """
        self.x_surnames = x_surnames
        self.y_nationalities = y_nationalities

    def __len__(self):
        """Number of examples, taken from ``x_surnames``."""
        return len(self.x_surnames)

    def __getitem__(self, index):
        """Return one example as a dict.

        Keys:
            x_surnames: the surname row at ``index``.
            y_nationalities: the nationality entry at ``index``.
            x_lengths: how many entries of the surname row are non-zero.
        """
        surname_row = self.x_surnames[index]
        nonzero_positions = surname_row.nonzero()[0]
        return {
            'x_surnames': surname_row,
            'y_nationalities': self.y_nationalities[index],
            'x_lengths': len(nonzero_positions),
        }

# data generator

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    Generator wrapping the PyTorch DataLoader. Each batch is yielded as a
      dict in which every tensor has been moved to the right device.

    Args:
        dataset: dataset whose items are dicts, as accepted by DataLoader.
        batch_size: number of examples per batch.
        shuffle: forwarded to DataLoader.
        drop_last: forwarded to DataLoader.
        device: target passed to each tensor's ``.to``.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {field: values.to(device) for field, values in batch.items()}

In [8]:
# Load every row of the surnames CSV (no nationality filter).
raw_data = RawSurnames(args.surname_csv).get_data()

# Vocabularies and max sequence length are fitted on the whole table,
# i.e. on the 'train' and 'test' rows together.
vectorizer = SurnamesVectorizer.fit(raw_data)

# One vectorized dataset per value of the `split` column.
train_dataset = vectorizer.transform(raw_data, split='train')
test_dataset = vectorizer.transform(raw_data, split='test')

## Tasks

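1. Embed each vectorized surname (a padded sequence of character indices).
2. Apply a convolutional network (convnet) to the embedded surnames.
3. Compute the prediction vector over nationalities.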



In [15]:
# Model hyperparameters; `embedding_dim` is the size of each embedding
# vector produced by the nn.Embedding layer built in a later cell.
hyperparams = Namespace(
    embedding_dim=64
)

In [19]:
# Pull a single batch of 8 training examples to experiment with.
# generate_batches shuffles by default and no seed is set in the cells shown,
# so the batch contents can differ from run to run.
batch_gen = generate_batches(train_dataset, batch_size=8)
batch_dict = next(batch_gen)

In [17]:
# One embedding vector per entry of the surname vocabulary.
# padding_idx=0 presumes index 0 is the vocabulary's mask token -- the decoded
# example further down shows 0 rendering as <MASK>, but confirm against Vocabulary.
embeddings = nn.Embedding(num_embeddings=len(vectorizer.surname_vocab), 
                          embedding_dim=hyperparams.embedding_dim, 
                          padding_idx=0)

In [20]:
# Sanity check: the result should be (batch_size, max_seq_length, embedding_dim).
embeddings(batch_dict['x_surnames']).shape

torch.Size([8, 22, 64])

In [14]:
# Inspect one vectorized surname: a row of integer indices, zero-filled on the right.
train_dataset[1000]['x_surnames']

array([ 1, 27, 26, 37, 25, 20,  5, 20, 18,  2,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0])

In [13]:
"".join(list(vectorizer.surname_vocab.lookup_many(train_dataset[1000]['x_surnames'])))

'^Ingledew_<MASK><MASK><MASK><MASK><MASK><MASK><MASK><MASK><MASK><MASK><MASK><MASK>'