## Build the model

The architecture follows [Lee et al. (2017)](https://www.ijcai.org/proceedings/2017/0289.pdf):

<img src="images/architecture.png" alt="drawing" width="450"/>

In [2]:
import os
import numpy as np
import pandas as pd
import pickle
import gensim

from nltk import ngrams
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Flatten, LSTM, Dense
from keras.layers.merge import concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.metrics import top_k_categorical_accuracy

from sklearn.model_selection import train_test_split
from itertools import chain

from gensim.models import Word2Vec

In [3]:
def initializer(sequences, ngram2idx, emb_dim):
    """Build an initial embedding matrix from Word2Vec vectors.

    A Word2Vec model is trained on ``sequences`` (every index is converted
    to its string form so it can act as a token), and the learned vector of
    each index listed in ``ngram2idx`` is copied into the matching row.

    Parameters
    ----------
    sequences : iterable of iterables
        One index sequence per name; each element is passed through ``str``
        before training.
    ngram2idx : dict
        Mapping from n-gram to integer index. Only the values are used. The
        result has ``len(ngram2idx)`` rows, so the indices are assumed to lie
        in ``range(len(ngram2idx))`` -- TODO confirm against the code that
        builds these maps.
    emb_dim : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(ngram2idx), emb_dim)``. Rows whose
        index never occurs in ``sequences`` are left as zeros.
    """
    tokenized = [[str(idx) for idx in name] for name in sequences]
    # NOTE(review): `size` / `iter` are the gensim < 4.0 keyword names;
    # gensim 4 renamed them to `vector_size` / `epochs`.
    model = Word2Vec(tokenized, size=emb_dim, window=5, min_count=0, iter=10)
    # Look vectors up through `model.wv`: indexing the model object itself is
    # deprecated in gensim 3.x and removed in 4.x.
    vectors = model.wv
    init = np.zeros((len(ngram2idx), emb_dim), dtype=np.float32)
    for idx in ngram2idx.values():
        token = str(idx)
        # An index that was never seen during training (for instance one
        # reserved for padding) has no vector; keep its zero row instead of
        # failing with a KeyError.
        if token in vectors:
            init[idx] = vectors[token]
    return init


def top_k_mod(y_true, y_pred, k=3):
    """Top-k categorical accuracy with ``k`` defaulting to 3.

    Thin wrapper that forwards ``y_true``, ``y_pred`` and ``k`` unchanged to
    ``top_k_categorical_accuracy``.
    """
    # Presumably defined so it can be handed to Keras as a metric with k
    # fixed at 3 -- confirm at the model's compile call.
    return top_k_categorical_accuracy(y_true, y_pred, k=k)

In [5]:
# Directory that holds the pickled lookup tables read below.
data_dir = 'data_processed/'


def _load_pickle(filename):
    """Unpickle ``filename`` from ``data_dir``, closing the file afterwards.

    SECURITY: ``pickle.load`` can execute arbitrary code, so only point this
    at files you produced yourself or otherwise trust.
    """
    with open(os.path.join(data_dir, filename), 'rb') as handle:
        return pickle.load(handle)


name2country = _load_pickle('name2country.p')
country2idx = _load_pickle('country2idx.p')
unigram2idx = _load_pickle('unigram2idx.p')
bigram2idx = _load_pickle('bigram2idx.p')
trigram2idx = _load_pickle('trigram2idx.p')

In [8]:
# The names and their countries must stay aligned in a fixed order, so take
# the (name, country) pairs out of the dict and sort them by name.
tmp = sorted(name2country.items(), key=lambda pair: pair[0])
all_names = [name for name, _ in tmp]
all_countries = [country for _, country in tmp]

In [9]:
# Character n-gram corpora for n = 1, 2, 3: for every name, the list of its
# n-grams, each joined back into a plain string.
unig_corpus, bigr_corpus, trig_corpus = (
    [[''.join(gram) for gram in ngrams(name, n)] for name in all_names]
    for n in (1, 2, 3)
)

In [13]:
# Replace every n-gram by its integer index from the matching lookup table.
# A gram missing from its table raises KeyError.
unig_seq = [[unigram2idx[token] for token in grams]
            for grams in unig_corpus]
bigr_seq = [[bigram2idx[token] for token in grams]
            for grams in bigr_corpus]
trig_seq = [[trigram2idx[token] for token in grams]
            for grams in trig_corpus]

In [15]:
# Every name is represented by exactly 30 positions. With the Keras defaults
# of pad_sequences, shorter sequences are left-padded with 0 and longer ones
# are truncated from the front.
# NOTE(review): the padding value is 0 -- confirm that index 0 is not
# assigned to a real n-gram in the *2idx lookup tables.
MAX_SEQUENCE_LENGTH = 30
# pad_sequences already returns a 2-D array of shape (n_names, maxlen), so
# wrapping the result in np.vstack only made a redundant copy.
unig_X = pad_sequences(unig_seq, maxlen=MAX_SEQUENCE_LENGTH)
bigr_X = pad_sequences(bigr_seq, maxlen=MAX_SEQUENCE_LENGTH)
trig_X = pad_sequences(trig_seq, maxlen=MAX_SEQUENCE_LENGTH)