In [1]:
import pandas as pd

df = pd.read_csv('train3.csv')

df.shape

(110692, 2)

In [2]:
df.sample(100)

Unnamed: 0,Names,Nationality
89811,Sasalak Haiprakhon,Thai
64915,Hsin Ping,Taiwanese
72147,Pyotra Krecheuski,Belarusian
109035,Mathews Phosa,Sotho
46349,Kristína Royová,Slovak
...,...,...
104211,Erdenebatyn Bekhbayar,Mongolian
54689,Park Sa-rang,Korean
17767,Géza Gyimóthy,Hungarian
89320,Mawin Maneewong,Thai


In [3]:
from sklearn import preprocessing

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Encode labels in column 'Nationality'.
df['Nationality']= label_encoder.fit_transform(df['Nationality'])

In [4]:
# define input
X = df.Names
# define labels
y = df.Nationality

In [5]:
from sklearn.model_selection import train_test_split
# set aside 10% of the dataset for test
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.1)
# inspect resulting shapes
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((99622,), (99622,), (11070,), (11070,))

In [6]:
#NUMPY

from __future__ import print_function
import inflect
import re



_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')


def _remove_commas(m):
    return m.group(1).replace(',', '')


def _expand_decimal_point(m):
    return m.group(1).replace('.', ' point ')


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'    # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        return '%s %s' % (dollars, dollar_unit)
    elif cents:
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s' % (cents, cent_unit)
    else:
        return 'zero dollars'


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return 'two thousand'
        elif num > 2000 and num < 2010:
            return 'two thousand ' + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        else:
            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r'\1 pounds', text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text

In [7]:
#G2P

f = open("homograph.txt")
homograph = dict()
for line in f.readlines()[10:]:
  if line.startswith("#"):continue
  l = line.split("|")
  print(l)
  homograph[l[0].lower()] = (l[1].split(), l[2].split(), l[3].replace("\\\n", ""))
print(len(homograph))

['ABSENT', 'AH1 B S AE1 N T', 'AE1 B S AH0 N T', 'V\\\n']
['ABSTRACT', 'AE0 B S T R AE1 K T', 'AE1 B S T R AE2 K T', 'V\\\n']
['ABSTRACTS', 'AE0 B S T R AE1 K T S', 'AE1 B S T R AE0 K T S', 'V\\\n']
['ABUSE', 'AH0 B Y UW1 Z', 'AH0 B Y UW1 S', 'V\\\n']
['ABUSES', 'AH0 B Y UW1 Z IH0 Z', 'AH0 B Y UW1 S IH0 Z', 'V\\\n']
['ACCENT', 'AH0 K S EH1 N T', 'AE1 K S EH2 N T', 'V\\\n']
['ACCENTS', 'AE1 K S EH0 N T S', 'AE1 K S EH0 N T S', 'V\\\n']
['ADDICT', 'AH0 D IH1 K T', 'AE1 D IH2 K T', 'V\\\n']
['ADDICTS', 'AH0 D IH1 K T S', 'AE1 D IH2 K T S', 'V\\\n']
['ADVOCATE', 'AE1 D V AH0 K EY2 T', 'AE1 D V AH0 K AH0 T', 'V\\\n']
['ADVOCATES', 'AE1 D V AH0 K EY2 T S', 'AE1 D V AH0 K AH0 T S', 'V\\\n']
['AFFECT', 'AH0 F EH1 K T', 'AE1 F EH0 K T', 'V\\\n']
['AFFECTS', 'AH0 F EH1 K T S', 'AE1 F EH0 K T S', 'V\\\n']
['AFFIX', 'AH0 F IH1 K S', 'AE1 F IH0 K S', 'V\\\n']
['AFFIXES', 'AH0 F IH1 K S IH0 Z', 'AE1 F IH0 K S IH0 Z', 'V\\\n']
['AGGLOMERATE', 'AH0 G L AA1 M ER0 EY2 T', 'AH0 G L AA1 M ER0 AH0 T', 'V\\

In [8]:
from nltk import pos_tag
from nltk.corpus import cmudict
import nltk
from nltk.tokenize import TweetTokenizer
word_tokenize = TweetTokenizer().tokenize
import numpy as np
import codecs
import re
import os
import unicodedata
from builtins import str as unicode

try:
    nltk.data.find('taggers/averaged_perceptron_tagger.zip')
except LookupError:
    nltk.download('averaged_perceptron_tagger')
try:
    nltk.data.find('corpora/cmudict.zip')
except LookupError:
    nltk.download('cmudict')



def construct_homograph_dictionary():
    homograph2features = dict()
    f = open("homograph.txt")
    for line in f.readlines()[10:]:
        #print(line)
        if line.startswith("#"): continue # comment
        headword, pron1, pron2, pos1 = line.split("|")[0], line.split("|")[1],line.split("|")[2],line.split("|")[3].replace("\\\n", "")
        homograph2features[headword.lower()] = (pron1.split(), pron2.split(), pos1)
    return homograph2features


class G2p(object):
    def __init__(self):
        super().__init__()
        self.graphemes = ["<pad>", "<unk>", "</s>"] + list("abcdefghijklmnopqrstuvwxyz")
        self.phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
                                                             'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
                                                             'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
                                                             'EY2', 'F', 'G', 'HH',
                                                             'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L',
                                                             'M', 'N', 'NG', 'OW0', 'OW1',
                                                             'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
                                                             'UH0', 'UH1', 'UH2', 'UW',
                                                             'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
        self.g2idx = {g: idx for idx, g in enumerate(self.graphemes)}
        self.idx2g = {idx: g for idx, g in enumerate(self.graphemes)}

        self.p2idx = {p: idx for idx, p in enumerate(self.phonemes)}
        self.idx2p = {idx: p for idx, p in enumerate(self.phonemes)}

        self.cmu = cmudict.dict()
        self.load_variables()
        self.homograph2features = construct_homograph_dictionary()

    def load_variables(self):
        self.variables = np.load("checkpoint20.npz")
        self.enc_emb = self.variables["enc_emb"]  # (29, 64). (len(graphemes), emb)
        self.enc_w_ih = self.variables["enc_w_ih"]  # (3*128, 64)
        self.enc_w_hh = self.variables["enc_w_hh"]  # (3*128, 128)
        self.enc_b_ih = self.variables["enc_b_ih"]  # (3*128,)
        self.enc_b_hh = self.variables["enc_b_hh"]  # (3*128,)

        self.dec_emb = self.variables["dec_emb"]  # (74, 64). (len(phonemes), emb)
        self.dec_w_ih = self.variables["dec_w_ih"]  # (3*128, 64)
        self.dec_w_hh = self.variables["dec_w_hh"]  # (3*128, 128)
        self.dec_b_ih = self.variables["dec_b_ih"]  # (3*128,)
        self.dec_b_hh = self.variables["dec_b_hh"]  # (3*128,)
        self.fc_w = self.variables["fc_w"]  # (74, 128)
        self.fc_b = self.variables["fc_b"]  # (74,)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x)) # mathematical logistic function // for the activation of the neural network

    def grucell(self, x, h, w_ih, w_hh, b_ih, b_hh):
        rzn_ih = np.matmul(x, w_ih.T) + b_ih # matrix_product of two array
        rzn_hh = np.matmul(h, w_hh.T) + b_hh

        rz_ih, n_ih = rzn_ih[:, :rzn_ih.shape[-1] * 2 // 3], rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:]
        rz_hh, n_hh = rzn_hh[:, :rzn_hh.shape[-1] * 2 // 3], rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:]

        rz = self.sigmoid(rz_ih + rz_hh)
        r, z = np.split(rz, 2, -1)

        n = np.tanh(n_ih + r * n_hh)
        h = (1 - z) * n + z * h

        return h

    def gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None):
        if h0 is None:
            h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32)
        h = h0  # initial hidden state
        outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32)
        for t in range(steps):
            h = self.grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh)  # (b, h)
            outputs[:, t, ::] = h
        return outputs

    def encode(self, word):
        chars = list(word) + ["</s>"]
        x = [self.g2idx.get(char, self.g2idx["<unk>"]) for char in chars]
        x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0)

        return x

    def predict(self, word):
        # encoder
        enc = self.encode(word)
        enc = self.gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh,
                       self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32))
        last_hidden = enc[:, -1, :]

        # decoder
        dec = np.take(self.dec_emb, [2], axis=0)  # 2: <s>
        h = last_hidden

        preds = []
        for i in range(20):
            h = self.grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh)  # (b, h)
            logits = np.matmul(h, self.fc_w.T) + self.fc_b
            pred = logits.argmax()
            if pred == 3: break  # 3: </s>
            preds.append(pred)
            dec = np.take(self.dec_emb, [pred], axis=0)

        preds = [self.idx2p.get(idx, "<unk>") for idx in preds]
        return preds

    def __call__(self, text):
        # preprocessing
        text = unicode(text)
        text = normalize_numbers(text)
        text = ''.join(char for char in unicodedata.normalize('NFD', text) #Convert the text into its decomposed form
                       if unicodedata.category(char) != 'Mn')  # Strip accents
        text = text.lower()
        text = re.sub("[^ a-z'.,?!\-]", "", text)
        text = text.replace("i.e.", "that is")
        text = text.replace("e.g.", "for example")

        # tokenization
        words = word_tokenize(text)
        tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for word, pos in tokens:
            if re.search("[a-z]", word) is None:
                pron = [word]

            elif word in self.homograph2features:  # Check homograph
                pron1, pron2, pos1 = self.homograph2features[word]
                if pos.startswith(pos1):
                    pron = pron1
                else:
                    pron = pron2
            elif word in self.cmu:  # lookup CMU dict
                pron = self.cmu[word][0]
            else: # predict for oov
                pron = self.predict(word)

            prons.extend(pron)
            prons.extend([" "])

        return prons[:-1]

if __name__ == '__main__':
    texts = ["I have $250 in my pocket.", # number -> spell-out
             "popular pets, e.g. cats and dogs", # e.g. -> for example
             "I refuse to collect the refuse around here.", # homograph
             "I'm an activationist."] # newly coined word
    g2p = G2p()

In [9]:
G2P_names = []
G2P_names_test = []

for i in range(X_test.size):
    name = ''.join(g2p(X_test.tolist()[i]))
    G2P_names_test.append(name)

for i in range(X_train.size):
    name = ''.join(g2p(X_train.tolist()[i]))
    G2P_names.append(name)

In [146]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
# from aion.embeddings.doc2vec import Doc2VecEmbeddings

model = gensim.models.Word2Vec(G2P_names, min_count = 1,vector_size = 
                                50, window = 4, sg=0)

# def tagged_document(list_of_list_of_words):
#    for i, list_of_words in enumerate(list_of_list_of_words):
#       yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])
        
# data_for_training = list(tagged_document(G2P_names))
# print(data_for_training [:2])

# fname = get_tmpfile("my_doc2vec_model")

# model = gensim.models.doc2vec.Doc2Vec(vector_size=50,window=2,alpha = 0.025,epochs=50,min_count=1)
model.build_vocab(G2P_names)
model.train(G2P_names, total_examples=model.corpus_count, epochs=model.epochs)
# model.save(fname)

word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')


# def train_model(epochs, vector_size, window):
#     documents = [TaggedDocument(list(doc), [i]) for i, doc in enumerate(G2P_names)]
#     model = Doc2Vec(documents, epochs=epochs, vector_size=vector_size, window=window, workers=1)
#     return model

# doc2vec_embs = Doc2VecEmbeddings()
# x_train_tokens = doc2vec_embs.build_vocab(documents=G2P_names)
# doc2vec_embs.train(x_train_tokens)

# x_train_t = doc2vec_embs.encode(documents=G2P_names)
# x_test_t = doc2vec_embs.encode(documents=G2P_names_test)

# name_vector = []
# name_vector_test = []

# for i in range(len(G2P_names)):
#     vector = model2.docvecs[i]
#     name_vector.append(vector)
    
# for i in range(len(G2P_names_test)):
#     vector = model2.infer_vector([G2P_names_test[i]])
#     name_vector_test.append(vector)

In [151]:
# model = Doc2Vec.load(fname)
x_train = []
print(wv[60])
for i in range(X_train.shape[0]):
    vector = wv[i]
    x_train.append(vector)
# print(x_train[1])

KeyError: "Key '60' not present"

In [143]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

#using KNN to predict
neigh = KNeighborsClassifier(n_neighbors=31)

# pipeline = Pipeline([('knn', neigh)])
neigh.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=31)

In [144]:
from sklearn.metrics import accuracy_score
# predict gender of test names
x_test_t = []
# print(model.infer_vector([G2P_names_test[0]]))

for i in range(len(G2P_names_test)):
    vector = model.infer_vector([G2P_names_test[i]])
    x_test_t.append(vector)

y_pred = neigh.predict(x_test_t)
n = "".join(g2p('Xiao'))
name_vector2 = model.infer_vector([n])
print(neigh.predict([name_vector2])[0])

# y_pred = pipeline.predict([model.infer_vector([G2P_names_test[1]])])
print(y_pred)

# compute accuracy
accuracy_score(y_test, y_pred)

28
[28 28 28 ... 83 83 28]


0.014453477868112014