## Name classification with Keras

In [26]:
import os
import re
import nltk
import numpy as np
import pickle

from tqdm import tqdm
from bs4 import BeautifulSoup
from random import shuffle
from collections import defaultdict
from nltk import ngrams

In [19]:
# Input: raw Olympic results, one record per line (fields are split on tabs
# further down). Output directory for the pickled artefacts.
raw_data_file = "./data/countryResult.txt"
data_dir = "./data_processed"
# Read inside a context manager so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open(raw_data_file) as raw_file:
    dataset = raw_file.read().strip().split('\n')

In [20]:
# Sanity check: print the number of loaded records, then display the first
# five raw lines as the cell's output.
print(len(dataset))
dataset[:5]

31595


['Belarus\tBeijing 2008\tsilver\t20.28\tAthletics\tNatallia MIKHNEVICH/',
 'Belarus\tVancouver 2010\tsilver\t48:32.0\tBiathlon\tSergey NOVIKOV/',
 'Belarus\tBeijing 2008\tsilver\t8551\tAthletics\tAndrei KRAUCHANKA/',
 'Belarus\tVancouver 2010\tgold\tFINAL\tFreestyle Skiing\tAlexei GRISHIN/',
 'Belarus\tBeijing 2008\tbronze\t81.51\tAthletics\tIvan TSIKHAN/']

In [21]:
# NOTE(review): `remove_chars` is not referenced by `clean_names`; it is kept
# unchanged in case another cell reads it.
remove_chars = [':', '©', '¶']

# Compiled once: `clean_names` is called for every athlete in the dataset.
# Anything that is not an ASCII letter, apostrophe or dot becomes a space.
_NON_NAME_CHARS = re.compile(r"[^a-zA-Z'.]")
_SPACE_RUNS = re.compile(r" +")


def clean_names(name):
    """Normalise a raw athlete name to title-cased ASCII text.

    Steps: extract the text content with BeautifulSoup, replace every
    character other than ASCII letters, ``'`` and ``.`` with a space, collapse
    runs of spaces, strip the ends and title-case the result.

    Args:
        name: Raw name string as found in the results file.

    Returns:
        The cleaned name; may be an empty string if nothing usable remains.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning on every call.
    name_text = BeautifulSoup(name, "html.parser").get_text()
    name_text = _NON_NAME_CHARS.sub(" ", name_text)
    name_text = _SPACE_RUNS.sub(" ", name_text)
    return name_text.strip().title()

In [29]:
# Build two parallel lookups keyed by cleaned athlete name:
#   name2country -> country of the athlete's most recent appearance
#   name2year    -> the games label (e.g. "Beijing 2008") of that appearance
name2country = dict()
name2year = dict()
skipped_lines = 0
for line in tqdm(dataset):
    try:
        country, olympic_year, medal, record, sports, names_raw = line.split('\t')
    except ValueError:
        # The row does not have exactly six tab-separated fields. Skip it:
        # falling through would reuse the previous row's values (or raise
        # NameError on the very first row).
        skipped_lines += 1
        continue
    country = country.replace(',', ' ').strip()
    # In the olympics one has teams (i.e. more than one individual per row);
    # names are '/'-separated with a trailing '/'. A field without any '/'
    # is treated as a single name rather than being dropped.
    names = [n for n in names_raw.split('/') if n != ""]
    for name in names:
        c_name = clean_names(name)
        if not c_name:
            # Nothing usable left after cleaning; do not store an empty key.
            continue
        if c_name in name2country:
            # Some athletes change countries: keep the most recent appearance.
            # The year is the last space-separated token of the games label.
            previous_year = int(name2year[c_name].split(' ')[-1])
            current_year = int(olympic_year.split(' ')[-1])
            if current_year <= previous_year:
                # Older (or same-games) row: the stored entry already wins.
                continue
        name2country[c_name] = country
        name2year[c_name] = olympic_year
if skipped_lines:
    print("Skipped {} malformed line(s)".format(skipped_lines))

100%|██████████| 31595/31595 [01:28<00:00, 355.90it/s]


In [36]:
# Number of distinct cleaned athlete names collected.
len(name2country)

17715

In [37]:
# Display the name -> country mapping for a visual check.
name2country

{'Natallia Mikhnevich': 'Belarus',
 'Sergey Novikov': 'Belarus',
 'Andrei Krauchanka': 'Belarus',
 'Alexei Grishin': 'Belarus',
 'Ivan Tsikhan': 'Belarus',
 'Maryna Shkermankova': 'Belarus',
 'Vadim Devyatovskiy': 'Belarus',
 'Iryna Kulesha': 'Belarus',
 'Aksana Miankova': 'Belarus',
 'Darya Domracheva': 'Belarus',
 'Fernanda Ribeiro': 'Portugal',
 'Rui Silva': 'Portugal',
 'Nelson Evora': 'Portugal',
 'Rosa Mota': 'Portugal',
 'Jose Manuel Gentil Quina': 'Portugal',
 'Mario Gentil Quina': 'Portugal',
 'Armando Da Silva Marques': 'Portugal',
 'Fernando Silva Paes': 'Portugal',
 'Francisco Valadas': 'Portugal',
 'Luiz Silva': 'Portugal',
 'Sergio Paulinho': 'Portugal',
 'Nuno Barreto': 'Portugal',
 'Victor Hugo Rocha': 'Portugal',
 'Emanuel Silva': 'Portugal',
 'Fernando Pimenta': 'Portugal',
 'Domingos De Sousa Coutinho Marques Do Funchal': 'Portugal',
 'Jose Beltrao': 'Portugal',
 'Carlos Lopes': 'Portugal',
 'Francis Obikwelu': 'Portugal',
 "Duarte M.D'Almeida Bello": 'Portugal',
 'F

In [33]:
# Persist the name -> country mapping and a country -> integer index lookup.
# Make sure the output directory exists before writing into it.
os.makedirs(data_dir, exist_ok=True)
# Context managers guarantee each pickle is flushed and its handle closed.
with open(os.path.join(data_dir, 'name2country.p'), 'wb') as fh:
    pickle.dump(name2country, fh)
# Countries are sorted so that the index assignment is reproducible.
country2idx = [(cntr, i) for i, cntr in enumerate(sorted(set(name2country.values())))]
with open(os.path.join(data_dir, 'country2idx.p'), 'wb') as fh:
    pickle.dump(dict(country2idx), fh)

In [34]:
def get_ngram(corpus, n):
    """Collect the distinct n-grams found across all items of ``corpus``.

    Each item is passed to ``ngrams`` and every resulting tuple is joined
    into a single string.

    Args:
        corpus: Iterable of items to extract n-grams from.
        n: Size of the n-grams.

    Returns:
        A list of the unique joined n-grams, in no particular order.
    """
    seen = set()
    for text in corpus:
        seen.update("".join(gram) for gram in ngrams(text, n))
    return list(seen)

In [39]:
# Build sorted character-level vocabularies (1-, 2- and 3-grams) from every
# collected name and pair each entry with its position in the sorted order.
all_names = name2country.keys()
# Joining with a space puts ' ' into the unigram set whenever there is more
# than one name.
unigrams = sorted(set(" ".join(all_names)))
bigrams = get_ngram(all_names, 2)
bigrams.sort()
trigrams = get_ngram(all_names, 3)
trigrams.sort()
unigram2idx = list(zip(unigrams, range(len(unigrams))))
bigram2idx = list(zip(bigrams, range(len(bigrams))))
trigram2idx = list(zip(trigrams, range(len(trigrams))))
print(unigram2idx)

[(' ', 0), ("'", 1), ('.', 2), ('A', 3), ('B', 4), ('C', 5), ('D', 6), ('E', 7), ('F', 8), ('G', 9), ('H', 10), ('I', 11), ('J', 12), ('K', 13), ('L', 14), ('M', 15), ('N', 16), ('O', 17), ('P', 18), ('Q', 19), ('R', 20), ('S', 21), ('T', 22), ('U', 23), ('V', 24), ('W', 25), ('X', 26), ('Y', 27), ('Z', 28), ('a', 29), ('b', 30), ('c', 31), ('d', 32), ('e', 33), ('f', 34), ('g', 35), ('h', 36), ('i', 37), ('j', 38), ('k', 39), ('l', 40), ('m', 41), ('n', 42), ('o', 43), ('p', 44), ('q', 45), ('r', 46), ('s', 47), ('t', 48), ('u', 49), ('v', 50), ('w', 51), ('x', 52), ('y', 53), ('z', 54)]


In [40]:
# Persist the three n-gram -> index lookups as dictionaries.
# Make sure the output directory exists before writing into it.
os.makedirs(data_dir, exist_ok=True)
for pickle_name, pairs in (
    ('unigram2idx.p', unigram2idx),
    ('bigram2idx.p', bigram2idx),
    ('trigram2idx.p', trigram2idx),
):
    # The context manager guarantees the pickle is flushed and the handle closed.
    with open(os.path.join(data_dir, pickle_name), 'wb') as fh:
        pickle.dump(dict(pairs), fh)