# Prepare Data

In [73]:
from sklearn.utils import shuffle
import glob
import os
import re
import pprint
import pandas as pd
import unicodedata
import string


ALL_LETTERS = string.ascii_letters + " .,;'"
TRAIN_TEST_RATIO = 0.75
RANDOM_STATE = 10

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn' and c in ALL_LETTERS
    )

def load_surnames():
    df_surnames = pd.DataFrame()
    list_ = []

    for filename in glob.glob('data/names/*.txt'):
        m = re.match(r'(.*)\/(.*?)\.txt', filename)
        category = m.group(2)
        df = pd.read_csv(filename,names=['surname'])
        df['category'] = category
        list_.append(df)
    df_surnames = pd.concat(list_)   
    df_surnames['normalized'] = df_surnames['surname'].apply(lambda x: unicode_to_ascii(x))
    
    series_categories = df_surnames.groupby(['category'])['category'].count()
    df_categories = pd.DataFrame({'category':series_categories.index, 'freq':series_categories.tolist(), 'index':range(0,len(series_categories))})
    
    return df_surnames, df_categories

def save_df_surnames_as_pickle():
    df_surnames, df_categories = load_surnames()
    # train test split
    df = shuffle(df_surnames, random_state=RANDOM_STATE)
    train_cnt = int(df['surname'].count()*TRAIN_TEST_RATIO)
    train = df[0:train_cnt]
    test = df[train_cnt+1:]
    # save as pickle
    df_surnames.to_pickle('data/pickles/df_surnames.pickle',compression='bz2')
    df_categories.to_pickle('data/pickles/df_categories.pickle',compression='bz2')
    train.to_pickle('data/pickles/train.pickle',compression='bz2')
    test.to_pickle('data/pickles/test.pickle',compression='bz2')
    # train test stat  
    t1 = train.groupby(['category']).count().drop(['normalized'],axis=1)
    t2 = test.groupby(['category']).count().drop(['normalized'],axis=1)
    t1.columns = ['surname_train']
    t2.columns = ['surname_test']
    tt = pd.DataFrame(pd.merge(t1, t2, left_index=True, right_index=True))
    tt['ratio'] = tt['surname_train'] / (tt['surname_train'] + tt['surname_test'])
    tt.to_pickle('data/pickles/train_test_stat.pickle',compression='bz2')
    return tt

In [74]:
train_test_stat = save_df_surnames_as_pickle()

# Check Train/Test Distribution

In [76]:
train_test_stat

Unnamed: 0_level_0,surname_train,surname_test,ratio
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Arabic,1478,521,0.73937
Chinese,198,70,0.738806
Czech,405,114,0.780347
Dutch,229,68,0.771044
English,2723,945,0.742366
French,221,56,0.797834
German,551,173,0.76105
Greek,148,55,0.729064
Irish,174,58,0.75
Italian,525,184,0.74048
