In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

try:
    # Current scikit-learn: cross_validation and grid_search live in model_selection.
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    # Older scikit-learn releases that predate model_selection.
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV

In [2]:
# Human-readable names for the integer class labels used in this notebook:
# index 0 -> 'china', index 1 -> 'japan'.
nations = (
    'china',
    'japan',
)

In [3]:
# Load the Chinese names, trim surrounding whitespace, and label them as class 0.
chn = pd.read_csv('china.csv').assign(
    names=lambda frame: frame['names'].map(lambda name: name.strip()),
    nation=0,
)
chn.head(10)

Unnamed: 0,names,nation
0,Wang,0
1,Li,0
2,Zhang,0
3,Liu,0
4,Chen,0
5,Yang,0
6,Huang,0
7,Zhao,0
8,Wu,0
9,Zhou,0


In [4]:
# Load the Japanese names, trim surrounding whitespace, and label them as class 1.
jpn = pd.read_csv('japan.csv').assign(
    names=lambda frame: frame['names'].map(lambda name: name.strip()),
    nation=1,
)
jpn.head(10)

Unnamed: 0,names,nation
0,Sato,1
1,Suzuki,1
2,Takahashi,1
3,Tanaka,1
4,Watanabe,1
5,Ito,1
6,Nakamura,1
7,Kobayashi,1
8,Yamamoto,1
9,Kato,1


In [5]:
# Stack both name lists into one frame with a fresh 0..n-1 index.
comb_df = pd.concat([chn, jpn], axis=0, ignore_index=True)
# Shuffle the rows with a fixed seed so that the split, the cross-validation
# scores and the displayed outputs are reproducible on Restart & Run All.
# (The previous unseeded np.random.permutation produced a different order on
# every run.) The original index labels are kept, just reordered.
comb_df = comb_df.sample(frac=1, random_state=0)
comb_df.head(10)

Unnamed: 0,names,nation
475,Megu,1
364,Chiharu,1
505,Miwako,1
258,Ota,1
629,Yuka,1
46,Lu,0
114,Cai,0
361,Chie,1
288,Miyasaki,1
530,Naoko,1


In [6]:
# Hold out 30% of the names for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    comb_df['names'],
    comb_df['nation'],
    test_size=0.3,
    random_state=0,
)

In [7]:
class LengthExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each input item to its length."""

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a one-column DataFrame holding ``len(item)`` for every item of X."""
        lengths = list(map(len, X))
        return pd.DataFrame(lengths)

In [8]:
class EndsWithConsonant(BaseEstimator, TransformerMixin):
    """Binary feature: 1 when a name's last character is not a vowel, else 0.

    The vowel check is case-insensitive, so a name whose final letter is an
    upper-case vowel (for example the single-letter name ``'E'``) is treated
    the same as its lower-case form. An empty string yields 0 instead of
    raising ``IndexError``.
    """

    # Characters counted as vowels when inspecting the final letter.
    VOWELS = frozenset('aiueo')

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a one-column DataFrame of 0/1 flags, one row per item of X.

        The frame is built with ``pd.DataFrame(X)``, so it keeps whatever
        index and column label that constructor derives from X.
        """
        def ends_with_consonant(name):
            # Guard the empty string first: there is no last character to test.
            return int(bool(name) and name[-1].lower() not in self.VOWELS)

        # Column-wise Series.map replaces the deprecated DataFrame.applymap.
        return pd.DataFrame(X).apply(lambda column: column.map(ends_with_consonant))

In [9]:
# Character n-gram counts plus the two hand-written name features feed a
# logistic regression classifier.
pipeline = Pipeline(steps=[
    (
        'features',
        FeatureUnion(transformer_list=[
            ('vect', CountVectorizer(analyzer='char')),
            ('namelen', LengthExtractor()),
            ('endswithcons', EndsWithConsonant()),
        ]),
    ),
    ('clf', LogisticRegression()),
])

# Search over the upper n-gram size (2..5) and the regularisation strength C.
parameters = {
    'features__vect__ngram_range': tuple((1, upper) for upper in range(2, 6)),
    'clf__C': [1, 5, 10, 20, 50, 70, 90, 100],
}

# 10-fold cross-validated grid search using all available cores.
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_ra...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'features__vect__ngram_range': ((1, 2), (1, 3), (1, 4), (1, 5)), 'clf__C': [1, 5, 10, 20, 50, 70, 90, 100]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [10]:
# Show the best cross-validated score together with the parameters that produced it.
(grid_search.best_score_, grid_search.best_params_)

(0.96017699115044253, {'clf__C': 1, 'features__vect__ngram_range': (1, 2)})

In [11]:
# Score the grid-searched model on the held-out split: fraction of exact label matches.
test_predictions = grid_search.predict(X_test)
test_accuracy = (test_predictions == y_test).mean()
test_samples = y_test.shape[0]
print('test accuracy: {:.1f}%'.format(test_accuracy * 100))
print('number of test samples: {}'.format(test_samples))

test accuracy: 98.5%
number of test samples: 194


In [12]:
# Map the first 20 held-out names to their predicted nation for a quick eyeball check.
sample_names = X_test[:20]
{
    name: nations[label]
    for name, label in zip(sample_names, grid_search.predict(sample_names))
}

{'Ayako': 'japan',
 'Fang': 'china',
 'Fujii': 'japan',
 'Harada': 'japan',
 'Ikue': 'japan',
 'Kazuko': 'japan',
 'Kikuko': 'japan',
 'Kimiko': 'japan',
 'Lee': 'china',
 'Mika': 'japan',
 'Motoko': 'japan',
 'Nishimura': 'japan',
 'Qiang': 'china',
 'Qin': 'china',
 'Rumi': 'japan',
 'Takada': 'japan',
 'Takata': 'japan',
 'Wen': 'china',
 'Yukie': 'japan',
 'Yurika': 'japan'}

In [13]:
# Final model: same feature set as the searched pipeline, trained on ALL names.
# NOTE(review): ngram_range=(1, 3) and C=50 differ from the grid search's
# reported best_params_ ((1, 2) and C=1) -- confirm this choice is intentional.
final = Pipeline(steps=[
    (
        'features',
        FeatureUnion(transformer_list=[
            ('vect', CountVectorizer(analyzer='char', ngram_range=(1, 3))),
            ('namelen', LengthExtractor()),
            ('endswithcons', EndsWithConsonant()),
        ]),
    ),
    ('clf', LogisticRegression(C=50)),
])

final.fit(comb_df['names'], comb_df['nation'])

Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_ra...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [14]:
# corner cases: inui, nam, ken
# predict() returns an array with one label per input name. Take the single
# element before indexing the tuple: indexing a tuple with an ndarray is
# rejected by current NumPy (TypeError) and only worked via a deprecated
# implicit conversion.
nations[final.predict(['Nam'])[0]]

  from ipykernel import kernelapp as app


'china'

In [15]:
import pickle

# Persist the fitted pipeline to disk using pickle protocol 2.
# Loading it back requires the custom transformer classes used in the pipeline
# to be importable/defined in the loading process.
with open('clf.pkl', 'wb+') as model_file:
    pickle.dump(final, model_file, protocol=2)

In [16]:
# Round-trip check: reload the pickled pipeline and classify one name.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('clf.pkl', 'rb') as f:
    clf = pickle.load(f)
# predict() returns an array with one label per input; take the single element
# before indexing the tuple (indexing a tuple with an ndarray raises TypeError
# on current NumPy).
nations[clf.predict(['Inui'])[0]]

  app.launch_new_instance()


'japan'