In [1]:
import numpy as np
import pandas as pd

from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
thai_characters = [ '', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ', 'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ะ', 'ั', 'า', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'เ', 'แ', 'โ', 'ใ', 'ไ', '็', '่', '้', '๊', '๋', '์']

def get_name_indices(name):
    name_indices= []
    for c in name:
        try:
            i = thai_characters.index(c)
        except ValueError as e:
            i = 999

        name_indices.append(i)
        
    return name_indices

In [3]:
females = open('./data/female.txt', 'r').read().split("\n")
males = open('./data/male.txt', 'r').read().split("\n")

In [4]:
print('female: {} | male: {}'.format(len(females), len(males)))

female: 2946 | male: 1953


In [5]:
females = list(set(females))
males = list(set(males))
print('female: {} | male: {}'.format(len(females), len(males)))

female: 2759 | male: 1870


In [6]:
print('longest name chars female: {} | male: {}'.format(max(map(len, females)), max(map(len, males))))

longest name chars female: 12 | male: 13


In [7]:
X_female = list(map(get_name_indices, females))
X_male = list(map(get_name_indices, males))
Y_female = np.full(len(females), 0).tolist()
Y_male = np.full(len(males), 1).tolist()

In [8]:
X = X_female + X_male
Y = Y_female + Y_male

In [9]:
maxlen = 15
X = pad_sequences(X, padding='post', maxlen=maxlen, value=0)

In [10]:
X[0]

array([ 8, 54, 60, 35, 35, 47, 21, 25, 66,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [11]:
rus = RandomUnderSampler(random_state=420)
X_resampled, y_resampled = rus.fit_resample(X, Y)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=904)

In [299]:
model = Sequential()
model.add(Embedding(input_dim=len(thai_characters), 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(Dropout(0.25))

model.add(Conv1D(filters=256,
                 kernel_size=5,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=4))
model.add(Dropout(0.25))

model.add(Conv1D(filters=128,
                 kernel_size=2,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=1))
model.add(Dropout(0.25))
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_118 (Embedding)    (None, 15, 200)           13400     
_________________________________________________________________
dropout_102 (Dropout)        (None, 15, 200)           0         
_________________________________________________________________
conv1d_157 (Conv1D)          (None, 11, 256)           256256    
_________________________________________________________________
max_pooling1d_115 (MaxPoolin (None, 2, 256)            0         
_________________________________________________________________
dropout_103 (Dropout)        (None, 2, 256)            0         
_________________________________________________________________
conv1d_158 (Conv1D)          (None, 1, 128)            65664     
_________________________________________________________________
max_pooling1d_116 (MaxPoolin (None, 1, 128)            0         
__________

In [42]:
model = Sequential()
model.add(Embedding(input_dim=len(thai_characters), 
                           output_dim=200, 
                           input_length=maxlen))
model.add(Dropout(0.3))

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Flatten())

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 15, 200)           13400     
_________________________________________________________________
dropout_59 (Dropout)         (None, 15, 200)           0         
_________________________________________________________________
dense_59 (Dense)             (None, 15, 256)           51456     
_________________________________________________________________
dropout_60 (Dropout)         (None, 15, 256)           0         
_________________________________________________________________
dense_60 (Dense)             (None, 15, 256)           65792     
_________________________________________________________________
dropout_61 (Dropout)         (None, 15, 256)           0         
_________________________________________________________________
dense_61 (Dense)             (None, 15, 64)            16448     
__________

### Use this model

In [67]:
model = Sequential()
model.add(Embedding(input_dim=len(thai_characters), 
                           output_dim=200, 
                           input_length=maxlen))
model.add(Dropout(0.3))
model.add(LSTM(112, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (None, 15, 200)           13400     
_________________________________________________________________
dropout_71 (Dropout)         (None, 15, 200)           0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 15, 112)           140224    
_________________________________________________________________
dropout_72 (Dropout)         (None, 15, 112)           0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 32)                18560     
_________________________________________________________________
dense_69 (Dense)             (None, 1)                 33        
Total params: 172,217
Trainable params: 172,217
Non-trainable params: 0
_________________________________________________________________


In [68]:
checkpoint = ModelCheckpoint('model-{epoch:03d}-{acc:03f}-{val_acc:03f}-{val_loss:03f}.h5', verbose=1, monitor='val_loss',save_best_only=True, mode='auto')  
model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2, callbacks=[checkpoint])

Train on 2393 samples, validate on 599 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.52647, saving model to model-001-0.594651-0.746244-0.526466.h5
Epoch 2/100

Epoch 00002: val_loss improved from 0.52647 to 0.46100, saving model to model-002-0.739239-0.786311-0.460995.h5
Epoch 3/100

Epoch 00003: val_loss improved from 0.46100 to 0.42695, saving model to model-003-0.768909-0.796327-0.426953.h5
Epoch 4/100

Epoch 00004: val_loss improved from 0.42695 to 0.41762, saving model to model-004-0.783953-0.813022-0.417625.h5
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.41762
Epoch 6/100

Epoch 00006: val_loss improved from 0.41762 to 0.40076, saving model to model-006-0.791057-0.826377-0.400756.h5
Epoch 7/100

Epoch 00007: val_loss improved from 0.40076 to 0.39608, saving model to model-007-0.798997-0.811352-0.396079.h5
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.39608
Epoch 9/100

Epoch 00009: val_loss improved from 0.39608 to 0.38504, saving mod


Epoch 00040: val_loss did not improve from 0.34807
Epoch 41/100

Epoch 00041: val_loss did not improve from 0.34807
Epoch 42/100

Epoch 00042: val_loss did not improve from 0.34807
Epoch 43/100

Epoch 00043: val_loss did not improve from 0.34807
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.34807
Epoch 45/100

Epoch 00045: val_loss did not improve from 0.34807
Epoch 46/100

Epoch 00046: val_loss did not improve from 0.34807
Epoch 47/100

Epoch 00047: val_loss did not improve from 0.34807
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.34807
Epoch 49/100

Epoch 00049: val_loss did not improve from 0.34807
Epoch 50/100

Epoch 00050: val_loss did not improve from 0.34807
Epoch 51/100

Epoch 00051: val_loss did not improve from 0.34807
Epoch 52/100

Epoch 00052: val_loss did not improve from 0.34807
Epoch 53/100

Epoch 00053: val_loss did not improve from 0.34807
Epoch 54/100

Epoch 00054: val_loss did not improve from 0.34807
Epoch 55/100

Epoch 00055: val_loss di


Epoch 00083: val_loss did not improve from 0.34807
Epoch 84/100

Epoch 00084: val_loss did not improve from 0.34807
Epoch 85/100

Epoch 00085: val_loss did not improve from 0.34807
Epoch 86/100

Epoch 00086: val_loss did not improve from 0.34807
Epoch 87/100

Epoch 00087: val_loss did not improve from 0.34807
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.34807
Epoch 89/100

KeyboardInterrupt: 

In [72]:
from keras.models import load_model
model = load_model('model-026-0.849979-0.851419-0.348067.h5')
model.evaluate(X_test, y_test)



[0.39844237913422403, 0.84625668481072]

In [75]:
names = ['ประยุทธ์', 'ประวิตร']
results = model.predict(pad_sequences(list(map(get_name_indices, names)), padding='post', maxlen=maxlen))
result_1d = results.reshape(results.shape[0])
result_names = np.around(result_1d, 10)

In [76]:
labels = ['Female', 'Male']
result_males = []
result_females = []
for i, name in enumerate(names):
    result_class = int(result_names[i] > 0.5)
    print('{}: {} | {:.3f}'.format(name, labels[result_class], result_names[i]))
    
    if result_class == 0:
        result_females.append(name)
    else:
        result_males.append(name)

ประยุทธ์: Male | 0.988
ประวิตร: Male | 0.796


In [77]:
model.save('model.h5')