In [160]:
import keras
from keras.layers import Input, Dropout, Dense
from keras.models import Model, Sequential, load_model
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPool2D
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import random
import numpy as np
import pandas as pd
from sklearn import metrics
from pandas import DataFrame, Series
import pickle

In [150]:
def randomShuffle(X, Y):
    idx = [t for t in range(X.shape[0])]
    random.shuffle(idx)
    X = X[idx]
    Y = Y[idx]
    print()
    print('-' * 36)
    print('dimension of X after synthesis:', X.shape)
    print('dimension of Y after synthesis', Y.shape)
    print('label after shuffle:', '\n', DataFrame(Y).head())
    print('-' * 36)
    return X, Y

def aucJ(true_labels, predictions):
    
    fpr, tpr, thresholds = metrics.roc_curve(true_labels, predictions, pos_label=1)
    auc = metrics.auc(fpr,tpr)

    return auc

def acc(true, pred):
    
    return np.sum(true == pred) * 1.0 / len(true)


def assess(model, X, label, thre = 0.5):
    
    threshold = thre
    
    pred = model.predict(X)
    pred = pred.flatten()
    
    pred[pred > threshold] = 1
    pred[pred <= threshold] = 0
    
    auc = aucJ(label, pred)
    accuracy = acc(label, pred)
    
    print('auc: ', auc)
    print('accuracy: ', accuracy)
    
def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

## load_data and preprocess

In [110]:
names = pd.read_csv('names.csv', header = None, encoding='gbk')
non_names = pd.read_csv('non_names.csv', header = None, encoding='gbk')

pos_label= np.ones(len(names))
neg_label= np.zeros(len(non_names))

data = np.array(pd.concat((names, non_names)))
label = np.hstack((pos_label, neg_label))

data, label = randomShuffle(data, label)


------------------------------------
dimension of X after synthesis: (1213954, 1)
dimension of Y after synthesis (1213954,)
label after shuffle: 
      0
0  1.0
1  1.0
2  1.0
3  1.0
4  1.0
------------------------------------


In [111]:
corpus = []
for i in range(data.shape[0]):
    corpus.extend(list(data[i][0]))
    
str2num = LabelEncoder().fit(corpus)
corpus = str2num.transform(corpus)
num2one = OneHotEncoder().fit(corpus.reshape(-1, 1))

In [112]:
Max_length = Series(data.flatten()).str.len().max()

In [113]:
data_t = Series(data.flatten()).str.ljust(Max_length, fillchar='|')

In [114]:
chars = sorted(list(set(''.join(data_t.values.flatten()))))

In [115]:
char_idx = dict((c, i) for i, c in enumerate(chars))
idx_char = dict((i, c) for i, c in enumerate(chars))

In [116]:
def dimX(X, ts, char_idx, chars):
    
    temp = np.zeros((len(X), ts, len(chars)), dtype=np.bool)
    for i, c in enumerate(X):
        for j, s in enumerate(c):
            # print i, j, s
            temp[i, j, char_idx[s]] = 1
            
    return np.array(temp)

In [117]:
X = dimX(data_t, Max_length, char_idx, chars)
Y = label

In [118]:
train_X = X[:1000000, :, :, np.newaxis]
valid_X = X[1000000:, :, :, np.newaxis]
train_Y = Y[:1000000]
valid_Y = Y[1000000:]
print(train_X.shape, train_Y.shape)
print(valid_X.shape, valid_Y.shape)

(1000000, 4, 5201, 1) (1000000,)
(213954, 4, 5201, 1) (213954,)


## CNN

In [119]:
drop_out = 0.1
model1 = Sequential()

model1.add(Conv2D(filters = 32, kernel_size = 5,padding = 'Same', 
                 activation ='relu', input_shape = (4, 5201, 1)))
model1.add(Conv2D(filters = 32, kernel_size = 5,padding = 'Same', 
                 activation ='relu'))
model1.add(MaxPool2D(pool_size=2, strides = 2, padding= 'Same'))

if drop_out != 0:
    model1.add(Dropout(drop_out))


model1.add(Conv2D(filters = 64, kernel_size = 3,padding = 'Same', 
                 activation ='relu'))
model1.add(Conv2D(filters = 64, kernel_size = 3,padding = 'Same', 
                 activation ='relu'))
model1.add(MaxPool2D(pool_size= 2, strides=2, padding = 'Same'))

if drop_out != 0:
    model1.add(Dropout(drop_out))


model1.add(Flatten())
model1.add(Dense(256, activation = "relu"))

if drop_out != 0:
    model1.add(Dropout(drop_out))
    
model1.add(Dense(1, activation = "sigmoid"))

In [121]:
model1.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [126]:
model1.fit(train_X, train_Y, batch_size = 256, epochs = 100
          , verbose = 2 
          , validation_data = (valid_X, valid_Y))

Train on 1000000 samples, validate on 213954 samples
Epoch 1/100


KeyboardInterrupt: 

## only use 50,000 name samples

In [130]:
pos_sam_num = 50000

names = pd.read_csv('names.csv', header = None, encoding='gbk')
pos_label= np.ones(len(names))

names, pos_label = randomShuffle(np.array(names), pos_label)
names = names[:pos_sam_num, :]
pos_label = pos_label[:pos_sam_num]

non_names = pd.read_csv('non_names.csv', header = None, encoding='gbk')
neg_label= np.zeros(len(non_names))

data = np.array(pd.concat((DataFrame(names), non_names)))
label = np.hstack((pos_label, neg_label))

data, label = randomShuffle(data, label)

print(data.shape, label.shape)


------------------------------------
dimension of X after synthesis: (1181270, 1)
dimension of Y after synthesis (1181270,)
label after shuffle: 
      0
0  1.0
1  1.0
2  1.0
3  1.0
4  1.0
------------------------------------

------------------------------------
dimension of X after synthesis: (82684, 1)
dimension of Y after synthesis (82684,)
label after shuffle: 
      0
0  1.0
1  1.0
2  0.0
3  1.0
4  1.0
------------------------------------
(82684, 1) (82684,)


In [131]:
Max_length = Series(data.flatten()).str.len().max()

data_t = Series(data.flatten()).str.ljust(Max_length, fillchar='|')

chars = sorted(list(set(''.join(data_t.values.flatten()))))

In [132]:
char_idx = dict((c, i) for i, c in enumerate(chars))
idx_char = dict((i, c) for i, c in enumerate(chars))

In [154]:
save_obj(char_idx, 'model2_c2i')

In [155]:
t = load_obj('model2_c2i')

In [157]:
t['廖']

1263

In [133]:
def dimX(X, ts, char_idx, chars):
    
    temp = np.zeros((len(X), ts, len(chars)), dtype=np.bool)
    for i, c in enumerate(X):
        for j, s in enumerate(c):
            # print i, j, s
            temp[i, j, char_idx[s]] = 1
            
    return np.array(temp)

In [134]:
X = dimX(data_t, Max_length, char_idx, chars)
Y = label

train_X = X[:70000, :, :, np.newaxis]
valid_X = X[70000:, :, :, np.newaxis]
train_Y = Y[:70000]
valid_Y = Y[70000:]
print(train_X.shape, train_Y.shape)
print(valid_X.shape, valid_Y.shape)

(70000, 4, 4726, 1) (70000,)
(12684, 4, 4726, 1) (12684,)


In [140]:
drop_out = 0.1
model2 = Sequential()

model2.add(Conv2D(filters = 32, kernel_size = 5,padding = 'Same', 
                 activation ='relu', input_shape = (4, 4726, 1)))
model2.add(Conv2D(filters = 32, kernel_size = 5,padding = 'Same', 
                 activation ='relu'))
model2.add(MaxPool2D(pool_size=2, strides = 2, padding= 'Same'))

if drop_out != 0:
    model2.add(Dropout(drop_out))


model2.add(Conv2D(filters = 64, kernel_size = 3,padding = 'Same', 
                 activation ='relu'))
model2.add(Conv2D(filters = 64, kernel_size = 3,padding = 'Same', 
                 activation ='relu'))
model2.add(MaxPool2D(pool_size= 2, strides=2, padding = 'Same'))

if drop_out != 0:
    model1.add(Dropout(drop_out))


model2.add(Flatten())
model2.add(Dense(256, activation = "relu"))

if drop_out != 0:
    model2.add(Dropout(drop_out))
    
model2.add(Dense(1, activation = "sigmoid"))

In [142]:
model2.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [143]:
model2.fit(train_X, train_Y, batch_size = 512, epochs = 20
          , verbose = 2 
          , validation_data = (valid_X, valid_Y))

Train on 70000 samples, validate on 12684 samples
Epoch 1/20
 - 60s - loss: 0.1480 - acc: 0.9415 - val_loss: 0.0910 - val_acc: 0.9652
Epoch 2/20
 - 55s - loss: 0.0695 - acc: 0.9740 - val_loss: 0.0788 - val_acc: 0.9713
Epoch 3/20
 - 55s - loss: 0.0449 - acc: 0.9835 - val_loss: 0.0735 - val_acc: 0.9740
Epoch 4/20
 - 55s - loss: 0.0288 - acc: 0.9893 - val_loss: 0.0804 - val_acc: 0.9743
Epoch 5/20
 - 55s - loss: 0.0184 - acc: 0.9936 - val_loss: 0.0894 - val_acc: 0.9748
Epoch 6/20
 - 55s - loss: 0.0126 - acc: 0.9956 - val_loss: 0.0939 - val_acc: 0.9751
Epoch 7/20
 - 55s - loss: 0.0099 - acc: 0.9969 - val_loss: 0.0945 - val_acc: 0.9743
Epoch 8/20
 - 55s - loss: 0.0082 - acc: 0.9975 - val_loss: 0.1019 - val_acc: 0.9751
Epoch 9/20
 - 55s - loss: 0.0070 - acc: 0.9979 - val_loss: 0.1114 - val_acc: 0.9737
Epoch 10/20
 - 55s - loss: 0.0067 - acc: 0.9978 - val_loss: 0.1034 - val_acc: 0.9761
Epoch 11/20
 - 55s - loss: 0.0064 - acc: 0.9980 - val_loss: 0.1104 - val_acc: 0.9747
Epoch 12/20
 - 55s - los

<keras.callbacks.History at 0x7fb5ed773128>

In [152]:
model2.save('CNN_v2.h5')

In [151]:
assess(model2, train_X, train_Y)
assess(model2, valid_X, valid_Y)

auc:  0.9986408313637744
accuracy:  0.9987142857142857
auc:  0.9735193137436282
accuracy:  0.9758751182592242


In [161]:
t = load_model('CNN_v2.h5')

In [162]:
assess(t, train_X, train_Y)
assess(t, valid_X, valid_Y)

auc:  0.9986408313637744
accuracy:  0.9987142857142857
auc:  0.9735193137436282
accuracy:  0.9758751182592242
