# 基于名字的性别判断
## 概述
性别是人类差异最大的特征之一，不同性别的人群往往表现出不同的特征，譬如在购物、电视剧、书籍等方面，男生和女生的爱好有很大的不同。因此，知道了用户的性别，就可以更加准确地判断用户的潜在行为和需求。由此可见，性别识别的重要性和价值不言而喻，构建机器学习模型时，基本都需要准确识别用户的性别。

目前业内预测用户性别的方法有很多，大多数都是基于用户的行为数据、兴趣等方面进行性别判定，识别的准确性也参差不齐。但是，很多时候我们拿不到用户的行为数据，这时再想用行为数据、兴趣数据建立机器学习模型就显得力不从心了。同时，从用户的行为数据着手建立模型去预测用户的性别，效果也不见得有多好，因为影响模型准确性的主要因素是这些行为在性别上的区分度有多大，如果区分度不明显，模型和算法的准确性就会遇到明显的瓶颈。此外，基于用户行为的性别识别涉及的数据面非常广、数据依赖链条很长、数据计算复杂度很高，识别效能反而成为了痛点！


## 原理
贝叶斯公式: P(Y|X) = P(X|Y) * P(Y) / P(X)

当 X 的各个分量 X1, X2, ... 在给定 Y 的条件下相互独立时, P(X|Y) = P(X1|Y) * P(X2|Y) * ...

应用到“根据名字猜性别”的问题上:

```
P(gender=男|name=本山)
= P(name=本山|gender=男) * P(gender=男) / P(name=本山)
= P(name has 本|gender=男) * P(name has 山|gender=男) * P(gender=男) / P(name=本山)
```

其中最后一步用到了条件独立假设: 名字中的每个字在给定性别的条件下相互独立。

In [4]:
import tensorflow as tf
import numpy as np
import ngender
# Path of the CSV training set. The loader below skips the first line as a
# header and then expects "name,gender" rows.
name_dataset = 'name.csv'
# Reset TensorFlow's default graph before anything is built
# (presumably so the cell can be re-run in the same process).
tf.reset_default_graph()
train_x = []
train_y = []
# The file contains Chinese text, so decode it explicitly as UTF-8 instead of
# relying on the platform's locale-dependent default encoding.
with open(name_dataset, 'rt', encoding='utf-8') as f:
    next(f, None)  # skip the header row (no-op on an empty file)
    for line in f:
        sample = line.strip().split(',')
        if len(sample) != 2:
            # Rows that are not exactly "name,gender" are ignored.
            continue
        train_x.append(sample[0])
        # One-hot label: [0, 1] for male ('男'), [1, 0] for anything else
        # (treated as female).
        train_y.append([0, 1] if sample[1] == '男' else [1, 0])

# default=0 keeps this report from raising ValueError on an empty dataset.
max_name_length = max((len(name) for name in train_x), default=0)
print("最长名字的字符数: ", max_name_length)
# The model's input width is fixed at 8, overriding the measured length above.
max_name_length = 8
 
# 数据已shuffle
#shuffle_indices = np.random.permutation(np.arange(len(train_y)))
#train_x = train_x[shuffle_indices]
#train_y = train_y[shuffle_indices]
 
# Character vocabulary (see the chatbot exercise): count how many times each
# character occurs across all training names.
counter = len(train_x)  # number of names scanned
vocabulary = {}
for name in train_x:
    for word in name:
        vocabulary[word] = vocabulary.get(word, 0) + 1

# Index 0 is reserved for the padding character ' '; the remaining entries are
# the characters ordered from most to least frequent (ties keep first-seen
# order because the sort is stable).
vocabulary_list = [' ']
vocabulary_list.extend(sorted(vocabulary, key=lambda ch: -vocabulary[ch]))
print(len(vocabulary_list))
 
# Convert every name into a fixed-length vector of vocabulary indices.
# vocab maps each character to its position in vocabulary_list.
vocab = {word: index for index, word in enumerate(vocabulary_list)}
train_x_vec = []
for name in train_x:
    # Truncate names longer than max_name_length so every row has exactly
    # max_name_length entries; ragged rows cannot be fed to the fixed-width
    # input placeholder.
    name_vec = [vocab.get(word, 0) for word in name[:max_name_length]]
    # Right-pad short names with index 0.
    name_vec.extend([0] * (max_name_length - len(name_vec)))
    train_x_vec.append(name_vec)
 
#######################################################
 
# Width of each input row: one vocabulary index per character position.
input_size = max_name_length
# Two output classes, matching the two-element one-hot training labels.
num_classes = 2
 
# Mini-batch size and the number of whole batches in the training set
# (floor division, so a trailing partial batch is not counted).
batch_size = 64
num_batch = len(train_x_vec) // batch_size
 
# Graph inputs: X takes int32 rows of length input_size, Y takes float32 rows
# of length num_classes; the leading None leaves the batch dimension open.
X = tf.placeholder(tf.int32, [None, input_size])
Y = tf.placeholder(tf.float32, [None, num_classes])
 
# Keep-probability passed to tf.nn.dropout inside the network; supplied
# through feed_dict on every sess.run call.
dropout_keep_prob = tf.placeholder(tf.float32)
 
def neural_network(vocabulary_size, embedding_size=128, num_filters=128):
    """Build the convolutional classifier graph and return its output tensor.

    The graph reads the module-level placeholders ``X`` (character indices)
    and ``dropout_keep_prob``, and uses the module-level ``input_size`` and
    ``num_classes``.

    Args:
        vocabulary_size: Number of rows in the embedding table.
        embedding_size: Length of each character embedding vector.
        num_filters: Number of convolution filters per filter size.

    Returns:
        The un-normalised class scores produced by the final linear layer,
        one row of ``num_classes`` values per input row.
    """
    # Embedding lookup, pinned to the CPU.
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        embedding_table = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embedded = tf.nn.embedding_lookup(embedding_table, X)
        # conv2d needs a channel axis, so append one of size 1.
        embedded_4d = tf.expand_dims(embedded, -1)

    # One convolution + max-pool branch per filter height.
    filter_sizes = [3, 4, 5]
    pooled_outputs = []
    for filter_size in filter_sizes:
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            kernel = tf.Variable(tf.truncated_normal(
                [filter_size, embedding_size, 1, num_filters], stddev=0.1))
            bias = tf.Variable(tf.constant(0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(
                embedded_4d, kernel, strides=[1, 1, 1, 1], padding="VALID")
            activated = tf.nn.relu(tf.nn.bias_add(conv, bias))
            # Pool over every position the VALID convolution produced.
            pool_height = input_size - filter_size + 1
            pooled_outputs.append(tf.nn.max_pool(
                activated,
                ksize=[1, pool_height, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID'))

    # Join the branches along the filter axis and flatten to one row per input.
    num_filters_total = num_filters * len(filter_sizes)
    flat_features = tf.reshape(
        tf.concat(pooled_outputs, 3), [-1, num_filters_total])

    with tf.name_scope("dropout"):
        dropped = tf.nn.dropout(flat_features, dropout_keep_prob)

    # Final linear layer mapping the pooled features to class scores.
    with tf.name_scope("output"):
        out_weights = tf.get_variable(
            "W",
            shape=[num_filters_total, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())
        out_bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        return tf.nn.xw_plus_b(dropped, out_weights, out_bias)
# 训练
def train_neural_network():
    """Train the name classifier and checkpoint it periodically.

    Builds the network with ``neural_network`` and minimises the summed
    softmax cross-entropy against the placeholder ``Y`` using Adam with a
    learning rate of 1e-3. Runs 201 epochs over the module-level
    ``train_x_vec`` / ``train_y`` in batches of ``batch_size``, printing the
    epoch, batch index and loss after every batch. A checkpoint with the
    prefix "name2sex.model" is saved on epochs 0, 50, 100, 150 and 200, with
    the epoch number passed as ``global_step``.
    """
    output = neural_network(len(vocabulary_list))
    optimizer = tf.train.AdamOptimizer(1e-3)
    # The keyword must be spelled ``labels``; any other spelling makes this
    # call raise TypeError before training starts.
    loss = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=Y))
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars)

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for e in range(201):
            for i in range(num_batch):
                start, stop = i * batch_size, (i + 1) * batch_size
                batch_x = train_x_vec[start:stop]
                batch_y = train_y[start:stop]
                # Dropout is active during training (keep probability 0.5).
                _, loss_ = sess.run(
                    [train_op, loss],
                    feed_dict={X: batch_x, Y: batch_y, dropout_keep_prob: 0.5})
                print(e, i, loss_)
            # Save the model every 50 epochs.
            if e % 50 == 0:
                saver.save(sess, "name2sex.model", global_step=e)
 
##
##train_neural_network()
 
# 使用训练的模型
def detect_sex(name_list):
    """Print the predicted gender for each name using the latest checkpoint.

    Each name is encoded with the module-level ``vocab`` into a row of
    ``max_name_length`` indices, the network is rebuilt with
    ``neural_network``, and its weights are restored from the most recent
    checkpoint found in the current directory. One line is printed per name:
    the name followed by '女' when the predicted class index is 0, otherwise
    '男'.

    Args:
        name_list: Sequence of name strings to classify.

    Returns:
        None. If ``name_list`` is empty nothing is done; if no checkpoint is
        found a message is printed and no predictions are made.
    """
    if not name_list:
        # Nothing to classify; an empty feed would not match the placeholder.
        return

    x = []
    for name in name_list:
        # Characters missing from the vocabulary fall back to padding index 0
        # (a bare vocab.get would yield None, which cannot be fed), and names
        # longer than max_name_length are truncated to fit the input width.
        name_vec = [vocab.get(word, 0) for word in name[:max_name_length]]
        name_vec.extend([0] * (max_name_length - len(name_vec)))
        x.append(name_vec)

    output = neural_network(len(vocabulary_list))
    predictions = tf.argmax(output, 1)

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        # Restore the weights from the previous training run.
        ckpt = tf.train.get_checkpoint_state('.')
        if ckpt is None:
            # Without a checkpoint the variables are uninitialised, so
            # running the graph would only fail with an obscure error.
            print("没找到模型")
            return
        print(ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Dropout is disabled at inference time (keep probability 1.0).
        res = sess.run(predictions, {X: x, dropout_keep_prob: 1.0})

    for name, label in zip(name_list, res):
        print(name, '女' if label == 0 else '男')
# Sanity-check a few sample names with the ngender package and print each
# result exactly as ngender.guess returns it.
gue = ["白富美", "高帅富", "王婷婷", "田野"]
for verdict in map(ngender.guess, gue):
    print(verdict)
#detect_sex(["白富美", "高帅富", "王婷婷", "田野"])

最长名字的字符数:  3
1306
('male', 0.5591018119262235)
('male', 0.9800590316057012)
('female', 0.9968617145628665)
('male', 0.8495845552297164)
