In [13]:
# -*- encoding: utf-8 -*-
# Python 2-only workaround: make UTF-8 the interpreter-wide default codec used
# for implicit str <-> unicode conversions. reload(sys) is required because
# sys.setdefaultencoding is removed from the module during interpreter startup.
# NOTE(review): neither the reload builtin nor sys.setdefaultencoding exists on
# Python 3, so this cell only runs on a Python 2 kernel.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

In [14]:
from gensim import corpora,models,similarities,utils
import logging
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
from keras.preprocessing import text,sequence
import collections

In [15]:
# Configure the root logger to emit INFO-and-above records formatted as
# "timestamp : level : message" (presumably so gensim's training progress is visible).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [16]:
# Report article-length statistics for a tokenized data file
def getDataLen(inFile):
    """Print and return length statistics for the articles stored in ``inFile``.

    Every line is split on single spaces. The first field is skipped and the
    remaining fields are counted as that article's tokens.

    Args:
        inFile: Path of the text file to scan, one article per line.

    Returns:
        A ``(maxlen, len_ctr)`` tuple: the largest token count seen (0 when the
        file has no lines) and a ``collections.Counter`` mapping each token
        count, rendered as a string, to the number of articles of that length.
    """
    # Longest article seen so far
    maxlen = 0
    # Number of articles per length
    len_ctr = collections.Counter()

    # The with-block closes the handle even if reading fails; iterating the
    # handle streams the file instead of materialising every line at once.
    with open(inFile) as f:
        for line in f:
            # Drop the first field, keep only the article tokens
            content = line.replace('\n','').split(" ")[1:]

            if len(content) > maxlen:
                maxlen = len(content)

            # Keys are the lengths rendered as strings (same format as before)
            len_ctr[str(len(content))] += 1

    print('max_len ',maxlen)
    print ('len_ctr ', len_ctr)

    # Returning the statistics lets callers use them programmatically; callers
    # that ignored the previous None return value are unaffected.
    return (maxlen, len_ctr)

In [17]:
def getLabel(x):
    """Map a label token to its class id string.

    Args:
        x: Label field of a data line, expected to be ``'__label__NEGATIVE'``
            or ``'__label__POSITIVE'``.

    Returns:
        ``'0'`` for the negative label and ``'1'`` for the positive label. Any
        other value is echoed to stdout and also mapped to ``'0'``.
    """
    if x == '__label__NEGATIVE':
        return '0'
    if x == '__label__POSITIVE':
        return '1'

    # Unrecognised label: report it and fall back to '0', as before. The
    # single-argument print form emits the same "x= <value>" text on Python 2
    # and Python 3, unlike the Python 2-only print statement it replaces.
    print("x= %s" % (x,))
    return '0'

In [18]:
# Load the first field and the token list of every line in a data file
def getTrainSet(inFile,ptype):
    """Read ``inFile`` and split each line into a title field and its tokens.

    Args:
        inFile: Path of a text file with one article per line, fields
            separated by single spaces.
        ptype: ``'train'`` to convert the first field with ``getLabel``;
            ``'test'`` to keep the first field unchanged.

    Returns:
        ``(title_set, train_set)``: two parallel lists holding, per line, the
        (converted) first field and the list of remaining fields.

    Raises:
        ValueError: If ``ptype`` is neither ``'train'`` nor ``'test'``.
    """
    # Reject unknown modes up front. Previously they surfaced as an
    # UnboundLocalError on the first line, or went unnoticed for an empty file.
    if ptype not in ('train', 'test'):
        raise ValueError("ptype must be 'train' or 'test', got %r" % (ptype,))

    title_set = []
    train_set = []

    # The with-block guarantees the handle is closed even if parsing fails.
    with open(inFile) as f:
        for line in f:
            article = line.replace('\n','').split(" ")
            if ptype == 'train':
                title_set.append(getLabel(article[0]))
            else:
                title_set.append(article[0])
            # Everything after the first field is the article content
            train_set.append(article[1:])

    return (title_set,train_set)

In [19]:
# Train word2vec
def trainModel(inFile,modelFile,vecFile):
    """Train a Word2Vec model on one data file and persist it in two formats.

    Args:
        inFile: Tokenized text file, read through ``getTrainSet`` in
            ``'train'`` mode.
        modelFile: Destination passed to ``model.save``.
        vecFile: Destination for the vectors in binary word2vec format.
    """
    # Only the token lists feed the model; the labels are not needed here.
    sentences = getTrainSet(inFile,'train')[1]

    # Author's notes on the hyper-parameters:
    #   min_count - words occurring fewer times than this are discarded (default 5)
    #   size      - number of hidden-layer units (default 100)
    #   workers   - training parallelism (default 1, i.e. none); only
    #               effective when Cython is installed
    w2v = models.Word2Vec(
        sentences,
        min_count=5,
        window=10,
        size=200,
        workers=4,
    )

    # Persist the full model first, then the bare vectors.
    w2v.save(modelFile)
    w2v.wv.save_word2vec_format(vecFile, binary=True)


In [20]:
# Convert tokenized text into a matrix of vocabulary indices
def fastBuildSeq(inFile,outFile,modelFile,vecFile,ptype):
    """Encode the articles of ``inFile`` as fixed-width index rows and save them.

    Each token found in the word2vec vocabulary is replaced by its vocabulary
    index; tokens missing from the vocabulary are dropped. Rows are then
    padded to 300 columns and written with ``np.save``.

    Args:
        inFile: Tokenized text file, read through ``getTrainSet``.
        outFile: Destination passed to ``np.save``.
        modelFile: Unused; kept so existing callers keep working. The full
            model used to be loaded from here only to reach the vector loader.
        vecFile: Binary word2vec-format vector file whose vocabulary supplies
            the indices.
        ptype: ``'train'`` appends the integer labels as a last column;
            ``'test'`` saves the index matrix alone.

    Raises:
        ValueError: If ``ptype`` is neither ``'train'`` nor ``'test'``.
    """
    # Fail before any expensive loading. The old code rejected a bad mode only
    # as an UnboundLocalError from getTrainSet, and not at all for an empty file.
    if ptype not in ('train', 'test'):
        raise ValueError("ptype must be 'train' or 'test', got %r" % (ptype,))

    # Read the data
    title_set,data_set = getTrainSet(inFile,ptype)

    # load_word2vec_format builds a fresh KeyedVectors from vecFile, so loading
    # the complete Word2Vec model first only cost time and memory.
    # KeyedVectors is imported at the top of the notebook.
    word_vec = KeyedVectors.load_word2vec_format(vecFile, binary=True)
    vocab = word_vec.vocab

    def _as_text(token):
        # The vocabulary keys are unicode (per the original author's note), so
        # byte strings are decoded; tokens that are already text pass through.
        return token.decode('utf-8') if isinstance(token, bytes) else token

    transfrom = []
    for news in data_set:
        trs_news = []
        for w in news:
            key = _as_text(w)  # decode once per token instead of twice
            if key in vocab:
                trs_news.append(vocab[key].index)
        transfrom.append(trs_news)

    # Pad at the end ('post') to a fixed width of 300 columns.
    # NOTE(review): the padding value defaults to 0, which cannot be told apart
    # from vocabulary index 0 (the original author notes that index 0 of the
    # trained vocabulary is the space token). Articles longer than 300 tokens
    # are cut by pad_sequences; confirm the default truncation side in the
    # Keras documentation.
    X = sequence.pad_sequences(transfrom,maxlen=300,padding='post')

    if ptype == 'train':
        y = np.array([int(i) for i in title_set])
        # Features and label side by side, label in the last column
        np.save(outFile,np.column_stack([X,y]))
    else:
        np.save(outFile,X)

In [21]:
def data2Mat(inFile,modelFile,vecFile,partOut,totalOut,ptype):
    """Convert several tokenized files to index matrices and merge the results.

    Args:
        inFile: List of tokenized input files (the author notes the raw data
            is split into files of 100k records each).
        modelFile: Word2Vec model path forwarded to ``fastBuildSeq``.
        vecFile: Word-vector file path forwarded to ``fastBuildSeq``.
        partOut: List of per-file output paths, paired with ``inFile`` by
            position.
        totalOut: Path of the merged matrix written by ``mergeNpy``.
        ptype: ``'train'`` or ``'test'``, forwarded to ``fastBuildSeq``.
    """
    # One conversion per (input, output) pair, all with the same word vectors.
    # zip stops at the shorter of the two lists.
    pairs = zip(inFile,partOut)
    for source, target in pairs:
        fastBuildSeq(source,target,modelFile,vecFile,ptype)

    # Combine every partial matrix into the single output file.
    mergeNpy(partOut,totalOut)

In [22]:
def mergeNpy(part,total):
    """Stack the arrays stored in ``part`` row-wise and save them to ``total``.

    Args:
        part: Sequence of ``.npy`` file paths, in stacking order.
        total: Destination passed to ``np.save``.

    Raises:
        ValueError: If ``part`` is empty (previously an UnboundLocalError).
    """
    arrays = [np.load(f) for f in part]
    if not arrays:
        raise ValueError("mergeNpy needs at least one input file")

    # A lone input is saved unchanged, as before. Otherwise everything is
    # stacked in one call instead of re-copying the growing matrix per file.
    mat = arrays[0] if len(arrays) == 1 else np.vstack(arrays)

    np.save(total,mat)

In [23]:
def main(dataPath="/home/hadoop/DataSencise/bdci2017/BDCI2017-360/data/",
         mdlPath="/home/hadoop/DataSencise/bdci2017/BDCI2017-360/model/",
         nParts=5):
    """Run the preprocessing pipeline: convert the test shards to index matrices.

    Args:
        dataPath: Root directory holding the ``train/`` and ``test/`` shards.
            Defaults to the location the notebook was written against.
        mdlPath: Directory holding the word2vec model and vector files.
        nParts: Number of shards, numbered from 1, that the data is split into.
    """
    # Local import: os is not imported at the top of the notebook.
    import os

    def numbered(root, stem, ext):
        # One path per shard, e.g. <root>/<stem>1<ext> ... <root>/<stem>N<ext>.
        # os.path.join yields the same strings as plain concatenation for the
        # default roots (they end in '/') and also accepts roots without one.
        return [os.path.join(root, stem + str(x) + ext) for x in range(1, nParts + 1)]

    # Training data (only referenced by the disabled steps below)
    inFile = numbered(dataPath, "train/train_m", ".txt")
    modelFile = numbered(mdlPath, "w2v_m", ".mdl")
    vecFile = numbered(mdlPath, "w2v_m", ".bin")
    trainPartMat = numbered(dataPath, "train/train_m", ".npy")
    trainTotalMat = os.path.join(dataPath, "train/train_totalMat.npy")

    # Test data
    testFile = numbered(dataPath, "test/test_m", ".txt")
    # Output file names
    testPartMat = numbered(dataPath, "test/test_m", ".npy")
    testTotalMat = os.path.join(dataPath, "test/test_totalMat.npy")

    # Disabled step: print length statistics of the test shards
#     for f in testFile:
#         getDataLen(f)

    # Disabled step: train the word-vector models. The raw training data is
    # split into groups of 100k records; one model is trained per group.
#     for (tf,mf,vf) in zip(inFile,modelFile,vecFile):
#         trainModel(tf,mf,vf)

    # Disabled step: convert the training data to a matrix
#     data2Mat(inFile,modelFile[0],vecFile[0],trainPartMat,trainTotalMat,'train')

    # Convert the test data to a matrix, using the first shard's word vectors
    data2Mat(testFile,modelFile[0],vecFile[0],testPartMat,testTotalMat,'test')
    

In [24]:
# Run the pipeline only when this code is executed directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == '__main__':
    main()