In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
import numpy as np;
import os
import jieba
import gensim.models.word2vec as w2v
from sklearn.model_selection import train_test_split



# 基本方法

In [3]:
def is_chinese(uchar):
    """判断一个unicode是否是汉字"""
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False
def is_number(uchar):
    """判断一个unicode是否是数字"""
    if uchar >= u'\u0030' and uchar <= u'\u0039':
        return True
    else:
        return False
def is_alphabet(uchar):
    """判断一个unicode是否是英文字母"""
    if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):
        return True
    else:
        return False
def is_legal(uchar):
    """判断是否非汉字，数字和英文字符"""
    if not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)):
        return False
    else:
        return True
def extract_chinese(line):
    res = ""
    for word in line:
        if is_legal(word):
            res = res + word
    return res;
def words2line(words):
    line = ""
    for word in words:
        line = line + " " + word
    return line


# 数据预处

In [6]:
#数据预处理函数，在dir文件夹下每个子文件是一类内容
def datahelper(dir):
#返回为文本，文本对应标签
    labels_index={}
    index_lables={}
    num_recs=0
    fs = os.listdir(dir)
    MAX_SEQUENCE_LENGTH = 200
    MAX_NB_WORDS = 50000
    EMBEDDING_DIM = 20
    VALIDATION_SPLIT = 0.2
    i = 0;
    for f in fs:
        labels_index[f] = i;
        index_lables[i] = f
        i = i + 1;
    print(labels_index)
    texts = []
    labels = []  # list of label ids
    for la in labels_index.keys():
        print(la + " " + index_lables[labels_index[la]])
        la_dir = dir + "/" + la;
        fs = os.listdir(la_dir)
        for f in fs:
            file = open(la_dir + "/" + f, encoding='utf-8')
            lines = file.readlines();
            text = ''
            for line in lines:
                if len(line) > 5:
                    line = extract_chinese(line)
                    words = jieba.lcut(line, cut_all=False, HMM=True)
                    text = words
                    texts.append(text)
                    labels.append(labels_index[la])
                    num_recs = num_recs + 1
    return texts,labels,labels_index,index_lables

#load word 2 vetc，加载词向量，可以事先预训练
def getw2v():
    model_file_name = 'D:/PROJECT_TW/git/data/nlp/w2v/new_model_big.txt'
    
    # 模型训练，生成词向量
    '''
    sentences = w2v.LineSentence('trainword.txt')
    model = w2v.Word2Vec(sentences, size=20, window=5, min_count=5, workers=4)
    model.save(model_file_name)
    '''
    model = w2v.Word2Vec.load(model_file_name)
    return model;


def trainw2v():
    model_file_name = 'D:/PROJECT_TW/git/data/nlp/w2v/new_model_big.txt'
    sentences = w2v.LineSentence('D:/PROJECT_TW/git/data/nlp/THUCNews/trainword.txt')
    model = w2v.Word2Vec(sentences, size=20, window=5, min_count=5, workers=1)
    model.save(model_file_name)



In [104]:
# 生成词向量训练数据
train_dir = 'D:/PROJECT_TW/git/data/nlp/THUCNews'
texts,labels,labels_index,index_lables=datahelper(train_dir)
big_txt = ''
for item in texts:
    big_txt = '{}\n{}'.format(big_txt,' '.join(item))
big_txt = big_txt.encode('utf-8')

with open('D:/PROJECT_TW/git/data/nlp/THUCNews/trainword.txt','wb') as f:
    f.write(big_txt)

{'家居': 0, '彩票': 1, '房产': 2, '教育': 3, '股票': 4, '财经': 5}
家居 家居
彩票 彩票
房产 房产
教育 教育
股票 股票
财经 财经


In [105]:
# 训练词向量
trainw2v()

In [106]:
model = getw2v()
print(model.wv.get_vector('上海'))


[ 0.23999122  0.04855075 -0.92600787  0.10056638 -0.24005225  0.22864884
 -1.2042845  -0.18332459  0.80257744  0.6747311   0.13293691 -0.15797134
  0.0390049   0.6759941  -0.8183674  -0.3172709   0.7973181   0.26182362
 -0.0844366   0.67150074]


# 数据模型

In [107]:
DEBUG = False
#textCNN模型
class textCNN(nn.Module):
    def __init__(self,args):
        super(textCNN, self).__init__()
        if DEBUG:
            print('args --> {}'.format(args))
        vocb_size = args['vocb_size']
        dim = args['dim']
        n_class = args['n_class']
        max_len = args['max_len']
        embedding_matrix=args['embedding_matrix']
        #需要将事先训练好的词向量载入  dim  20
        self.embeding = nn.Embedding(vocb_size, dim,_weight=embedding_matrix)
        # h 表示每句话分隔后的单词， w 表示该单词的向量
        # conv1 input size: batch number , channel,  h , w  . (1000, 1, 64, 20) --> (1000, 16, 32, 10)
        self.conv1 = nn.Sequential(
                     nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5,
                               stride=1, padding=2),

                     nn.ReLU(),
                     nn.MaxPool2d(kernel_size=2) # 
                     )
        # conv1 input size: batch number , channel,  h , w  . (1000, 16, 32, 10) --> (1000, 32, 16, 5)
        self.conv2 = nn.Sequential(
                     nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2),
                     nn.ReLU(),
                     nn.MaxPool2d(2)
                     )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        # conv4 input size: batch number , channel,  h , w  . (1000, 64, 8, 2) --> (1000, 128, 4, 1)
        self.conv4 = nn.Sequential(  # (16,64,64)
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.out = nn.Linear(512, n_class)

    def forward(self, x):
        x = self.embeding(x)
        x=x.view(x.size(0),1,max_len,word_dim)
        if DEBUG:
            print('input size --> {}'.format(x.size()))
        x = self.conv1(x)
        if DEBUG:
            print('conv1 size --> {}'.format(x.size()))
        x = self.conv2(x)
        if DEBUG:
            print('conv2 size --> {}'.format(x.size()))        
        x = self.conv3(x)
        if DEBUG:
            print('conv3 size --> {}'.format(x.size()))        
        x = self.conv4(x)
        if DEBUG:
            print('conv4 size --> {}'.format(x.size()))        
        x = x.view(x.size(0), -1) # 将（batch，outchanel,w,h）展平为（batch，outchanel*w*h）
        #print(x.size())
        output = self.out(x)
        return output

# 模型训练

In [109]:
# 准备数据


# train_dir = 'D:/PROJECT_TW/git/data/nlp/THUCNews'
# texts,labels,labels_index,index_lables=datahelper(train_dir)
#词表
word_vocb=[]
word_vocb.append('')
for text in texts:
    for word in text:
        word_vocb.append(word)
word_vocb=set(word_vocb)
vocb_size=len(word_vocb)
#设置词表大小
nb_words=40000
max_len=64
word_dim=20
n_class=len(index_lables)

args={}
if nb_words<vocb_size:
    nb_words=vocb_size;

#textCNN调用的参数
args['vocb_size']=nb_words
args['max_len']=max_len
args['n_class']=n_class
args['dim']=word_dim



texts_with_id=np.zeros([len(texts),max_len])

#词表与索引的map
word_to_idx={word:i for i,word in enumerate(word_vocb)}
idx_to_word={word_to_idx[word]:word for word in word_to_idx}
#每个单词的对应的词向量
embeddings_index = getw2v()
#预先处理好的词向量
embedding_matrix = np.zeros((nb_words, word_dim))

for word, i in word_to_idx.items():
    if i >= nb_words:
        continue
    if  embeddings_index.wv.__contains__(word):
        embedding_vector = embeddings_index.wv.get_vector(word)
        if embedding_vector is not None:
#             print('--> {}'.format(i))
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
args['embedding_matrix']=torch.Tensor(embedding_matrix)



In [23]:
print(args['embedding_matrix'][4])

tensor([ 0.1322, -0.2534, -0.4950, -0.0790, -0.2268, -0.6080,  0.7234,
        -1.6187, -0.5441,  0.7554, -0.2016, -0.0341,  0.4113,  0.4508,
         0.2651,  0.0046,  0.5257,  0.6460,  0.0834, -0.0562])


In [113]:
# 开始训练
model_path = 'D:/PROJECT_TW/git/data/nlp/w2v/simple_text_cnn.pkl'
EPOCH=500;
#构建textCNN模型
cnn=textCNN(args)
if os.path.exists(model_path):
    print('load saved model ... ')
    cnn.load_state_dict(torch.load(model_path))

max_len=64
#生成训练数据，需要将训练数据的Word转换为word的索引, 每段话最多只取64个词
for i in range(0,len(texts)):
    if len(texts[i])<max_len:
        for j in range(0,len(texts[i])):
            texts_with_id[i][j]=word_to_idx[texts[i][j]]
        for j in range(len(texts[i]),max_len):
            texts_with_id[i][j] = word_to_idx['']
    else:
        for j in range(0,max_len):
            texts_with_id[i][j]=word_to_idx[texts[i][j]]

LR = 0.001
optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)
#损失函数
loss_function = nn.CrossEntropyLoss()
#训练批次大小
epoch_size=10;
texts_len=len(texts_with_id)
print(texts_len)
#划分训练数据和测试数据  Sklearn-train_test_split随机划分训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(texts_with_id, labels, test_size=0.2, random_state=42)

test_x=torch.LongTensor(x_test)
test_y=torch.LongTensor(y_test)
train_x=x_train
train_y=y_train

test_epoch_size=300;
for epoch in range(EPOCH):
    cnn.train()
    for i in range(0,(int)(len(train_x)/epoch_size)):
#     for i in range(0,1):
        b_x = Variable(torch.LongTensor(train_x[i*epoch_size:i*epoch_size+epoch_size]))
        
        b_y = Variable(torch.LongTensor((train_y[i*epoch_size:i*epoch_size+epoch_size])))
        output = cnn(b_x)
        loss = loss_function(output, b_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
#         print(loss)
        pred_y = torch.max(output, 1)[1].data.squeeze()
        acc = (b_y == pred_y)
        acc = acc.numpy().sum()
        accuracy = acc / (b_y.size(0))

    acc_all = 0
    cnn.eval()
    for j in range(0, (int)(len(test_x) / test_epoch_size)):
#     for j in range(0, 1):
        b_x = Variable(torch.LongTensor(test_x[j * test_epoch_size:j * test_epoch_size + test_epoch_size]))
        b_y = Variable(torch.LongTensor((test_y[j * test_epoch_size:j * test_epoch_size + test_epoch_size])))
        test_output = cnn(b_x)
        pred_y = torch.max(test_output, 1)[1].data.squeeze()
#         print(pred_y)
#         print(test_y)
        acc = (pred_y == b_y)
        acc = acc.numpy().sum()
        print("acc " + str(acc / b_y.size(0)))
        acc_all = acc_all + acc

    accuracy = acc_all / (test_y.size(0))
    print('{} loss -->  {} '.format(epoch, loss))
    print("epoch " + str(epoch) + " step " + str(i) + " " + "acc " + str(accuracy))
    torch.save(cnn.state_dict(),model_path)

2509
acc 0.57
0 loss -->  0.24468156695365906 
epoch 0 step 199 acc 0.34063745019920316
acc 0.6233333333333333
1 loss -->  0.12069165706634521 
epoch 1 step 199 acc 0.37250996015936255
acc 0.6433333333333333
2 loss -->  0.017282620072364807 
epoch 2 step 199 acc 0.3844621513944223
acc 0.6533333333333333
3 loss -->  0.006407138891518116 
epoch 3 step 199 acc 0.3904382470119522
acc 0.6533333333333333
4 loss -->  0.0002040023246081546 
epoch 4 step 199 acc 0.3904382470119522
acc 0.65
5 loss -->  0.19347572326660156 
epoch 5 step 199 acc 0.3884462151394422
acc 0.6366666666666667
6 loss -->  0.0001253441150765866 
epoch 6 step 199 acc 0.3804780876494024
acc 0.6366666666666667
7 loss -->  0.01363480556756258 
epoch 7 step 199 acc 0.3804780876494024
acc 0.6033333333333334
8 loss -->  0.6101404428482056 
epoch 8 step 199 acc 0.3605577689243028
acc 0.6566666666666666
9 loss -->  8.731277745255284e-08 
epoch 9 step 199 acc 0.39243027888446214
acc 0.6533333333333333
10 loss -->  9.639929032800865

epoch 168 step 199 acc 0.4243027888446215
acc 0.7166666666666667
169 loss -->  2.2043824543516166e-08 
epoch 169 step 199 acc 0.42828685258964144
acc 0.7033333333333334
170 loss -->  4.380496074381235e-13 
epoch 170 step 199 acc 0.4203187250996016
acc 0.6833333333333333
171 loss -->  1.1586820045095259e-11 
epoch 171 step 199 acc 0.40836653386454186
acc 0.67
172 loss -->  1.287731365984257e-09 
epoch 172 step 199 acc 0.40039840637450197
acc 0.68
173 loss -->  5.9531334528628577e-08 
epoch 173 step 199 acc 0.4063745019920319
acc 0.66
174 loss -->  1.93508906676243e-08 
epoch 174 step 199 acc 0.3944223107569721
acc 0.6766666666666666
175 loss -->  7.669941126664526e-09 
epoch 175 step 199 acc 0.4043824701195219
acc 0.6733333333333333
176 loss -->  7.292720649587636e-09 
epoch 176 step 199 acc 0.40239043824701193
acc 0.6733333333333333
177 loss -->  7.142554547812097e-09 
epoch 177 step 199 acc 0.40239043824701193
acc 0.6766666666666666
178 loss -->  6.880153335941941e-09 
epoch 178 step 

acc 0.71
255 loss -->  0.0 
epoch 255 step 199 acc 0.4243027888446215
acc 0.7133333333333334
256 loss -->  0.0 
epoch 256 step 199 acc 0.4262948207171315
acc 0.7066666666666667
257 loss -->  2.2843950038889393e-13 
epoch 257 step 199 acc 0.42231075697211157
acc 0.6933333333333334
258 loss -->  1.9184654204335884e-14 
epoch 258 step 199 acc 0.41434262948207173
acc 0.6933333333333334
259 loss -->  3.907985131383846e-15 
epoch 259 step 199 acc 0.41434262948207173
acc 0.6966666666666667
260 loss -->  2.4868995328087033e-15 
epoch 260 step 199 acc 0.4163346613545817
acc 0.7
261 loss -->  1.7763568394002505e-15 
epoch 261 step 199 acc 0.41832669322709165
acc 0.7
262 loss -->  1.421085492696024e-15 
epoch 262 step 199 acc 0.41832669322709165
acc 0.7
263 loss -->  1.0658141459917976e-15 
epoch 263 step 199 acc 0.41832669322709165
acc 0.7066666666666667
264 loss -->  2.1316282919835953e-15 
epoch 264 step 199 acc 0.42231075697211157
acc 0.71
265 loss -->  1.0658141459917976e-15 
epoch 265 step 

KeyboardInterrupt: 

# 验证

In [122]:
cnn=textCNN(args)
if os.path.exists(model_path):
    cnn.load_state_dict(torch.load(model_path))
    
testDoc = '''
专家提醒：留学忌盲目做好职业规划
　　人民网·天津视窗1月3日电：记者在采访中了解到，在今年严峻的就业形势下，一部分大学毕业生选择考研、出国深造，以此应对金融危机带来的就业压力。
　　在采访中，很多大学毕业生表示，他们希望选择读研或出国留学深造来规避就业压力。南开大学商学院市场营销专业的应届大学生陈凯告诉记者：“现在我正准备考研，提高自己各方面的理论知识和实践技能，换一个环境来增加自己的阅历。”此外，现在出国留学费用比以前便宜不少，出国留学也成了不少毕业生的选择。
'''    
# 提取中文, 分词， 得到词向量
line = extract_chinese(testDoc)
words = jieba.lcut(line, cut_all=False, HMM=True)
words_model = getw2v()
words_ids = [word_to_idx[x] for x in words if x in word_to_idx]
words_ids = words_ids[0:64]
words_ids = torch.LongTensor(words_ids)
words_ids = words_ids.unsqueeze(0)
pred_probs =  cnn(words_ids)
print(pred_probs)
pred_label = index_lables[torch.max(pred_probs,1)[1].data.item()]
print(index_lables)
print('idx {} labels  --> {}'.format(torch.max(pred_probs,1)[1].data.item(), pred_label))

tensor([[ -55.1157,  -55.8341,  -26.0998,    4.1878, -118.9738,  -45.4318]])
{0: '家居', 1: '彩票', 2: '房产', 3: '教育', 4: '股票', 5: '财经'}
idx 3 labels  --> 教育


In [78]:
print(torch.max(pred_probs,1)[1].data.item())
print(index_lables)


2
{0: '家居', 1: '彩票', 2: '房产', 3: '教育', 4: '股票', 5: '财经'}
