In [1]:
import tensorflow as tf

from tensorflow.keras.layers import Dense, Flatten

In [2]:
import jieba
import gensim

import matplotlib.pyplot as plt
import numpy as np

from utils.data_loader import load_data
from tqdm import tqdm



In [3]:
def load_stopwords(path_to_file):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        path_to_file: Path of the stop-word file to read.

    Returns:
        set: The distinct lines of the file with leading/trailing newline
        characters removed. Only ``'\\n'`` is stripped, so any other
        surrounding whitespace is preserved and a blank line contributes
        the empty string.
    """
    with open(path_to_file, encoding="utf-8") as handle:
        lines = handle.readlines()

    return {line.strip('\n') for line in lines}


# In[61]:


def cut_sentence(sentence):
    """
    Tokenize ``sentence`` with jieba and drop stop words.

    Returns the surviving tokens as a list, in their original order.
    Membership is tested against the module-level ``stop_words`` collection,
    which must be defined before this function is called.
    """
    return [token for token in jieba.lcut(sentence) if token not in stop_words]


# In[62]:


def load_word2vec(path_to_file):
    """Load word vectors stored in word2vec text (non-binary) format.

    Prints a progress message, then delegates to gensim's
    ``KeyedVectors.load_word2vec_format`` with ``binary=False``.

    Args:
        path_to_file: Path of the vector file to load.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    print("加载词向量...")
    vectors = gensim.models.KeyedVectors.load_word2vec_format(
        path_to_file,
        binary=False,
    )
    return vectors


# In[64]:


def random_vector(seed, dim=300):
    """Return a reproducible pseudo-random vector with entries in [-1, 1).

    The values are the same ones ``np.random.seed(seed)`` followed by
    ``np.random.random(dim)`` would produce, but they are drawn from a private
    ``RandomState`` so the process-wide NumPy generator is NOT re-seeded as a
    side effect of calling this function.

    Args:
        seed: Seed for the private generator; equal seeds give equal vectors.
        dim: Length of the returned vector (defaults to 300).

    Returns:
        numpy.ndarray: 1-D ``float64`` array of length ``dim``.
    """
    # A dedicated generator keeps other np.random consumers (shuffling,
    # initialisation, sampling) unaffected by calls to this helper.
    rng = np.random.RandomState(seed)
    # random_sample yields values in [0, 1); rescale to [-1, 1).
    vec = -1 + 2 * rng.random_sample(dim)

    return vec.astype('float64')


# In[65]:


def get_vector(word,seed=108):
    """Look up the embedding of ``word``, with a fallback for unknown words.

    Args:
        word: Token to look up in the module-level ``word2vec`` mapping.
        seed: Seed passed to ``random_vector`` when the lookup fails. Because
            the seed does not depend on ``word``, every unknown word receives
            the same fallback vector for a given seed.

    Returns:
        ``word2vec[word]`` when the word is present, otherwise
        ``random_vector(seed)``.
    """
    try:
        return word2vec[word]
    except KeyError:
        # Only a missing key means "out of vocabulary". Anything else (e.g.
        # word2vec not loaded yet, KeyboardInterrupt) should surface instead
        # of being silently replaced by a random vector.
        return random_vector(seed)


# In[66]:


def sentence2vector(word_list):
    """Map each token in ``word_list`` to its vector via ``get_vector``.

    Returns a list with one vector per token, in input order.
    """
    return [get_vector(token) for token in word_list]


# In[68]:


def get_label(dataset):
    """Collect the label (element 1) of every sample as a ``uint8`` array.

    A label equal to 5 is stored as 0; every other label is kept unchanged.

    Args:
        dataset: Iterable of samples whose element at index 1 is the label.

    Returns:
        numpy.ndarray: 1-D ``uint8`` array with one entry per sample.
    """
    remapped = [0 if sample[1] == 5 else sample[1] for sample in dataset]

    return np.array(remapped, dtype='uint8')


# In[91]:


# Vectorization
def vectorize(dataset):
    """Turn the text (element 0) of every sample into a list of word vectors.

    Each text is tokenized with ``cut_sentence`` and the tokens are embedded
    with ``sentence2vector``; progress is reported through ``tqdm``.

    Returns a new list with one entry per sample, in input order.
    """
    return [
        sentence2vector(cut_sentence(sample[0]))
        for sample in tqdm(dataset)
    ]

# Truncation and zero-padding
# Zero vector appended to sequences that are shorter than the target length.
padding = np.zeros(300, dtype='float64')

def unify(dataset, max_len):
    """Force every sequence in ``dataset`` to contain exactly ``max_len`` items.

    Works in place: shorter sequences are extended with references to the
    module-level ``padding`` vector, longer ones are replaced by their first
    ``max_len`` items, and sequences of the right length are left alone.
    Nothing is returned.
    """
    for idx in tqdm(range(len(dataset))):
        seq = dataset[idx]
        shortfall = max_len - len(seq)
        if shortfall > 0:
            # Too short: pad the existing list object up to max_len.
            seq.extend([padding] * shortfall)
        elif shortfall < 0:
            # Too long: keep only the leading max_len items.
            dataset[idx] = seq[:max_len]
            
def array2np(dataset):
    """Replace every element of ``dataset`` with ``np.array(element)``, in place.

    Progress is reported through ``tqdm``; nothing is returned.
    """
    total = len(dataset)
    for pos in tqdm(range(total)):
        converted = np.array(dataset[pos])
        dataset[pos] = converted

In [4]:
# Module-level resources read by the helpers above: cut_sentence filters
# against `stop_words`, and get_vector looks words up in `word2vec`.
stop_words=load_stopwords('./src/hit_stopwords.txt')
word2vec=load_word2vec('../sgns.zhihu.word.bz2')

加载词向量...


In [5]:
# Load the train/test splits via the project helper utils.data_loader.load_data.
# Downstream code reads each sample as sample[0] = text, sample[1] = label.
train,test=load_data('./augmented/')

加载完成，测试集比例0.2
训练集198566条
测试集49642条


In [7]:
# Tokenize every training text and map its tokens to word vectors.
train_text=vectorize(train)

  0%|                                                                                                             | 0/198566 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\liufeng\AppData\Local\Temp\jieba.cache
Loading model cost 0.590 seconds.
Prefix dict has been built succesfully.
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 198566/198566 [00:38<00:00, 5224.24it/s]


In [8]:
# Pad or truncate every training sequence to exactly 30 vectors (in place).
unify(train_text,30)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 198566/198566 [00:00<00:00, 480514.88it/s]


In [9]:
# Training labels as a uint8 array; get_label stores label 5 as 0.
train_label=get_label(train)

In [14]:
# Convert each training sequence (a list of vectors) to a NumPy array, in place.
array2np(train_text)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 198566/198566 [00:11<00:00, 17117.46it/s]


In [19]:
# Stack the per-sample arrays into one training tensor.
# Presumably shaped (n_samples, 30, 300) given unify(..., 30) and the
# 300-dim padding vector — TODO confirm the embedding width of word2vec.
x_train=np.array(train_text)
y_train=train_label

In [20]:
# Same preprocessing as the training split, applied to the test split:
# vectorize -> pad/truncate to 30 vectors -> extract labels -> convert to arrays.
test_text=vectorize(test)
unify(test_text,30)
test_label=get_label(test)
array2np(test_text)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 49642/49642 [00:14<00:00, 3407.75it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 49642/49642 [00:00<00:00, 266666.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 49642/49642 [00:10<00:00, 4521.23it/s]


In [21]:
# Stack the per-sample arrays into one test tensor (built the same way as x_train).
x_test=np.array(test_text)
y_test=test_label

In [22]:
# Multi-layer perceptron: flatten each input sample, apply two ReLU hidden
# layers (128 and 64 units), and emit a 4-way softmax distribution.
MLP=tf.keras.models.Sequential([
    Flatten(),
    Dense(128,activation='relu'),
    Dense(64,activation='relu'),
    Dense(4,activation='softmax')
])

In [23]:
# Integer labels + softmax outputs: sparse categorical cross-entropy on
# probabilities (from_logits=False matches the softmax in the last layer).
MLP.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['sparse_categorical_accuracy']
)

In [24]:
# Train for 20 epochs with batch size 32, validating after every epoch.
# NOTE(review): the test split is passed as validation_data, so the validation
# metrics and the final test scores are computed on the same samples.
MLP.fit(x_train,y_train,batch_size=32,epochs=20,validation_data=(x_test,y_test),validation_freq=1)

Train on 198566 samples, validate on 49642 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x24f7306bd68>

In [25]:
# Predicted class index for each test sample: argmax over the model outputs.
p=tf.argmax(MLP.predict(x_test),axis=1)

In [26]:
def label_back(label):
    """Undo the 5 -> 0 remapping applied by ``get_label``.

    Args:
        label: Object exposing ``tolist()`` (e.g. a NumPy array) that holds
            class indices.

    Returns:
        list: The labels as a Python list with every 0 replaced by 5.
    """
    return [5 if value == 0 else value for value in label.tolist()]

In [27]:
# Map ground-truth and predicted class indices back to the original label
# ids (0 -> 5) before scoring.
label_truth=label_back(y_test)
label_predict=label_back(p.numpy())

In [28]:
from utils.score import score

In [29]:
# Evaluate predictions with the project helper utils.score.score; judging by
# the cell output it reports per-class precision/recall/F1 and saves the
# result to the given file — confirm against the helper's implementation.
score(label_truth,label_predict,output_filename='mlp_augmented.txt')

Unnamed: 0,Precision,Recall,F1
军事新闻,0.98,0.97,0.97
体育新闻,0.99,0.97,0.98
晋江小说,0.95,0.95,0.95
电影短评,0.94,0.94,0.94
综合,0.95,0.95,0.95



结果保存在mlp_augmented.txt
