# HSK语料级别自动分类

## 导入数据集预处理、特征工程和模型训练所需库

In [1]:
# -*- coding: utf-8 -*-
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import text
from sklearn import decomposition, ensemble
from sklearn.naive_bayes import MultinomialNB

import pandas as pd
import numpy as np
import textblob, string, jieba, re, os, sys
from keras import layers, models, optimizers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# 加载数据集
data = open('train.txt').read()
labels, qs_class, texts = [], [], []
content_seg = []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])# label
    qs_class.append(content[1])# 题型
    texts.append(re.sub('\W', '', content[2]))# 未分词的文本
    content_temp = jieba.cut(texts[i]) # 分词处理
    content_seg.append(" ".join(content_temp))# 分词存入content_seg

trainDF = pd.DataFrame()
labels.pop(0) # 删除数据第一行的“级别”,“text”
content_seg.pop(0)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/76/5s479vb92j38cy72xv2th3vm0000gn/T/jieba.cache
Loading model cost 0.707 seconds.
Prefix dict has been built succesfully.


'text'

In [3]:
# 加载HSK词汇表，作为另一部分的训练数据
HSK_df = pd.read_csv('HSK.csv')

train_X_add = HSK_df['text']
train_y_add = HSK_df['label'].tolist()
encoder = preprocessing.LabelEncoder()

train_y_add = encoder.fit_transform(train_y_add)

In [4]:
# 加载中文停用词表
word = open('stop_word_zh.txt').read()
st_word = []
for i, line in enumerate(word.split("\n")):
    content = line.split()
    st_word.append(content[0])
      
my_stop_words = text.ENGLISH_STOP_WORDS.union(st_word)

## 特征工程

In [37]:
# 使用TfidfVectorizer初始化向量空间模型

vectorizer = TfidfVectorizer(sublinear_tf = True, \
                             decode_error = 'ignore', stop_words = my_stop_words)
transformer = TfidfTransformer()# 统计每个词语的TF-IDF权值

# 文本转化为词频矩阵
content_seg.extend(train_X_add)
labels.extend(train_y_add)
content_tdm = vectorizer.fit_transform(content_seg)


In [38]:
# 将数据集分为训练集和验证集
train_X, test_X, train_y, test_y = model_selection.train_test_split(content_tdm, labels, test_size = 0.2)
train_y = encoder.fit_transform(train_y) # label编码为目标变量
test_y = encoder.fit_transform(test_y)

## 训练分类器

In [39]:
# 训练分类器
clf = MultinomialNB().fit(train_X, train_y)

pred_y = clf.predict(test_X)

def metrics_result(actual, predict):
    print('accuracy:{0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted')))
#     print('recall:{0:0.3f}'.format(metrics.recall_score(actual, predict, average='weighted')))

metrics_result(test_y, pred_y)

accuracy:0.881


  'precision', 'predicted', average, warn_for)


In [56]:
# 预测结果
data_ans = open('test.txt').read()
qs_ans, texts_out, texts_ans, temp_ans, seg_ans = [], [], [], [], []
for i, line in enumerate(data_ans.split("\n")):
    content_ans = line.split()
    qs_ans.append(content_ans[0])
    texts_out.append(content_ans[1])
    texts_ans.append(re.sub('\W', '', content_ans[1]))
    temp_ans = jieba.cut(texts_ans[i]) # 分词处理
    seg_ans.append(" ".join(temp_ans))# 分词存入content_seg
seg_ans.pop(0)

ans_tdm = vectorizer.transform(seg_ans)

'text'