第06课：动手实战基于 ML 的中文短文本分类

In [1]:
import random
import jieba
import pandas as pd

In [2]:
#加载停用词
stopwords=pd.read_csv('stopwords.txt',index_col=False,quoting=3,sep="\t",names=['stopword'], encoding='utf-8')
stopwords=stopwords['stopword'].values

In [3]:
stopwords

array(['!', '"', '#', ..., '450', '22549', '22544'], dtype=object)

In [4]:
#加载语料
laogong_df = pd.read_csv('../data/06/beilaogongda.csv', encoding='utf-8', sep=',')
laopo_df = pd.read_csv('../data/06/beilaogongda.csv', encoding='utf-8', sep=',')
erzi_df = pd.read_csv('../data/06/beierzida.csv', encoding='utf-8', sep=',')
nver_df = pd.read_csv('../data/06/beinverda.csv', encoding='utf-8', sep=',')
#删除语料的nan行
laogong_df.dropna(inplace=True)
laopo_df.dropna(inplace=True)
erzi_df.dropna(inplace=True)
nver_df.dropna(inplace=True)
#转换
laogong = laogong_df.segment.values.tolist()
laopo = laopo_df.segment.values.tolist()
erzi = erzi_df.segment.values.tolist()
nver = nver_df.segment.values.tolist()

In [5]:
#定义分词和打标签函数preprocess_text
#参数content_lines即为上面转换的list
#参数sentences是定义的空list，用来储存打标签之后的数据
#参数category 是类型标签
def preprocess_text(content_lines, sentences, category):
    for line in content_lines:
        try:
            segs=jieba.lcut(line)
            segs = [v for v in segs if not str(v).isdigit()]#去数字
            segs = list(filter(lambda x:x.strip(), segs))   #去左右空格
            segs = list(filter(lambda x:len(x)>1, segs)) #长度为1的字符
            segs = list(filter(lambda x:x not in stopwords, segs)) #去掉停用词
            sentences.append((" ".join(segs), category))# 打标签
        except Exception:
            print(line)
            continue 

In [6]:
sentences = []
preprocess_text(laogong, sentences,0)
preprocess_text(laopo, sentences, 1)
preprocess_text(erzi, sentences, 2)
preprocess_text(nver, sentences, 3)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/yw/k7z_d_3567g16ss9plk47x9w0000gn/T/jieba.cache
Loading model cost 0.661 seconds.
Prefix dict has been built succesfully.


In [7]:
random.shuffle(sentences)

In [8]:
for sentence in sentences[:10]:
        print(sentence[0], sentence[1])  #下标0是词列表，1是标签

老公 人伤 通知 救护车 致电 持械 情况不明 民警 携带 防护 装备 1
报警 女儿 无需 民警 到场 3
报警 人称 女儿 持械 人伤 情况 不详 民警 到场 3
报警 老公 民警 到场 0
报警 丈夫 持械 人伤 无需 救护 民警 到场 民警 防护 设备 1
报警 老公 棍子 人伤 民警 携带 防护 装备 到场 1
报警 人称 儿子 民警 到场 2
报警 女儿 人伤 民警 到场 携带 防护 装备 3
报警 老公 民警 到场 1
报警 儿子 民警 到场 2


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(
    analyzer='word', # tokenise by character ngrams
    max_features=4000,  # keep the most common 1000 ngrams
)

In [11]:
from sklearn.model_selection import train_test_split
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1256)

In [12]:
vec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=4000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [13]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
print(classifier.score(vec.transform(x_test), y_test))

0.6357308584686775


In [15]:
pre = classifier.predict(vec.transform(x_test))

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(
    analyzer='word', # tokenise by character ngrams
    ngram_range=(1,4),  # use ngrams of size 1 and 2
    max_features=20000,  # keep the most common 1000 ngrams
)
vec.fit(x_train)
#用朴素贝叶斯算法进行模型训练
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
#对结果进行评分
print(classifier.score(vec.transform(x_test), y_test))

0.568445475638051


In [17]:
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(vec.transform(x_train), y_train)
print(svm.score(vec.transform(x_test), y_test))

0.5522041763341067


In [22]:
# import xgboost as xgb  
# from sklearn.model_selection import StratifiedKFold  
# import numpy as np
# # xgb矩阵赋值  
# xgb_train = xgb.DMatrix(vec.transform(x_train), label=y_train)  
# xgb_test = xgb.DMatrix(vec.transform(x_test)) 

  params = {  
            'booster': 'gbtree',     #使用gbtree
            'objective': 'multi:softmax',  # 多分类的问题、  
            # 'objective': 'multi:softprob',   # 多分类概率  
            #'objective': 'binary:logistic',  #二分类
            'eval_metric': 'merror',   #logloss
            'num_class': 4,  # 类别数，与 multisoftmax 并用  
            'gamma': 0.1,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。  
            'max_depth': 8,  # 构建树的深度，越大越容易过拟合  
            'alpha': 0,   # L1正则化系数  
            'lambda': 10,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。  
            'subsample': 0.7,  # 随机采样训练样本  
            'colsample_bytree': 0.5,  # 生成树时进行的列采样  
            'min_child_weight': 3,  
            # 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言  
            # 假设 h 在 0.01 附近，min_child_weight 为 1 叶子节点中最少需要包含 100 个样本。  
            'silent': 0,  # 设置成1则没有运行信息输出，最好是设置为0.  
            'eta': 0.03,  # 如同学习率  
            'seed': 1000,  
            'nthread': -1,  # cpu 线程数  
            'missing': 1 
        }  