In [1]:
import jieba
import pandas as pd

# 1、读取数据

In [2]:
df = pd.read_csv('./data/train.csv')

## 1.1、数据分析


In [3]:
df.head()    # 表格内容

Unnamed: 0,question1,question2,label,category
0,艾滋病窗口期会出现腹泻症状吗,头疼腹泻四肢无力是不是艾滋病,0,aids
1,由于糖尿病引起末梢神经炎，怎么根治？,糖尿病末梢神经炎的治疗方法,1,diabetes
2,H型高血压，是通所说的高血脂？,高血压引起脑出血怎么抢救治疗,0,hypertension
3,糖尿病跟尿毒症有什么区别？,糖尿病人，尿酸只有4.6是什么原因造成的？,0,diabetes
4,你好，我60岁，患高血压，80135，爱喝酸奶可以吗？,高血压糖尿病人可以喝牛奶吗？,1,hypertension


## 1.2、 数据合并，分词

In [4]:
all_text = df['question1'].tolist() + df['question2'].tolist()

In [6]:
all_text[:3]

['艾滋病窗口期会出现腹泻症状吗', '由于糖尿病引起末梢神经炎，怎么根治？', 'H型高血压，是通所说的高血脂？']

In [7]:
all_text = [' '.join(jieba.lcut(i)) for i in all_text]

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/jh/r6n9d7bx1xsbyxmrvz4968zh0000gp/T/jieba.cache
Loading model cost 0.843 seconds.
Prefix dict has been built succesfully.


In [8]:
all_text[:3]

['艾滋病 窗口期 会 出现 腹泻 症状 吗',
 '由于 糖尿病 引起 末梢神经 炎 ， 怎么 根治 ？',
 'H 型 高血压 ， 是 通 所说 的 高血脂 ？']

过滤停用词（xgboost效果一般）

## 1.3、 停用词表

In [10]:
stoplist = {i.strip() for i in open('stopword.txt',encoding='utf-8').readlines()} # this may create a set directly
set([1, 2, 3 , 5]) # this first creates a list, then converts to a set

{1, 2, 3, 5}

In [11]:
all_text = [' '.join([word for word in document.split() if word not in stoplist]) for document in all_text]

In [13]:
all_text[:3]

['艾滋病 窗口期 会 出现 腹泻 症状', '糖尿病 引起 末梢神经 炎 根治', 'H 型 高血压 通 所说 高血脂']

# 2、数据向量化

## 2.1、特征抽取 

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tfidf_obj = TfidfVectorizer()

fit方法用于构建特征空间（也就是构建词典）

In [17]:
tfidf_obj.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

transform方法使用该空间将文本数据转化为特征矩阵


In [19]:
# Transform documents to document-term matrix.
# Uses the vocabulary and document frequencies (df) learned by fit (or fit_transform).
# ques1_matrix is the data from question 1
# ques2_matrix is the data from question 2
ques1_matrix = tfidf_obj.transform(all_text[:20000]) 
ques2_matrix = tfidf_obj.transform(all_text[20000:])

In [20]:
ques1_matrix

<20000x6847 sparse matrix of type '<class 'numpy.float64'>'
	with 75065 stored elements in Compressed Sparse Row format>

In [21]:
ques2_matrix

<20000x6847 sparse matrix of type '<class 'numpy.float64'>'
	with 78526 stored elements in Compressed Sparse Row format>

In [22]:
ques2_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

降维？

## 2.2、数据拼接

In [28]:
import numpy as np

In [29]:
feature_matrix = np.concatenate([ques1_matrix.toarray(),ques2_matrix.toarray()],axis=1) # concat horizontally

In [30]:
feature_matrix.shape

(20000, 13694)

In [48]:
feature_matrix[0:3]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# 3、特征导入模型（机器学习）

## 3.1、数据集划分（训练集，测试集）

In [35]:
train_x = feature_matrix[:16000]

In [49]:
train_x[0] # 两个 list合并后的特征

array([0., 0., 0., ..., 0., 0., 0.])

In [36]:
train_y = df['label'][:16000].tolist()   # answer 0 or 1

In [50]:
train_y[0]

0

In [37]:
test_x = feature_matrix[16000:]

In [38]:
test_y = df['label'][16000:].tolist()

In [39]:
from sklearn.metrics import f1_score

## 3.2、逻辑回归

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
lr = LogisticRegression(n_jobs=4)

In [51]:
lr.fit(train_x,train_y)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=4, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [52]:
pred_y = lr.predict(test_x)

In [54]:
pred_y[:3]

array([1, 1, 1])

In [56]:
f1_score(test_y,pred_y)

0.5689484126984126

## 3.3、随机森林

In [57]:
from sklearn.ensemble import RandomForestClassifier

In [58]:
rf = RandomForestClassifier(n_jobs=4)

In [59]:
rf.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [60]:
pred_rf_y = rf.predict(test_x)

In [61]:
f1_score(test_y,pred_rf_y)

0.5817115435883765

## 3.4 MultinomialNB

In [62]:
from sklearn.naive_bayes import MultinomialNB

In [63]:
mnb = MultinomialNB()

In [64]:
mnb.fit(train_x,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [65]:
pred_mnb_y = mnb.predict(test_x)

In [66]:
f1_score(test_y,pred_mnb_y)

0.5522949586155005

## 3.5 Decision Tree Classifier

In [67]:
from sklearn.tree import DecisionTreeClassifier

In [68]:
dtc = DecisionTreeClassifier()

In [69]:
dtc.fit(train_x,train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [70]:
pred_dtc_y = dtc.predict(test_x)

In [71]:
f1_score(test_y,pred_dtc_y)

0.5918723510346547

# 4、参数调优

In [72]:
rf1 = RandomForestClassifier(max_leaf_nodes=3000, n_jobs=4)

In [73]:
rf1.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=3000,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [74]:
pred_rf_y = rf1.predict(test_x)

In [75]:
f1_score(test_y,pred_rf_y)

0.6105675146771037

# 5.1、xgboost模型

时间比较久

In [39]:
import xgboost

In [40]:
xgb = xgboost.XGBClassifier(n_jobs=8)

In [41]:
xgb.fit(train_x,train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=8,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [42]:
pred_xgb_y = xgb.predict(test_x)

### 结果不太好，可能过拟合

后续对数据再进行一下改进

In [43]:
f1_score(test_y,pred_xgb_y)

0.44046434494195685

In [44]:
rf = RandomForestClassifier(n_jobs=4)

In [45]:
rf.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [46]:
pred_rf_y = rf.predict(test_x)

In [47]:
f1_score(test_y,pred_rf_y)

0.5761012925349512

# 训练词向量

In [76]:
import gensim
import logging
import os.path
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence
from gensim.models import KeyedVectors



In [78]:
all_text[:3]

['艾滋病 窗口期 会 出现 腹泻 症状', '糖尿病 引起 末梢神经 炎 根治', 'H 型 高血压 通 所说 高血脂']

In [84]:
all_text_1 = df['question1'].tolist()
all_text_2 = df['question2'].tolist()
all_text_jieba = [jieba.lcut(q1) for q1 in all_text_1] + [jieba.lcut(q2) for q2 in all_text_2]


In [85]:
all_text_jieba[0:3]
model = word2vec.Word2Vec(all_text_jieba, hs=1,min_count=5,window=10,size=100)


In [86]:
for key in model.wv.similar_by_word('末梢神经', topn =10):
    print(key)

('`', 0.8568969964981079)
('神经病', 0.7268336415290833)
('由', 0.6792821884155273)
('病变', 0.6747841835021973)
('脱落', 0.6728172302246094)
('胃轻瘫', 0.6696387529373169)
('下肢', 0.66072016954422)
('由于', 0.6573159098625183)
('水肿', 0.655470609664917)
('腰痛', 0.654437780380249)
