In [1]:
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# 1、读取数据

In [2]:
df = pd.read_csv('./data/train.csv')

In [3]:
all_text = df['question1'].tolist() + df['question2'].tolist()

In [4]:
all_text = [' '.join(jieba.lcut(i)) for i in all_text]

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/4f/59cvb05d53b72qpx5q3tjx1m0000gn/T/jieba.cache
Loading model cost 0.553 seconds.
Prefix dict has been built succesfully.


过滤停用词（xgboost效果一般）

In [5]:
# import a stopwrods list
# stoplist = {i.strip() for i in open('stopword.txt',encoding='utf-8').readlines()} # this may create a set directly
# set([1, 2, 3 , 5]) # this first creates a list, then converts to a set

In [6]:
# all_text[:3]

In [7]:
# all_text = [' '.join([word for word in document.split() if word not in stoplist]) for document in all_text]

In [8]:
# all_text[:100]

# 2、数据向量化

In [9]:
tfidf_obj = TfidfVectorizer()

In [10]:
tfidf_obj.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [11]:
# Transform documents to document-term matrix.
# Uses the vocabulary and document frequencies (df) learned by fit (or fit_transform).
ques1_matrix = tfidf_obj.transform(all_text[:20000])
ques2_matrix = tfidf_obj.transform(all_text[20000:])

In [12]:
ques1_matrix

<20000x6957 sparse matrix of type '<class 'numpy.float64'>'
	with 87194 stored elements in Compressed Sparse Row format>

In [13]:
ques2_matrix

<20000x6957 sparse matrix of type '<class 'numpy.float64'>'
	with 90869 stored elements in Compressed Sparse Row format>

In [14]:
ques2_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
import numpy as np

In [16]:
feature_matrix = np.concatenate([ques1_matrix.toarray(),ques2_matrix.toarray()],axis=1) # concat horizontally

In [17]:
feature_matrix.shape

(20000, 13914)

# 3、特征导入模型

In [18]:
train_x = feature_matrix[:16000]

In [19]:
train_y = df['label'][:16000].tolist()

In [20]:
test_x = feature_matrix[16000:]

In [21]:
test_y = df['label'][16000:].tolist()

In [22]:
from sklearn.metrics import f1_score

## 3.1、线性回归 (不可用)

In [22]:
from sklearn.linear_model import LinearRegression

In [23]:
reg = LinearRegression(n_jobs=4)

In [None]:
reg.fit(train_x, train_y)

## 3.2、逻辑回归

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
lr = LogisticRegression(n_jobs=4)

In [25]:
lr.fit(train_x,train_y)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=4, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
pred_y = lr.predict(test_x)

In [29]:
f1_score(test_y,pred_y)

0.5629040278468423

## 3.3、随机森林

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
rf = RandomForestClassifier(n_jobs=4)

In [28]:
rf.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [29]:
pred_rf_y = rf.predict(test_x)

In [33]:
f1_score(test_y,pred_rf_y)

0.5747800586510263

## 3.4 Lasso (不可用)

In [32]:
from sklearn.linear_model import Lasso

In [34]:
las = Lasso(alpha=0.1)

In [35]:
las.fit(train_x,train_y)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [36]:
pred_las_y = las.predict(test_x)

In [37]:
f1_score(test_y,pred_las_y)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

## 3.5 KNN Classifier (有问题)

In [38]:
from sklearn.neighbors import KNeighborsClassifier

In [39]:
knc = KNeighborsClassifier(n_jobs=4)

In [40]:
knc.fit(train_x,train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=4, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
pred_knc_y = knc.predict(test_x)

## 3.6 Decision Tree Classifier

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [25]:
dtc = DecisionTreeClassifier()

In [26]:
dtc.fit(train_x,train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [27]:
pred_dtc_y = dtc.predict(test_x)

In [28]:
f1_score(test_y,pred_dtc_y)

0.5917602996254682

## 3.7 GBDT(Gradient Boosting Decision Tree) Classifier

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

In [31]:
gbc = GradientBoostingClassifier(n_estimators=200)

In [32]:
gbc.fit(train_x,train_y)

KeyboardInterrupt: 

# 5.1、参数调优

In [35]:
rf1 = RandomForestClassifier(max_leaf_nodes=3000, n_jobs=4)

In [36]:
rf1.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=3000,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [37]:
pred_rf_y = rf1.predict(test_x)

In [38]:
f1_score(test_y,pred_rf_y)

0.6104417670682731

# 5.2、xgboost模型

时间比较久

In [39]:
import xgboost

In [40]:
xgb = xgboost.XGBClassifier(n_jobs=8)

In [41]:
xgb.fit(train_x,train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=8,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [42]:
pred_xgb_y = xgb.predict(test_x)

### 结果不太好，可能过拟合

后续对数据再进行一下改进

In [43]:
f1_score(test_y,pred_xgb_y)

0.44046434494195685

In [44]:
rf = RandomForestClassifier(n_jobs=4)

In [45]:
rf.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [46]:
pred_rf_y = rf.predict(test_x)

In [47]:
f1_score(test_y,pred_rf_y)

0.5761012925349512

# 训练词向量

In [19]:
import gensim
import logging
import os.path
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence
from gensim.models import KeyedVectors



In [38]:
all_text_1 = df['question1'].tolist()
all_text_2 = df['question2'].tolist()

all_text_jieba = [jieba.lcut(q1) for q1 in all_text_1] + [jieba.lcut(q2) for q2 in all_text_2]


In [40]:
all_text_jieba[0:3]
model = word2vec.Word2Vec(all_text_jieba, hs=1,min_count=5,window=10,size=100)


In [41]:
for key in model.wv.similar_by_word('末梢神经', topn =100):
    print(key)

('`', 0.8035902976989746)
('神经病', 0.7125338912010193)
('根部', 0.6986055970191956)
('由', 0.6907028555870056)
('偏瘫', 0.6834913492202759)
('水肿', 0.6791844367980957)
('脱落', 0.6755310297012329)
('炎', 0.6748406887054443)
('下肢', 0.6700626611709595)
('腰痛', 0.6488524675369263)
('糖尿病足', 0.6457810401916504)
('胃轻瘫', 0.6388941407203674)
('缓解', 0.6323925852775574)
('症', 0.6206598281860352)
('神经痛', 0.6180165410041809)
('病变', 0.6153590083122253)
('腿疼', 0.6138206720352173)
('由于', 0.6073352098464966)
('溃烂', 0.6060768365859985)
('黄斑', 0.604935884475708)
('因', 0.5988547801971436)
('肾性', 0.5925606489181519)
('目的', 0.5787270069122314)
('神经炎', 0.5744316577911377)
('病发症', 0.5733441710472107)
('足部', 0.5715343952178955)
('脱离', 0.5636096596717834)
('干结', 0.5609884262084961)
('肾衰竭', 0.5600886344909668)
('部', 0.559586763381958)
('冶疗', 0.556767463684082)
('脚烂', 0.5564866065979004)
('玻璃体', 0.5556192994117737)
('堵塞', 0.5553321838378906)
('尿毒症', 0.5540518760681152)
('青年', 0.5536859631538391)
('脚部', 0.5512212514877319)
