In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import jieba

# 1、读取数据

In [3]:
df = pd.read_csv('./data/train.csv')

In [4]:
all_text = df['question1'].tolist() + df['question2'].tolist()

In [5]:
all_text = [' '.join(jieba.lcut(i)) for i in all_text]

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/4f/59cvb05d53b72qpx5q3tjx1m0000gn/T/jieba.cache
Loading model cost 0.538 seconds.
Prefix dict has been built succesfully.


过滤停用词（xgboost效果一般）

In [6]:
# import a stopwrods list
# stoplist = {i.strip() for i in open('stopword.txt',encoding='utf-8').readlines()} # this may create a set directly
# set([1, 2, 3 , 5]) # this first creates a list, then converts to a set

In [7]:
# all_text[:3]

In [8]:
# all_text = [' '.join([word for word in document.split() if word not in stoplist]) for document in all_text]

In [9]:
# all_text[:100]

# 2、数据向量化

In [10]:
tfidf_obj = TfidfVectorizer()

In [11]:
tfidf_obj.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [12]:
# Transform documents to document-term matrix.
# Uses the vocabulary and document frequencies (df) learned by fit (or fit_transform).
ques1_matrix = tfidf_obj.transform(all_text[:20000])
ques2_matrix = tfidf_obj.transform(all_text[20000:])

In [13]:
ques1_matrix

<20000x6957 sparse matrix of type '<class 'numpy.float64'>'
	with 87194 stored elements in Compressed Sparse Row format>

In [14]:
ques2_matrix

<20000x6957 sparse matrix of type '<class 'numpy.float64'>'
	with 90869 stored elements in Compressed Sparse Row format>

In [15]:
ques2_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
import numpy as np

In [17]:
feature_matrix = np.concatenate([ques1_matrix.toarray(),ques2_matrix.toarray()],axis=1) # concat horizontally

In [18]:
feature_matrix.shape

(20000, 13914)

# 3、特征导入模型

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
train_x = feature_matrix[:16000]

In [21]:
train_y = df['label'][:16000].tolist()

In [22]:
test_x = feature_matrix[16000:]

In [23]:
test_y = df['label'][16000:].tolist()

In [25]:
lr = LogisticRegression(n_jobs=4)

# 3.1、模型训练

In [26]:
lr.fit(train_x,train_y)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=4, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# 3.2、模型预测

In [27]:
pred_y = lr.predict(test_x)

# 4、模型评估

In [28]:
from sklearn.metrics import f1_score

In [29]:
f1_score(test_y,pred_y)

0.5629040278468423

# 5、提高结果

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
rf = RandomForestClassifier(n_jobs=4)

In [32]:
rf.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [33]:
pred_rf_y = rf.predict(test_x)

In [34]:
f1_score(test_y,pred_rf_y)

0.563950350782515

# 5.1、参数调优

In [35]:
rf1 = RandomForestClassifier(max_leaf_nodes=3000, n_jobs=4)

In [36]:
rf1.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=3000,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [37]:
pred_rf_y = rf1.predict(test_x)

In [38]:
f1_score(test_y,pred_rf_y)

0.6104417670682731

# 5.2、xgboost模型

时间比较久

In [39]:
import xgboost

In [40]:
xgb = xgboost.XGBClassifier(n_jobs=8)

In [41]:
xgb.fit(train_x,train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=8,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [42]:
pred_xgb_y = xgb.predict(test_x)

### 结果不太好，可能过拟合

后续对数据再进行一下改进

In [43]:
f1_score(test_y,pred_xgb_y)

0.44046434494195685

In [44]:
rf = RandomForestClassifier(n_jobs=4)

In [45]:
rf.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [46]:
pred_rf_y = rf.predict(test_x)

In [47]:
f1_score(test_y,pred_rf_y)

0.5761012925349512

# 训练词向量

In [6]:
import gensim
import logging
import os.path
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [22]:
all_text_1 = df['question1'].tolist()
all_text_2 = df['question2'].tolist()
all_text = [[' '.join(jieba.lcut(q1)), ' '.join(jieba.lcut(q2))] for q1, q2 in zip(all_text_1, all_text_2)]

In [26]:
model = Word2Vec()

In [27]:
model.build_vocab(all_text)

In [28]:
model.train(all_text, total_examples=model.corpus_count, epochs=model.iter)

  """Entry point for launching an IPython kernel.


(17842, 200000)

In [33]:
dir(model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_check_input_data_sanity',
 '_check_training_sanity',
 '_clear_post_train',
 '_do_train_epoch',
 '_do_train_job',
 '_get_job_params',
 '_get_thread_working_mem',
 '_job_producer',
 '_load_specials',
 '_log_epoch_end',
 '_log_epoch_progress',
 '_log_progress',
 '_log_train_end',
 '_minimize_model',
 '_raw_word_count',
 '_save_specials',
 '_set_train_params',
 '_smart_save',
 '_train_epoch',
 '_train_epoch_corpusfile',
 '_update_job_params',
 '_worker_loop',
 '_worker_loop_corpusfile',
 'accuracy',
 'alpha',
 'batch_words',
 'build_vocab',
 'build_vocab_from_freq',
 'ca

In [40]:
model.wv['']

KeyError: "word '' not in vocabulary"