In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import jieba

# 1、读取数据

In [3]:
df = pd.read_csv('./data/train.csv')

In [4]:
all_text = df['question1'].tolist() + df['question2'].tolist()

In [5]:
all_text = [' '.join(jieba.lcut(i)) for i in all_text]

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/4f/59cvb05d53b72qpx5q3tjx1m0000gn/T/jieba.cache
Loading model cost 0.534 seconds.
Prefix dict has been built succesfully.


In [6]:
print(jieba.lcut('你好吗今天我来大连理工大学'))

['你好', '吗', '今天', '我来', '大连理工大学']


In [7]:
print(jieba.cut('你好吗今天我来大连理工大学'))

<generator object Tokenizer.cut at 0x11eb69480>


In [8]:
test = jieba.cut('你好吗今天我来大连理工大学')
dir(test)

['__class__',
 '__del__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__name__',
 '__ne__',
 '__new__',
 '__next__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'close',
 'gi_code',
 'gi_frame',
 'gi_running',
 'gi_yieldfrom',
 'send',
 'throw']

# 2、数据向量化

In [9]:
tfidf_obj = TfidfVectorizer()

In [10]:
tfidf_obj.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [11]:
# Transform documents to document-term matrix.
# Uses the vocabulary and document frequencies (df) learned by fit (or fit_transform).
ques1_matrix = tfidf_obj.transform(all_text[:20000])
ques2_matrix = tfidf_obj.transform(all_text[20000:])

In [12]:
ques1_matrix

<20000x6957 sparse matrix of type '<class 'numpy.float64'>'
	with 87194 stored elements in Compressed Sparse Row format>

In [13]:
ques2_matrix

<20000x6957 sparse matrix of type '<class 'numpy.float64'>'
	with 90869 stored elements in Compressed Sparse Row format>

In [14]:
ques2_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
import numpy as np

In [16]:
feature_matrix = np.concatenate([ques1_matrix.toarray(),ques2_matrix.toarray()],axis=1) # concat horizontally

In [17]:
feature_matrix.shape

(20000, 13914)

# 3、特征导入模型

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
train_x = feature_matrix[:16000]

In [20]:
train_y = df['label'][:16000].tolist()

In [21]:
test_x = feature_matrix[16000:]

In [22]:
test_y = df['label'][16000:].tolist()

In [23]:
lr = LogisticRegression()

# 3.1、模型训练

In [24]:
lr.fit(train_x,train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# 3.2、模型预测

In [25]:
pred_y = lr.predict(test_x)

# 4、模型评估

In [26]:
from sklearn.metrics import f1_score

In [27]:
f1_score(test_y,pred_y)

0.5629040278468423

# 5、提高结果

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
rf = RandomForestClassifier()

In [30]:
rf.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [31]:
pred_rf_y = rf.predict(test_x)

In [32]:
f1_score(test_y,pred_rf_y)

0.5812781755502519

# 5.1、参数调优

In [33]:
rf1 = RandomForestClassifier(max_leaf_nodes=3000)

In [34]:
rf1.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=3000,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [35]:
pred_rf_y = rf1.predict(test_x)

In [36]:
f1_score(test_y,pred_rf_y)

0.6132284238996802

# 5.2、xgboost模型

时间比较久

In [37]:
import xgboost

In [38]:
xgb = xgboost.XGBClassifier(n_jobs=2)

In [39]:
xgb.fit(train_x,train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=2,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [40]:
pred_xgb_y = xgb.predict(test_x)

### 结果不太好，可能过拟合

后续对数据再进行一下改进

In [41]:
f1_score(test_y,pred_xgb_y)

0.44046434494195685

In [42]:
rf = RandomForestClassifier()

In [43]:
rf.fit(train_x,train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [44]:
pred_rf_y = rf.predict(test_x)

In [45]:
f1_score(test_y,pred_rf_y)

0.5918904686677198

# 训练词向量

In [46]:
import gensim

In [47]:
model = gensim.models.word2vec.Word2Vec()

In [48]:
model.train()

RuntimeError: you must first build vocabulary before training the model