In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Embedding, Dense, Activation, Dropout, BatchNormalization
from tensorflow.keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize

In [2]:
data = pd.read_excel(r'D:\datasets\Chinese_NLP6474\复旦大学中文文本分类语料.xlsx', 'sheet1')

In [3]:
data.head()

Unnamed: 0,分类,正文
0,艺术,﻿【 文献号 】1-2432\n【原文出处】出版发行研究\n【原刊地名】京\n【原刊期号】1...
1,艺术,﻿【 文献号 】1-2435\n【原文出处】扬州师院学报：社科版\n【原刊期号】199504...
2,艺术,﻿【 文献号 】1-2785\n【原文出处】南通师专学报：社科版\n【原刊期号】199503...
3,艺术,﻿【 文献号 】1-3021\n【原文出处】社会科学战线\n【原刊地名】长春\n【原刊期号】...
4,艺术,﻿【 文献号 】1-3062\n【原文出处】上海文化\n【原刊期号】199505\n【原刊页...


In [4]:
import jieba
# jieba.enable_parallel(64)
data['文本分词'] = data['正文'].apply(lambda i: jieba.cut(i))

In [5]:
data['文本分词'] =[' '.join(i) for i in data['文本分词']]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.701 seconds.
Prefix dict has been built successfully.


In [6]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    '''
    对数损失度量
    actual： 包含actual target classes的数组
    predicted：分类预测的结果矩阵，每个类别都有一个概率
    '''
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [7]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.分类.values)

In [8]:
# 将数据分为训练集合验证集
xtrain, xvalid, ytrain, yvalid = train_test_split(data.文本分词.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [9]:
stop_word_list = [line.strip() for line in open(r'D:\datasets\baidu_stopwords.txt', 'r', encoding='utf-8').readlines()]

In [12]:
def number_normalizer(tokens):
    '''
    将所有数字标记映射为一个占位符（Placeholder）。
    对于许多实际应用场景来说，以数字开头的tokens不是很有用，
    但这样tokens的存在也有一定相关性。 通过将所有数字都表示成同一个符号，可以达到降维的目的。
    '''
    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)

class NumerNormaalizeingVectorizer(TfidfVectorizer):
    def build_tokenizer(self):
        tokenize = super(NumerNormaalizeingVectorizer, self).build_tokenizer()
        return lambda doc: list(number_normalizer(tokenize(doc)))

In [13]:
tfv = NumerNormaalizeingVectorizer(min_df=3, max_df=0.5,
                                    max_features=None,
                                    ngram_range=(1, 2),
                                    use_idf=True,
                                    smooth_idf=True,
                                    stop_words=stop_word_list)
# 使用TF-IDF来fit测试集合验证集
tfv.fit(list(xtrain) + list(xvalid))
x_train_tfv = tfv.transform(xtrain)
x_valid_tfv = tfv.transform(xvalid)

In [14]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200,
                        colsample_bytree=0.8, subsample=0.8,
                        nthread=10, learning_rate=0.1)
clf.fit(x_train_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(x_valid_tfv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.177


### 使用Word2Vec

In [15]:
X=data['文本分词']
X=[i.split() for i in X]

In [22]:
import gensim

model = gensim.models.Word2Vec(X, min_count=5, window=8, size=100)
embeddings_index = dict(zip(model.wv.index2word, model.wv.vectors))

print('Found %s word vectors.' % len(embeddings_index))

Found 119790 word vectors.


In [27]:
def sent2vec(s):
    import jieba
    # jieba.enable_parallel()
    words = str(s).lower()

    words = jieba.cut(words)
    words = [w for w in words if w not in stop_word_list]

    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            pass
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt(( v ** 2).sum())

In [28]:
# 对训练集和验证集使用上述函数，进行文本向量化处理
xtrain_w2v = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_w2v = [sent2vec(x) for x in tqdm(xvalid)]

xtrain_w2v = np.array(xtrain_w2v)
xvalid_w2v = np.array(xvalid_w2v)

369/925 [00:53<01:03,  8.71it/s]


 40%|████      | 372/925 [00:53<01:00,  9.09it/s]


 40%|████      | 374/925 [00:53<01:01,  8.90it/s]


 41%|████      | 376/925 [00:54<01:06,  8.21it/s]


 41%|████      | 377/925 [00:54<01:12,  7.54it/s]


 41%|████      | 378/925 [00:54<01:11,  7.61it/s]


 41%|████      | 379/925 [00:54<01:07,  8.06it/s]


 41%|████      | 381/925 [00:54<01:10,  7.73it/s]


 41%|████▏     | 382/925 [00:55<01:08,  7.92it/s]


 41%|████▏     | 383/925 [00:55<01:05,  8.33it/s]


 42%|████▏     | 385/925 [00:55<01:03,  8.49it/s]


 42%|████▏     | 387/925 [00:55<01:19,  6.75it/s]


 42%|████▏     | 388/925 [00:55<01:14,  7.21it/s]


 42%|████▏     | 389/925 [00:56<01:21,  6.60it/s]


 42%|████▏     | 390/925 [00:56<01:18,  6.84it/s]


 42%|████▏     | 392/925 [00:56<01:06,  8.07it/s]


 42%|████▏     | 393/925 [00:56<01:29,  5.92it/s]


 43%|████▎     | 395/925 [00:56<01:22,  6.41it/s]


 43%|████▎     | 396/925 [00:57<01:53,  4.68it/s]


 43%|████▎     | 398/925 [00:

In [29]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200,
                        colsample_bytree=0.8, subsample=0.8,
                        nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_w2v, ytrain)
predictions = clf.predict_proba(xvalid_w2v)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


logloss: 0.414
