In [1]:
import os
import jieba
import numpy as np
import pandas as pd
from pprint import pprint
from functools import reduce
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
# classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [2]:
%%time
excel = pd.ExcelFile(r'.\..\..\data\爬虫数据.xlsx')
sheets = excel.sheet_names
docs = [{
    'name': name,
    'texts':[doc for doc in list(excel.parse(name).desc) if type(doc) is str],
} for name in sheets]

Wall time: 674 ms


In [3]:
[len(doc['texts']) for doc in docs]

[979, 979, 1354, 902, 933, 1247, 958]

In [4]:
[doc['name'] for doc in docs]

['财经', '科技', '汽车', '娱乐', '食品', '房产', '体育']

In [5]:
%%time
# features
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda text: jieba.lcut(text), lowercase=False)
all_texts  = list(reduce(lambda x, y: x + y, [doc['texts'] for doc in docs]))

tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)
tfidf_matrix.shape

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zjxua\AppData\Local\Temp\jieba.cache
Loading model cost 0.747 seconds.
Prefix dict has been built succesfully.


Wall time: 37 s


In [6]:
labels = np.hstack([np.full(len(doc['texts']), i) for i, doc in enumerate(docs)])
labels.shape

(7352,)

In [7]:
train_flag = np.random.uniform(0, 1, len(labels)) <= .75
train_flag.shape

(7352,)

In [8]:
train, test = tfidf_matrix[train_flag == True], tfidf_matrix[train_flag == False]
train_label , test_label = labels[train_flag == True], labels[train_flag == False]
print("shape\t\twhole dataset\ttrain\t\ttest")
print("features: \t{0}\t{1}\t{2}".format(tfidf_matrix.shape, train.shape, test.shape))
print("labels: \t{0}\t\t{1}\t\t{2}".format(labels.shape, train_label.shape, test_label.shape))

shape		whole dataset	train		test
features: 	(7352, 129403)	(5510, 129403)	(1842, 129403)
labels: 	(7352,)		(5510,)		(1842,)


# RandomForestClassifier
泛化能力极强的模型，即使是稀疏数据表现也不差

In [9]:
%%time
clf = RandomForestClassifier(n_jobs=2)
print(clf.fit(train, train_label))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Wall time: 1.27 s


In [10]:
%%time
predict = clf.predict(test)
print("f1 score: ", f1_score(test_label, predict, labels=range(len(sheets)), average='micro'))

f1 score:  0.791530944625
Wall time: 134 ms


# KNN
* K值默认为5

简单的方法莫名的好用，效果目前而言最好

In [11]:
%%time
clf = KNeighborsClassifier()
print(clf.fit(train, train_label))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
Wall time: 14 ms


In [12]:
%%time
predict = clf.predict(test)
print("f1 score: ", f1_score(test_label, predict, labels=range(len(sheets)), average='micro'))

f1 score:  0.867535287731
Wall time: 1.22 s


# 线性 SVM

训练时间预测时间都极长

也许是调参没调好，但是调一次参要太久了，先不搞

In [13]:
%%time
clf = SVC(kernel="linear", C=0.025)
print(clf.fit(train, train_label))

SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Wall time: 1min 42s


In [14]:
%%time
predict = clf.predict(test)
print("f1 score: ", f1_score(test_label, predict, labels=range(len(sheets)), average='micro'))

f1 score:  0.574918566775
Wall time: 20.5 s


In [15]:
%%time
clf = SVC(gamma=2, C=1)
print(clf.fit(train, train_label))

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=2, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Wall time: 1min 42s


In [16]:
%%time
predict = clf.predict(test)
print("f1 score: ", f1_score(test_label, predict, labels=range(len(sheets)), average='micro'))

f1 score:  0.916938110749
Wall time: 18.3 s


#  AdaBoost
默认算法SAMME.R表现极差

In [17]:
%%time
clf = AdaBoostClassifier()
print(clf.fit(train, train_label))

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Wall time: 28.3 s


In [18]:
%%time
predict = clf.predict(test)
print("f1 score: ", f1_score(test_label, predict, labels=range(len(sheets)), average='micro'))

f1 score:  0.445168295331
Wall time: 200 ms


In [19]:
%%time
clf = AdaBoostClassifier(algorithm='SAMME')
print(clf.fit(train, train_label))

AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=None)
Wall time: 28.6 s


In [20]:
%%time
predict = clf.predict(test)
print("f1 score: ", f1_score(test_label, predict, labels=range(len(sheets)), average='micro'))

f1 score:  0.795874049946
Wall time: 176 ms


# 朴素贝叶斯
* 不行，朴素贝叶斯需要dense data, 而文档特征是sparse的

emmmmmmmmmm把稀疏的矩阵转dense，空的地方填0之后，效果不差

In [21]:
%%time
clf = GaussianNB()
print(clf.fit(train.toarray(), train_label))

GaussianNB(priors=None)
Wall time: 21.8 s


In [22]:
%%time
predict = clf.predict(test.toarray())
print("f1 score: ", f1_score(test_label, predict, labels=range(len(sheets)), average='micro'))

f1 score:  0.869706840391
Wall time: 21.5 s
