In [1]:
import os
import codecs
import jieba
import re

from sklearn.utils import shuffle

In [89]:
category = ['星座', '股票', '房产', '时尚', '体育', '社会', '家居', '游戏', '彩票', '科技', '教育', '时政', '娱乐', '财经']

# 每篇文档保留的文档数量
#per_class_max_docs = 1000

def load_data_to_mini(path, to_path, per_class_max_docs=1000):
    """
    处理清华大学语料库，将类别和文档处理成fasttext 所需要的格式
    :param path: 
    :param to_path: 
    :return: 
    """
    # 抽取后的语料库
    corpus = []
    if not os.path.isdir(path):
        print('path error')
    # 列举当前目录下的所有子列别目录
    with codecs.open(to_path, 'w+',encoding='utf-8',errors='ignore') as f:
        for files in os.listdir(path):
            curr_path = os.path.join(path, files)
            print(curr_path)
            if os.path.isdir(curr_path):
                count = 0
                docs = []
                for file in os.listdir(curr_path):
                    count += 1
                    if count > per_class_max_docs:
                        break
                    file_path = os.path.join(curr_path, file)
                    
                    # 读取文件中的内容
                    with codecs.open(file_path, 'r', encoding='utf-8',errors='ignore') as fd:
                        docs.append('__label__' + files + ' ' + ' '.join(jieba.cut(re.sub('[  \n\r\t]+', '', fd.readline()))))
                        s = ' '.join(jieba.cut(re.sub('[  \n\r\t]+', '', fd.read())))
                        f.write('__label__' + files + ' ' + s+'\n')
            corpus.append(docs)
#     # 将数据写到一个新的文件中
#     with codecs.open(to_path, 'a+', encoding='utf-8',errors='ignore') as f:
#         for docs in corpus:
#             for doc in docs:
#                 f.write(str(doc)+'\n')
#                 print(doc)
    return corpus


In [91]:
corpus = load_data_to_mini('../data/THUCNews', 'thu_data_all.txt', 1000)

../data/THUCNews\体育
../data/THUCNews\娱乐
../data/THUCNews\家居
../data/THUCNews\彩票
../data/THUCNews\房产
../data/THUCNews\教育
../data/THUCNews\时尚
../data/THUCNews\时政
../data/THUCNews\星座
../data/THUCNews\游戏
../data/THUCNews\社会
../data/THUCNews\科技
../data/THUCNews\股票
../data/THUCNews\财经


In [92]:
print('corpus size(%d,%d)' %(len(corpus), len(corpus[0])))
corpus[0][1]

corpus size(14,1000)


'__label__体育 商瑞华 首战 复仇 心切 中国 玫瑰 要 用 美国 方式 攻克 瑞典'

In [112]:
def split_data_with_label(corpus):
    """
    将数据划分为训练数据和样本标签
    :param corpus: 
    :return: 
    """
    input_x = []
    input_y = []

    tag = []
    if os.path.isfile(corpus):
        with codecs.open(corpus, 'rb',encoding='utf-8') as f:
            for line in f:
                tag.append(line)

    else:
        for docs in corpus:
            for doc in docs:
                tag.append(doc)
    tag = shuffle(tag)
    for doc in tag:
#         print(doc)
        index = doc.find(' ')
        input_y.append(doc[:index])
        input_x.append(doc[index + 1 :])

    # 打乱数据，避免在采样的时候出现类别不均衡现象
    # datasets = np.column_stack([input_x, input_y])
    # np.random.shuffle(datasets)
    # input_x = []
    # input_y = []
    # for i in datasets:
    #     input_x.append(i[:-1])
    #     input_y.append(i[-1:])
    return [input_x, input_y]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.scorer import make_scorer
from sklearn import linear_model
from sklearn import metrics

from time import time

def feature_extractor(input_x, case='tfidf', max_df=1.0, min_df=0.0):
    """
    特征抽取
    :param corpus: 
    :param case: 不同的特征抽取方法
    :return: 
    """
    return TfidfVectorizer(token_pattern='\w', ngram_range=(1,2), max_df=max_df, min_df=min_df).fit_transform(input_x)

# 拆分数据集
def split_data_to_train_and_test(corpus, indices=0.2, random_state=10, shuffle=True):
    """
    将数据划分为训练数据和测试数据
    :param corpus: [input_x]
    :param indices: 划分比例
    :random_state: 随机种子
    :param shuffle: 是否打乱数据
    :return: 
    """
    input_x, y = corpus

    # 切分数据集
    x_train, x_dev, y_train, y_dev = train_test_split(input_x, y, test_size=indices, random_state=10)
    print("Vocabulary Size: {:d}".format(input_x.shape[1]))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, x_dev, y_train, y_dev

### 模型训练

In [124]:
def fit_and_predicted(train_x, train_y, test_x, test_y, penalty='l2', C=1.0, solver='lbfgs'):
    """
    训练与预测
    :param train_x: 
    :param train_y: 
    :param test_x: 
    :param test_y: 
    :return: 
    """
    clf = linear_model.LogisticRegression(penalty=penalty, C=C, solver=solver, n_jobs=-1).fit(train_x, train_y)
    predicted = clf.predict(test_x)
    print(metrics.classification_report(test_y, predicted))
    print('accuracy_score: %0.5fs' %(metrics.accuracy_score(test_y, predicted)))

# 1. 加载语料
corpus = split_data_with_label('thu_data_all.txt')

input_x, y = corpus
# 2. 特征选择
input_x = feature_extractor(input_x, 'tfidf')
# 3.切分训练数据和测试数据
train_x, test_x, train_y, test_y = split_data_to_train_and_test([input_x, y])

# 4. 训练以及测试
t0 = time()
print('\t\t使用 max_df,min_df=(1.0,0.0) 进行特征选择的逻辑回归文本分类\t\t')

fit_and_predicted(train_x, train_y, test_x, test_y)
print('time uesed: %0.4fs' %(time() - t0))

Vocabulary Size: 917401
Train/Dev split: 11200/2801
		使用 max_df,min_df=(1.0,0.0) 进行特征选择的逻辑回归文本分类		
              precision    recall  f1-score   support

                   0.00      0.00      0.00         1
 __label__体育       1.00      0.99      0.99       211
 __label__娱乐       0.98      0.98      0.98       205
 __label__家居       0.88      0.99      0.93       191
 __label__彩票       0.99      0.99      0.99       188
 __label__房产       0.94      0.95      0.95       192
 __label__教育       0.95      0.86      0.90       210
 __label__时尚       0.98      0.93      0.95       202
 __label__时政       0.89      0.93      0.91       220
 __label__星座       0.97      0.98      0.98       187
 __label__游戏       0.91      0.94      0.92       204
 __label__社会       0.88      0.89      0.89       228
 __label__科技       0.93      0.90      0.91       178
 __label__股票       0.95      0.89      0.91       200
 __label__财经       0.88      0.89      0.89       184

   micro avg       0.94      0.94  

  'precision', 'predicted', average, warn_for)


### 模型训练（加入交叉验证）

In [127]:
def fit_and_predicted_use_CV(train_x, train_y, test_x, test_y, penalty='l2', C=1.0, solver='lbfgs', cv=10):
    """
    训练与预测
    :param train_x: 
    :param train_y: 
    :param test_x: 
    :param test_y: 
    :return: 
    """
    clf = linear_model.LogisticRegressionCV(penalty=penalty, C=C, solver=solver, n_jobs=-1, cv=cv).fit(train_x, train_y)
    predicted = clf.predict(test_x)
    print(metrics.classification_report(test_y, predicted))
    print('accuracy_score: %0.5fs' %(metrics.accuracy_score(test_y, predicted)))
    


input_x, y = corpus
input_x = feature_extractor(input_x, 'tfidf')
# 切分训练数据和测试数据
train_x, test_x, train_y, test_y = split_data_to_train_and_test([input_x, y])
# 训练以及测试
t0 = time()
print('\t 使用 max_df,min_df=(1.0,0.0) 进行特征选择的逻辑回归文本分类\t\t\n')
fit_and_predicted(train_x, train_y, test_x, test_y)
print('time uesed: %0.4fs' %(time() - t0))
    

Vocabulary Size: 917401
Train/Dev split: 11200/2801
	 使用 max_df,min_df=(1.0,0.0) 进行特征选择的逻辑回归文本分类		

              precision    recall  f1-score   support

                   0.00      0.00      0.00         1
 __label__体育       1.00      0.99      0.99       211
 __label__娱乐       0.98      0.98      0.98       205
 __label__家居       0.88      0.99      0.93       191
 __label__彩票       0.99      0.99      0.99       188
 __label__房产       0.94      0.95      0.95       192
 __label__教育       0.95      0.86      0.90       210
 __label__时尚       0.98      0.93      0.95       202
 __label__时政       0.89      0.93      0.91       220
 __label__星座       0.97      0.98      0.98       187
 __label__游戏       0.91      0.94      0.92       204
 __label__社会       0.88      0.89      0.89       228
 __label__科技       0.93      0.90      0.91       178
 __label__股票       0.95      0.89      0.91       200
 __label__财经       0.88      0.89      0.89       184

   micro avg       0.94      0.94 