## 1.加载数据

### 1.1 训练集

In [1]:
import pandas as pd

# Read the tab-separated training file with header=None, so pandas does not
# treat the first record as a header row.
train_df = pd.read_csv('/workspace/data/sohu_train.txt', sep='\t', header=None)
# Name the two columns: '分类' (category) and '内容' (content).
train_df.columns = ['分类', '内容']
# Last expression of the cell: preview the first rows.
train_df.head()

Unnamed: 0,分类,内容
0,娱乐,《青蛇》造型师默认新《红楼梦》额妆抄袭（图） 凡是看过电影《青蛇》的人，都不会忘记青白二蛇的...
1,娱乐,６．１６日剧榜　＜最后的朋友＞　亮最后杀招成功登顶 《最后的朋友》本周的电视剧排行榜单依然只...
2,娱乐,超乎想象的好看《纳尼亚传奇２：凯斯宾王子》 现时资讯如此发达，搜狐电影评审团几乎人人在没有看...
3,娱乐,吴宇森：赤壁大战不会出现在上集 “希望《赤壁》能给你们不一样的感觉。”对于自己刚刚拍完的影片...
4,娱乐,组图：《多情女人痴情男》陈浩民现场耍宝 陈浩民：外面的朋友大家好，现在是搜狐现场直播，欢迎《...


In [2]:
# Print the number of rows per value of the first column (the category).
for name, count in train_df.groupby(train_df.columns[0]).size().items():
    print(name, count)

体育 2000
健康 2000
女人 2000
娱乐 2000
房地产 2000
教育 2000
文化 2000
新闻 2000
旅游 2000
汽车 2000
科技 2000
财经 2000


### 1.2 测试集

In [3]:
# Load the tab-separated test file (read without a header row).
test_df = pd.read_csv('/workspace/data/sohu_test.txt', sep='\t', header=None)
# Print the number of rows per value of the first column.
label_counts = test_df.groupby(test_df.columns[0]).size()
for name, count in label_counts.items():
    print(name, count)

体育 1000
健康 1000
女人 1000
娱乐 1000
房地产 1000
教育 1000
文化 1000
新闻 1000
旅游 1000
汽车 1000
科技 1000
财经 1000


## 2.分词

### 2.1 加载停顿词

In [4]:
# Read the stopword file, one entry per line.
with open('/workspace/data/stopwords.txt', encoding='utf8') as file:
    line_list = file.readlines()

# Strip surrounding whitespace from every line; the set removes duplicates and
# gives fast membership tests during tokenisation.
stopword_list = list(map(str.strip, line_list))
stopword_set = set(stopword_list)
print('停顿词列表，即变量stopword_list中共有%d个元素' %len(stopword_list))
print('停顿词集合，即变量stopword_set中共有%d个元素' %len(stopword_set))

停顿词列表，即变量stopword_list中共有1452个元素
停顿词集合，即变量stopword_set中共有1213个元素


### 2.2 使用jieba库制作分词结果列表cutWords_list

In [5]:
import jieba
import time

# Tokenise every training article with jieba and drop stopwords.
cutWords_list = []
startTime = time.time()
content_series = train_df['内容'].astype(str)
for article_no, content in enumerate(content_series, start=1):
    # The second positional argument of jieba.cut is cut_all (full mode).
    cutWords = [k for k in jieba.cut(content, True) if k not in stopword_set]
    # Report elapsed time every 3000 articles.
    if article_no % 3000 == 0:
        usedTime = time.time() - startTime
        print('前%d篇文章分词共花费%.2f秒' %(article_no, usedTime))
    cutWords_list.append(cutWords)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.502 seconds.
Prefix dict has been built succesfully.


前3000篇文章分词共花费12.34秒
前6000篇文章分词共花费29.44秒
前9000篇文章分词共花费35.39秒
前12000篇文章分词共花费43.45秒
前15000篇文章分词共花费49.67秒
前18000篇文章分词共花费56.70秒
前21000篇文章分词共花费69.91秒
前24000篇文章分词共花费75.45秒


### 2.3 保存分词结果列表cutWords_list到文本文件

In [6]:
# Persist the tokenised articles: one article per line, tokens separated by a
# single space.
txtFilePath = '/workspace/output/cutWords_list.txt'
with open(txtFilePath, 'w', encoding='utf8') as file:
    file.writelines(' '.join(cutWords) + '\n' for cutWords in cutWords_list)

### 2.4 从文本文件加载分词结果列表cutWords_list

In [7]:
# Reload the tokenised articles written by the save cell (one article per
# line, tokens separated by a single space).
txtFilePath = '/workspace/output/cutWords_list.txt'
with open(txtFilePath, 'r', encoding='utf8') as file:
    # Strip the line terminator before splitting; otherwise the last token of
    # every article keeps a trailing '\n' and becomes a distinct, bogus word.
    cutWords_list = [line.rstrip('\n').split(' ') for line in file]

## 3.word2vec模型

### 3.1 word2vec模型实例化对象

In [27]:
from gensim.models import Word2Vec
startTime = time.time()
# Train word embeddings on the tokenised articles: 200-dimensional vectors,
# 10 passes over the corpus, ignoring words that occur fewer than 20 times.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` - confirm the installed version.
word2vec_model = Word2Vec(cutWords_list, size=200, iter=10, min_count=20)
usedTime = time.time() - startTime
print('形成word2vec模型共花费%.2f秒' %usedTime)

形成word2vec模型共花费176.25秒


### 3.2 通过word2vec对象的most_similar方法获取词义相近的词

In [31]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from libraries; consider
# narrowing the filter to a specific category.
warnings.filterwarnings('ignore')

In [29]:
# Show the words whose vectors are closest to '摄影' (photography).
word2vec_model.wv.most_similar('摄影')

[('摄影师', 0.5912884473800659),
 ('摄影家', 0.5197041034698486),
 ('摄影展', 0.4974055290222168),
 ('摄影机', 0.4763668477535248),
 ('摄影记者', 0.4559651017189026),
 ('摄影艺术', 0.45154887437820435),
 ('作曲', 0.44387632608413696),
 ('摄影奖', 0.4405069649219513),
 ('摄影棚', 0.43817073106765747),
 ('人体摄影', 0.4270936846733093)]

In [30]:
# Analogy query: '女人' + '先生' - '男人'. Queries go through the KeyedVectors
# object (`.wv`), as in the previous cell; calling most_similar directly on
# the Word2Vec model is deprecated.
word2vec_model.wv.most_similar(positive=['女人', '先生'], negative=['男人'], topn=1)

[('女士', 0.5280015468597412)]

### 3.3 使用pickle库保存 word2vec模型

In [36]:
import pickle 

# Serialise the trained word2vec model to disk with pickle.
pickleFilePath = '/workspace/output/word2vec_model.pickle'
with open(pickleFilePath, 'wb') as file:
    pickle.dump(word2vec_model, file)

### 3.4 使用pickle库加载word2vec模型

In [8]:
import pickle 

# Restore the word2vec model written by the save cell.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this notebook or another trusted source.
pickleFilePath = '/workspace/output/word2vec_model.pickle'
with open(pickleFilePath, 'rb') as file:
    word2vec_model = pickle.load(file)

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


## 4.特征工程

### 4.1 每篇文章的内容表示成向量

In [32]:
import numpy as np

def get_contentVector(cutWords, word2vec_model):
    """Represent one article as the mean of its word vectors.

    Parameters
    ----------
    cutWords : iterable of str
        Tokens of the article. It is consumed once, so a generator is fine.
    word2vec_model : object
        Trained model exposing its vectors through ``.wv``, which must
        support ``in``, indexing by word, and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``word2vec_model.wv.vector_size``. If none of
        the tokens is in the vocabulary, a zero vector is returned so that
        every article yields a vector of the same shape.
    """
    # Look words up through the KeyedVectors object; membership tests on the
    # model itself are deprecated.
    keyed_vectors = word2vec_model.wv
    vector_list = [keyed_vectors[k] for k in cutWords if k in keyed_vectors]
    if not vector_list:
        # The mean of an empty array would be a scalar NaN, which breaks the
        # stacked feature matrix and the downstream classifier.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    contentVector = np.array(vector_list).mean(axis=0)
    return contentVector

In [62]:
import time

# Turn every tokenised article into one averaged word vector and stack the
# results into the feature matrix X.
startTime = time.time()
contentVector_list = []
for article_no, cutWords in enumerate(cutWords_list, start=1):
    # Report elapsed time every 3000 articles.
    if article_no % 3000 == 0:
        usedTime = time.time() - startTime
        print('前%d篇文章内容表示成向量共花费%.2f秒' %(article_no, usedTime))
    contentVector_list.append(get_contentVector(cutWords, word2vec_model))
X = np.array(contentVector_list)

前3000篇文章内容表示成向量共花费21.11秒
前6000篇文章内容表示成向量共花费53.62秒
前9000篇文章内容表示成向量共花费64.50秒
前12000篇文章内容表示成向量共花费79.25秒
前15000篇文章内容表示成向量共花费90.29秒
前18000篇文章内容表示成向量共花费102.94秒
前21000篇文章内容表示成向量共花费127.81秒
前24000篇文章内容表示成向量共花费136.92秒


### 4.2 使用ndarray对象的dump方法保存文章向量化结果X

In [63]:
# Persist the feature matrix. ndarray.dump writes a pickle of the array, so
# the file is binary despite its .txt extension.
txtFilePath = '/workspace/output/X.txt'
X.dump(txtFilePath)

### 4.3 使用numpy库的load方法加载文章向量化结果

In [10]:
import numpy as np
txtFilePath = '/workspace/output/X.txt'
# The file was written with ndarray.dump, i.e. as a pickle. Recent NumPy
# releases refuse to unpickle unless allow_pickle=True is passed explicitly.
# Only load files produced by this notebook or another trusted source.
X = np.load(txtFilePath, allow_pickle=True)

## 5.模型训练

### 5.1 标签编码

In [11]:
from sklearn.preprocessing import LabelEncoder

# Map the category names in the '分类' column to integer class ids. The fitted
# encoder is reused later to encode the test labels and to name the classes.
labelEncoder = LabelEncoder()
y = labelEncoder.fit_transform(train_df['分类'])

### 5.2 检查特征矩阵和预测目标值

In [12]:
# Sanity check: X and y should have the same number of rows.
print(X.shape, y.shape)

(24000, 200) (24000,)


### 5.3 逻辑回归模型

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for a quick accuracy check. A fixed
# random_state makes the split, and therefore the reported score, reproducible
# across kernel restarts.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
logisticRegression_model = LogisticRegression()
logisticRegression_model.fit(train_X, train_y)
# Last expression of the cell: accuracy on the held-out split.
logisticRegression_model.score(test_X, test_y)

0.8033333333333333

### 5.4 使用pickle库保存逻辑回归模型

In [14]:
import pickle

# Serialise the fitted classifier to disk with pickle.
pickleFilePath = '/workspace/output/logisticRegression_model.pickle'
with open(pickleFilePath, 'wb') as file:
    pickle.dump(logisticRegression_model, file)

### 5.5 使用pickle库加载逻辑回归模型

In [15]:
import pickle

# Restore the classifier written by the save cell.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this notebook or another trusted source.
pickleFilePath = '/workspace/output/logisticRegression_model.pickle'
with open(pickleFilePath, 'rb') as file:
    logisticRegression_model = pickle.load(file)

## 6.模型评估

### 6.1 交叉验证

In [96]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Five random 70% / 20% train/test splits; the remaining 10% of the rows is
# left unused in each split. A fixed random_state makes the splits, and
# therefore the scores, reproducible across kernel restarts.
cv_split = ShuffleSplit(n_splits=5, train_size=0.7, test_size=0.2, random_state=42)
score_ndarray = cross_val_score(LogisticRegression(), X, y, cv=cv_split)
print(score_ndarray)

print(score_ndarray.mean())

[0.80041667 0.79833333 0.79645833 0.79041667 0.7975    ]
0.7966249999999999


### 6.2 混淆矩阵

#### 6.2.1 获取测试集文本内容向量化后的特征矩阵

In [40]:
import pickle
import numpy as np
import pandas as pd
import jieba 
import time
# Reload the word2vec model so this cell does not depend on the training cell
# having been run in the current kernel.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this notebook or another trusted source.
pickleFilePath = '/workspace/output/word2vec_model.pickle'
with open(pickleFilePath, 'rb') as file:
    word2vec_model = pickle.load(file)
def get_featureMatrix(content_series):
    """Vectorise raw article texts into a 2-D feature matrix.

    Each text is tokenised with jieba (second positional argument True, i.e.
    cut_all) and reduced to one vector by get_contentVector, using the
    module-level word2vec_model. One row is produced per input text, in
    input order.
    """
    return np.array([
        get_contentVector(jieba.cut(text, True), word2vec_model)
        for text in content_series
    ])

# Load the test set from the same absolute path used when it was first
# inspected, instead of a path relative to the current working directory.
test_df = pd.read_csv('/workspace/data/sohu_test.txt', sep='\t', header=None)
test_df.columns = ['分类', '内容']
startTime = time.time()
# Call the function under the name it is defined with (get_featureMatrix);
# the previous name getVectorMatrix does not exist on a fresh kernel.
# Cast to str as was done for the training text so non-string cells cannot
# break tokenisation.
featureMatrix = get_featureMatrix(test_df['内容'].astype(str))
usedTime = time.time() - startTime
print('测试集文本内容向量化花费时间%.2f秒' %usedTime)

  after removing the cwd from sys.path.


测试集文本内容向量化花费时间119.32秒


#### 6.2.2 绘制混淆矩阵

In [41]:
from sklearn.metrics import confusion_matrix

# Reload the fitted classifier from disk.
pickleFilePath = '/workspace/output/logisticRegression_model.pickle'
with open(pickleFilePath, 'rb') as file:
    logisticRegression_model = pickle.load(file)
# Encode the true test labels with the encoder fitted on the training labels,
# so class ids match the ones the classifier was trained on.
test_label_list = labelEncoder.transform(test_df['分类'])
predict_label_list = logisticRegression_model.predict(featureMatrix)
# Rows are the true classes, columns the predicted classes.
pd.DataFrame(confusion_matrix(test_label_list, predict_label_list), 
             columns=labelEncoder.classes_,
             index=labelEncoder.classes_ )

Unnamed: 0,体育,健康,女人,娱乐,房地产,教育,文化,新闻,旅游,汽车,科技,财经
体育,968,1,5,8,0,2,3,2,8,1,0,2
健康,0,827,58,2,3,14,5,45,6,1,12,27
女人,6,39,809,40,4,8,51,13,13,6,7,4
娱乐,5,0,52,803,0,2,109,13,7,1,8,0
房地产,3,0,8,2,897,3,4,17,16,3,7,40
教育,1,14,16,8,2,888,10,39,7,1,10,4
文化,4,6,62,161,8,16,619,48,37,9,30,0
新闻,13,32,22,15,25,56,55,591,29,10,49,103
旅游,4,8,26,6,20,1,36,29,844,8,11,7
汽车,5,4,4,2,2,6,1,6,13,929,9,19


### 6.3 报告表

In [42]:
from sklearn.metrics import precision_recall_fscore_support

def eval_model(test_label_list, predict_label_list, className_list):
    """Build a classification report as a DataFrame.

    Parameters
    ----------
    test_label_list : array-like
        True class labels.
    predict_label_list : array-like
        Predicted class labels.
    className_list : sequence
        Display name for each class, used as the 'Label' column.

    Returns
    -------
    pandas.DataFrame
        One row per class with Precision, Recall, F1 and Support, followed by
        a summary row labelled '总体' (index 999) that holds the
        support-weighted averages and the total support.
    """
    column_order = ['Label', 'Precision', 'Recall', 'F1', 'Support']
    # Per-class metrics.
    precision, recall, f_score, support = precision_recall_fscore_support(
        test_label_list, predict_label_list)
    per_class = pd.DataFrame({
        u'Label': className_list,
        u'Precision': precision,
        u'Recall': recall,
        u'F1': f_score,
        u'Support': support,
    })
    # Summary row: averages weighted by class support, plus total support.
    overall = pd.DataFrame(
        {
            u'Label': ['总体'],
            u'Precision': [np.average(precision, weights=support)],
            u'Recall': [np.average(recall, weights=support)],
            u'F1': [np.average(f_score, weights=support)],
            u'Support': [np.sum(support)],
        },
        index=[999],
    )
    return pd.concat([per_class, overall])[column_order]

# Report per-class and overall metrics on the test set, using the encoder's
# class names as row labels.
eval_model(test_label_list, predict_label_list, labelEncoder.classes_)

Unnamed: 0,Label,Precision,Recall,F1,Support
0,体育,0.956522,0.968,0.962227,1000
1,健康,0.858775,0.827,0.842588,1000
2,女人,0.740165,0.809,0.773053,1000
3,娱乐,0.757547,0.803,0.779612,1000
4,房地产,0.881139,0.897,0.888999,1000
5,教育,0.873156,0.888,0.880516,1000
6,文化,0.673558,0.619,0.645128,1000
7,新闻,0.641694,0.591,0.615305,1000
8,旅游,0.833992,0.844,0.838966,1000
9,汽车,0.928072,0.929,0.928536,1000
