In [151]:
'''
实验目的：实现基于logistic/softmax regression的文本分类

实验过程：
分析不同的特征和特征组合、损失函数、学习率对最终分类性能的影响

包含的知识点：
1）简单文本特征表示（词频、n-gram、TF-IDF）
2）分类器相关，损失函数、（随机）梯度下降、特征组合与特征选择
3）实际训练技巧：训练集/验证集/测试集的划分、shuffle

补充：
1）更加复杂的句子级特征暂时没有构建
2）可以使用纯numpy实现，窥探其中细节

参考：
https://scikit-learn.org/stable/
http://sklearn.apachecn.org/#/
https://zhuanlan.zhihu.com/p/37157010

'''

'\n实验目的：使用逻辑回归 + softmax 实现文本分类\n\n实验过程：\n分析不同的特征、损失函数、学习率对最终分类性能的影响\n尝试不同的梯度下降方式：批量梯度下降、随机梯度下降、小批量梯度下降\n每一轮迭代前进行shuffle\n\n构建的特征：\n词袋(Bag-of-Word)、n-gram、TF-IDF\n\n参考：http://sklearn.apachecn.org/#/\n'

In [152]:
import os
import torch
import numpy as np
import pandas as pd
import sklearn


dir_all_data='D:\\workspace\\nlp_beginer\\data\\task1_all_data.tsv'

In [153]:
print(os.getcwd())
data_all=pd.read_csv(dir_all_data,sep='\t')
#print(all_data.shape)    #(156060, 4)
#print(all_data.keys())   #['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
x_all=data_all['Phrase']
y_all=data_all['Sentiment']
print(x_all.shape)


D:\workspace\nlp_beginer
(156060,)


In [154]:
#划分验证集、测试集
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,  y_test = train_test_split(x_all,y_all, test_size=0.2)
x_train, x_val,y_train,y_test = train_test_split(x_train,y_train, test_size=0.25)
print(x_train.shape, x_val.shape, x_test.shape)

(93636,) (31212,) (31212,)


In [155]:
#提取文本计数特征
#对文本的单词进行计数，包括文本的预处理, 分词以及过滤停用词
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()  
x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)
print(x_train_counts.shape)

#在词汇表中一个单词的索引值对应的是该单词在整个训练的文集中出现的频率。
print(count_vect.vocabulary_.get(u'good')) 


(93636, 15169)
5794


In [167]:
#提取TF-IDF特征-word
#将各文档中每个单词的出现次数除以该文档中所有单词的总数：这些新的特征称之为词频tf。
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfVectorizer(analyzer='word',max_features=50000)
tfidf_transformer.fit(x_train)
x_train_tfidf_word = tfidf_transformer.transform(x_train)
x_test_tfidf_word = tfidf_transformer.transform(x_test)
print(x_train_tfidf_word.shape)


(93636, 15169)


In [168]:
#提取TF-IDF特征-ngram
#将各文档中每个单词的出现次数除以该文档中所有单词的总数：这些新的特征称之为词频tf。
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfVectorizer(analyzer='word',ngram_range=(2,3),max_features=50000)
tfidf_transformer.fit(x_train)
x_train_tfidf_ngram = tfidf_transformer.transform(x_train)
x_test_tfidf_ngram = tfidf_transformer.transform(x_test)
print(x_train_tfidf_ngram.shape)

(93636, 50000)


In [170]:
#合并特征
from scipy.sparse import hstack
train_features=x_train_counts
test_features=x_test_counts

train_features = hstack([x_train_counts,x_train_tfidf_word, x_train_tfidf_ngram])
test_features = hstack([x_test_counts,x_test_tfidf_word ,x_test_tfidf_ngram])

train_features.shape



(93636, 80338)

In [225]:
#训练分类器

#逻辑回归
# from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression(random_state=0, 
#                          solver='sag', #优化算法：liblinear、lbfgs、newton-cg、sag
#                          multi_class='multinomial' #分类方式：multinomial、ovr
# )

#朴素贝叶斯
#from sklearn.naive_bayes import MultinomialNB
#clf = MultinomialNB().fit(train_features, y_train)

from sklearn.linear_model import SGDClassifier
#SGDClassifier是一系列采用了梯度下降来求解参数的算法的集合，默认是SVM
clf = SGDClassifier(alpha=0.001,
                    loss='log',    #hinge代表SVM，log是逻辑回归
                    early_stopping=True,
                    eta0=0.001,
                    learning_rate='adaptive', #constant、optimal、invscaling、adaptive
                    max_iter=100 
                   )




In [230]:
#打乱数据，训练
from sklearn.utils import shuffle
train_features,y_train=shuffle(train_features,y_train )

clf.fit(train_features, y_train)


SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=True, epsilon=0.1, eta0=0.001, fit_intercept=True,
              l1_ratio=0.15, learning_rate='adaptive', loss='log', max_iter=100,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [231]:
#测试过程
predict = clf.predict(test_features)

In [232]:
#测试集的评估
print(np.mean(predict == y_test))

0.4651416122004357
