In [1]:
'''
实验目的：实现基于logistic/softmax regression的文本分类

实验内容：
1）简单文本特征表示（词频、n-gram、TF-IDF）、特征组合与特征选择
2）模型相关：（随机）梯度下降、损失函数、学习率
3）实际操作：训练集/验证集/测试集的划分、shuffle

补充：
1）更加复杂的句子级特征暂时没有构建
2）可以使用纯numpy实现，比较懒所以用现成的库了

参考：
https://scikit-learn.org/stable/
http://sklearn.apachecn.org/#/
https://zhuanlan.zhihu.com/p/37157010

'''

'\n实验目的：实现基于logistic/softmax regression的文本分类\n\n实验内容：\n1）简单文本特征表示（词频、n-gram、TF-IDF）、特征组合与特征选择\n2）模型相关：（随机）梯度下降、损失函数、学习率\n3）实际操作：训练集/验证集/测试集的划分、shuffle\n\n补充：\n1）更加复杂的句子级特征暂时没有构建\n2）可以使用纯numpy实现，比较懒所以用现成的库了\n\n参考：\nhttps://scikit-learn.org/stable/\nhttp://sklearn.apachecn.org/#/\nhttps://zhuanlan.zhihu.com/p/37157010\n\n'

In [2]:
import os
import torch
import numpy as np
import pandas as pd
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import hstack
from sklearn.linear_model import SGDClassifier

dir_all_data='data\\task1_all_data.tsv'
#dir_all_data='D:\\workspace\\nlp_beginer_solution\\Task1\\data\\task1_all_data.tsv'

#### 读取数据与数据处理

In [3]:
#读取数据 
data_all=pd.read_csv(dir_all_data,sep='\t')
print("data_all.shape: ",data_all.shape)    #(156060, 4)
print("data_all.keys: ",data_all.keys())   #['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
print(data_all.head(2))

data_all.shape:  (156060, 4)
data_all.keys:  Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')
   PhraseId  SentenceId                                             Phrase  \
0         1           1  A series of escapades demonstrating the adage ...   
1         2           1  A series of escapades demonstrating the adage ...   

   Sentiment  
0          1  
1          2  


In [4]:
#取出要处理的列
x_all=data_all['Phrase']
y_all=data_all['Sentiment']
print(x_all.shape)   #(156060,)

(156060,)


In [5]:
#划分验证集、测试集
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,  y_test = train_test_split(x_all,y_all, test_size=0.2)
x_train, x_val,y_train,y_val = train_test_split(x_train,y_train, test_size=0.25)
print(x_train.shape, x_val.shape, x_test.shape)   #(93636,) (31212,) (31212,)

(93636,) (31212,) (31212,)


####  接下来要提取几个特征：文本计数特征、word级别的TF-IDF特征、ngram级别的TF-IDF特征      
#### Then---------->合并特征

In [6]:
#提取文本计数特征 -- 每个单词的数量
#对文本的单词进行计数，包括文本的预处理, 分词以及过滤停用词
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()  
x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)
print(x_train_counts.shape,x_test_counts.shape)  #(93636, 15188) (31212, 15188)  矩阵(句子-词汇）的维度，词表大小15188

#在词汇表中一个单词的索引值对应的是该单词在整个训练的文集中出现的频率。
#print(count_vect.vocabulary_.get(u'good'))    #5812     count_vect.vocabulary_是一个词典：word-id

(93636, 15183) (31212, 15183)


In [7]:
#提取TF-IDF特征 -- word级别的TF-IDF
#将各文档中每个单词的出现次数除以该文档中所有单词的总数：这些新的特征称之为词频tf。
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_transformer = TfidfVectorizer(analyzer='word',max_features=50000)
tfidf_transformer.fit(x_train)
x_train_tfidf_word = tfidf_transformer.transform(x_train)
x_test_tfidf_word = tfidf_transformer.transform(x_test)
print(x_train_tfidf_word.shape,x_test_tfidf_word.shape)

(93636, 15183) (31212, 15183)


In [8]:
#提取TF-IDF特征 - ngram级别的TF-IDF
#将各文档中每个单词的出现次数除以该文档中所有单词的总数：这些新的特征称之为词频tf。
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfVectorizer(analyzer='word',ngram_range=(2,3),max_features=50000)
tfidf_transformer.fit(x_train)
x_train_tfidf_ngram = tfidf_transformer.transform(x_train)
x_test_tfidf_ngram = tfidf_transformer.transform(x_test)
print(x_train_tfidf_ngram.shape, x_test_tfidf_ngram.shape)

(93636, 50000) (31212, 50000)


In [9]:
#合并特征（特征组合与特征选择）
#from scipy.sparse import hstack
train_features=x_train_counts
test_features=x_test_counts

train_features = hstack([x_train_counts,x_train_tfidf_word, x_train_tfidf_ngram])
test_features = hstack([x_test_counts,x_test_tfidf_word ,x_test_tfidf_ngram])

print(train_features.shape)   #特征的最终维度


(93636, 80366)


#### 模型构建与训练

In [10]:
#训练分类器

#逻辑回归
# from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression(random_state=0, 
#                          solver='sag', #优化算法：liblinear、lbfgs、newton-cg、sag
#                          multi_class='multinomial' #分类方式：multinomial、ovr
# )

#朴素贝叶斯
#from sklearn.naive_bayes import MultinomialNB
#clf = MultinomialNB().fit(train_features, y_train)


#SGDClassifier是一系列采用了梯度下降来求解参数的算法的集合，默认是SVM
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(alpha=0.001,
                    loss='log',    #hinge代表SVM，log是逻辑回归
                    early_stopping=True,
                    eta0=0.001,
                    learning_rate='adaptive', #constant、optimal、invscaling、adaptive
                    max_iter=100 
                   )

In [11]:
#打乱数据，训练
from sklearn.utils import shuffle
train_features,y_train=shuffle(train_features,y_train )

clf.fit(train_features, y_train)


SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=True, epsilon=0.1, eta0=0.001, fit_intercept=True,
              l1_ratio=0.15, learning_rate='adaptive', loss='log', max_iter=100,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

#### 测试过程

In [12]:
#测试过程
predict = clf.predict(test_features)

In [13]:
#测试集的评估
print(np.mean(predict == y_test))

0.5453992054338075
