## 1. 导入数据集

In [1]:
import pandas as pd

In [2]:
data_train = pd.read_csv('./train.tsv', sep='\t')
data_test = pd.read_csv('./test.tsv', sep='\t')

In [3]:
data_train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


电影评论的情感标签包括5类，分别是：
+ 0 - negative
+ 1 - somewhat negative
+ 2 - neutral
+ 3 - somewhat positive
+ 4 - positive

In [4]:
data_train.shape

(156060, 4)

In [5]:
data_test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [6]:
data_test.shape

(66292, 3)

## 2. 构建语料库

In [8]:
train_sentences = data_train['Phrase']
test_sentences = data_test['Phrase']

In [9]:
# concat train and test data\
sentences = pd.concat([train_sentences, test_sentences])

In [10]:
sentences.shape

(222352,)

In [12]:
label = data_train["Sentiment"]

In [13]:
label.shape

(156060,)

In [14]:
# import stopwords
stop_words = open('./stop_words.txt', encoding='utf-8').read().splitlines()

In [15]:
stop_words

["\ufeffain'",
 'happy',
 'isn',
 'ain',
 'al',
 'couldn',
 'didn',
 'doesn',
 'hadn',
 'hasn',
 'haven',
 'sn',
 'll',
 'mon',
 'shouldn',
 've',
 'wasn',
 'weren',
 'won',
 'wouldn',
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'t",
 "'ve",
 'ZT',
 'ZZ',
 'a',
 "a's",
 'able',
 'about',
 'above',
 'abst',
 'accordance',
 'according',
 'accordingly',
 'across',
 'act',
 'actually',
 'added',
 'adj',
 'adopted',
 'affected',
 'affecting',
 'affects',
 'after',
 'afterwards',
 'again',
 'against',
 'ah',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'announce',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'apparently',
 'appear',
 'appreciate',
 'appropriate',
 'approximately',
 'are',
 'area',
 'areas',
 'aren',
 "aren't",
 'arent',
 'arise',
 'around',
 'as',
 'aside',
 'ask',
 'asked',
 'asking',
 

## 3. 使用词袋模型进行文本特征工程

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
co = CountVectorizer(
    analyzer='word',
    ngram_range=(1,4),
    stop_words=stop_words,
    max_features=150000
)

In [17]:
# 构建词袋模型
co.fit(sentences)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=150000, min_df=1,
        ngram_range=(1, 4), preprocessor=None,
        stop_words=["\ufeffain'", 'happy', 'isn', 'ain', 'al', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'sn', 'll', 'mon', 'shouldn', 've', 'wasn', 'weren', 'won', 'wouldn', "'d", "'ll", "'m", "'re", "'s", "'t", "'ve", 'ZT', 'ZZ', 'a', "a's", 'able', 'about', 'above', 'abst', 'accordance', 'accor...', ',', '·', '￥', '……', '（', '）', '——', '、', '：', '；', '“', '’', '《', '》', '，', '。', '、', '？', '★ '],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [33]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test = train_test_split(train_sentences,label,random_state=1234)

In [34]:
x_train[1]

'A series of escapades demonstrating the adage that what is good for the goose'

In [35]:
x_train[2]

'A series'

In [36]:
# 用上面构建的词袋模型，把训练集和验证集中的每一个词都进行特征工程，变成向量
x_train = co.transform(x_train)
x_test = co.transform(x_test)

In [37]:
x_train[1]

<1x150000 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

## 4. 构建分类器算法模型

In [38]:
# Logistic Classification
import warnings
warnings.filterwarnings('ignore')

In [39]:
from sklearn.linear_model import LogisticRegression
lg1 = LogisticRegression()
lg1.fit(x_train,y_train)
print('词袋方法进行文本特征工程，使用sklearn默认的逻辑回归分类器，验证集上的预测准确率：',lg1.score(x_test,y_test))

词袋方法进行文本特征工程，使用sklearn默认的逻辑回归分类器，验证集上的预测准确率： 0.6430603613994618


In [40]:
# 多项式朴素贝叶斯分类器
from sklearn.naive_bayes import MultinomialNB
classifer = MultinomialNB()
classifer.fit(x_train,y_train)
print('词袋方法进行文本特征工程，使用sklearn默认的多项式朴素贝叶斯分类器，验证集上的预测准确率为：',classifer.score(x_test,y_test))

词袋方法进行文本特征工程，使用sklearn默认的多项式朴素贝叶斯分类器，验证集上的预测准确率为： 0.6084070229398949


## 5. 使用TF-IDF模型进行文本特征工程

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1,4),
    max_features=150000
)

In [42]:
tf.fit(sentences)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=150000, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [43]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(train_sentences,label,random_state=1234)

In [44]:
x_train = tf.transform(x_train)
x_test = tf.transform(x_test)

In [45]:
x_train[1]

<1x150000 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

## 6. TF-IDF构建特征工程后构建分类器

In [47]:
classifer = MultinomialNB()
classifer.fit(x_train,y_train)
print('TF-IDF方法进行文本特征工程，使用sklearn默认的多项式朴素贝叶斯分类器，验证集上的预测准确率：',classifer.score(x_test,y_test))

TF-IDF方法进行文本特征工程，使用sklearn默认的多项式朴素贝叶斯分类器，验证集上的预测准确率： 0.6045367166474432


In [48]:
# 逻辑回归分类器
lg1 = LogisticRegression(C=3,dual=True)
lg1.fit(x_train,y_train)
print('TF-IDF方法进行文本特征工程，使用增加了两个参数后，验证集上的预测准确率：',lg1.score(x_test,y_test))

TF-IDF方法进行文本特征工程，使用增加了两个参数后，验证集上的预测准确率： 0.6533384595668332


In [49]:
# 使用网格搜索来寻找最优参数
from sklearn.model_selection import GridSearchCV
param_grid = {"C":range(1,10),
             "dual":[True,False]}
lgGS = LogisticRegression()
grid = GridSearchCV(lgGS,param_grid=param_grid,cv=3,n_jobs=-1)
grid.fit(x_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': range(1, 10), 'dual': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [50]:
grid.best_params_

{'C': 5, 'dual': True}

In [51]:
lg_final = grid.best_estimator_

In [52]:
print('经过网格搜索，肇东最优超参数组合对应的逻辑回归，在验证集上的预测准确度：',lg_final.score(x_test,y_test))

经过网格搜索，肇东最优超参数组合对应的逻辑回归，在验证集上的预测准确度： 0.6546456491093169


## 7. 对测试集的数据进行预测，提交Kaggle竞赛最终结果

In [53]:
data_test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [54]:
test_X = tf.transform(data_test['Phrase'])

In [55]:
prediction = lg_final.predict(test_X)

In [56]:
prediction

array([2, 2, 3, ..., 1, 1, 2])

In [57]:
prediction.shape

(66292,)

In [58]:
data_test.loc[:,'Sentiment'] = prediction

In [59]:
data_test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,156061,8545,An intermittently pleasing but mostly routine ...,2
1,156062,8545,An intermittently pleasing but mostly routine ...,2
2,156063,8545,An,3
3,156064,8545,intermittently pleasing but mostly routine effort,2
4,156065,8545,intermittently pleasing but mostly routine,2


In [63]:
final_data = data_test.loc[:,['PhraseId','Sentiment']]

In [65]:
final_data.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,3
3,156064,2
4,156065,2


In [67]:
# save file to csv
final_data.to_csv('submission.csv', index=None)