# Movie Sentiment Analysis
https://www.kaggle.com/c/word2vec-nlp-tutorial/

 拿到数据首先读入拿到数据

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt # 画图常用库

import pandas as pd


train = pd.read_csv('../input/labeledTrainData.tsv', delimiter="\t")
test = pd.read_csv('../input/testData.tsv', delimiter="\t")
train.head()                

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [2]:
# test data比如train data少了label的一维
print (train.shape)
print (test.shape)

(25000, 3)
(25000, 2)


In [3]:
'''
    清理数据，文本中包含HTML的符号比如<>，我们使用正则表达式简单地清理一下
'''
import re  #正则表达式

def review_preprocessing(review):
    #只保留英文单词
    review_text = re.sub("[^a-zA-Z]"," ", review)
    
    #变成小写
    words = review_text.lower()
    
    return(words)

# 把训练集的文本和标注分开
# 1. 把标注提取出来
full_train_y = train['sentiment']

# 2. 把文本提取出来
full_train_x = []
for review in train['review']:
    full_train_x.append(review_preprocessing(review))
    
# 3. 转化成numpy数组        
full_train_x = np.array(full_train_x)

# 对校验集的文本做同样的事情


# 对测试集的文本做同样的事情
test_data = []
for review in test['review']:
    test_data.append(review_preprocessing(review))
    
test_data = np.array(test_data)

print(full_train_x.shape)
print(test_data.shape)

(25000,)
(25000,)


In [4]:
from sklearn.model_selection import train_test_split

# train_data => split(train_set, validation_set) => 选出好的模型 => optional(retrain_on_fulldata) => 训练好的模型 => test 
data_train, data_validation, labels_train, labels_validation = train_test_split(
    full_train_x,
    full_train_y, 
    test_size=0.2, 
    random_state=0) 

In [9]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

# 使用简单的计数
vectorizer = CountVectorizer()
data_train_count = vectorizer.fit_transform(data_train)
data_validation_count  = vectorizer.transform(data_validation)

print("Let's go!")

Let's go!


In [10]:
# 使用tf-idf
tfidf_vectorizer = TfidfVectorizer()


tfidf_data_train_count = tfidf_vectorizer.fit_transform(data_train)
tfidf_data_validation_count  = tfidf_vectorizer.transform(data_validation)

In [11]:
# 使用tf-idf with n-gram and stop words
advance_tfidf_vectorizer = TfidfVectorizer(
           ngram_range=(1, 3),  # 二元文法模型
           stop_words = 'english') # 去掉英文停用词


advance_tfidf_data_train_count = advance_tfidf_vectorizer.fit_transform(data_train)
advance_tfidf_data_validation_count  = advance_tfidf_vectorizer.transform(data_validation)

In [12]:
# 多项式朴素贝叶斯
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score

clf = MultinomialNB()

# 用Validation Set选择模型
clf.fit(data_train_count, labels_train)
pred = clf.predict(data_validation_count)
print ("Counter Accuracy:", accuracy_score(labels_validation, pred))

clf.fit(tfidf_data_train_count, labels_train)
tfidf_pred = clf.predict(tfidf_data_validation_count)
print ("TFIDF Accuracy:", accuracy_score(labels_validation, tfidf_pred))

clf.fit(advance_tfidf_data_train_count, labels_train)
advance_tfidf_pred = clf.predict(advance_tfidf_data_validation_count)
print ("Advance TFIDF Accuracy:", accuracy_score(labels_validation, advance_tfidf_pred))

Counter Accuracy: 0.8624
TFIDF Accuracy: 0.8762
Advance TFIDF Accuracy: 0.8846


In [13]:
# 全量训练
full_data_train_count = advance_tfidf_vectorizer.fit_transform(full_train_x)
data_test_count  = advance_tfidf_vectorizer.transform(test_data)
clf.fit(full_data_train_count, full_train_y)

# 最终测试
pred = clf.predict(data_test_count)
print(len(pred))

25000


In [14]:
# 把结果保存到csv文件中，并进行提交: https://www.kaggle.com/c/word2vec-nlp-tutorial/leaderboard
df = pd.DataFrame({"id": test['id'],"sentiment": pred})

df.to_csv('submission.csv',index = False, header=True)