In [1]:
import pyprind
import pandas as pd
import os

# Read every movie review file into a single pandas DataFrame.
# NOTE(review): basepath is a machine-specific absolute path; point it at the
# local copy of the aclImdb directory before running.
basepath = 'C:/Users/柠檬有点萌/Desktop/machine_learning/aclImdb'
labels = {'pos':1, 'neg':0}
pbar = pyprind.ProgBar(50000)
# Collect the rows in a plain list and build the frame once at the end.
# Growing a DataFrame with df.append() inside the loop copies the whole frame
# on every call (quadratic time), and DataFrame.append no longer exists in
# pandas 2.0.
rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[l]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:13:18


In [3]:
import numpy as np

# Shuffle the DataFrame rows with a fixed seed, then write them to CSV.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('movie_data.csv', index=False, encoding='utf-8')
# Read the file back and print the first three rows to confirm the data was saved.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
print(df.head(3))

review  sentiment
0  In 1974, the teenager Martha Moxley (Maggie Gr...          1
1  OK... so... I really like Kris Kristofferson a...          0
2  ***SPOILER*** Do not read this, if you think a...          0


In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words model from how often each word occurs in each document
# of a small text array.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining,the weather is sweet,and one and one is two'])
count = CountVectorizer()
bag = count.fit_transform(docs)
# Show the vocabulary: each word together with its integer index.
print(count.vocabulary_)
# Show how many times each word occurs in each document.
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [5]:
# Term frequency-inverse document frequency (tf-idf): reduces the influence of
# words that occur frequently in the feature vectors.

from sklearn.feature_extraction.text import TfidfTransformer
np.set_printoptions(precision=2)
# norm='l2' means the tf-idf values are L2-normalized as the last step.
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
term_counts = count.fit_transform(docs)
print(tfidf.fit_transform(term_counts).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [6]:
import re

def preprocessor(text):
    """Remove HTML markup and punctuation from ``text`` while keeping emoticons.

    The text is lowercased, every run of non-word characters is collapsed into
    a single space, and any emoticons found in the original text are appended
    at the end (with the '-' nose removed).

    Args:
        text: The raw review string.

    Returns:
        The cleaned string. Emoticons, if any, follow the cleaned words and
        are always separated from the last word by a space.
    """
    # Remove HTML tags.
    text = re.sub(r'<[^>]*>', '', text)
    # Collect the emoticons before the punctuation is stripped.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Lowercase and replace every run of non-word characters with one space.
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # Without a separator the first emoticon would be glued onto the last
        # word whenever the cleaned text ends in a word character.
        if not cleaned.endswith(' '):
            cleaned += ' '
        cleaned += ' '.join(emoticons).replace('-', '')
    return cleaned

In [7]:
# Show the last 50 characters of the first review before and after cleaning.
sample_tail = df.loc[0, 'review'][-50:]
print(sample_tail)
print(preprocessor(sample_tail))

# Clean every movie review with the preprocessor function.
df['review'] = df['review'].apply(preprocessor)

is seven.<br /><br />Title (Brazil): Not Available
is seven title brazil not available


In [8]:
# Helpers for removing stop words from the movie reviews.

import nltk
# Fetch the stop-word corpus. This needs network access unless the corpus is
# already installed locally.
nltk.download('stopwords')

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    return [porter.stem(word) for word in text.split()]

from nltk.corpus import stopwords
# The corpus file id is lowercase 'english'; 'English' only resolves on
# case-insensitive file systems.
stop = stopwords.words('english')
# print([w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop])

[nltk_data] Error loading stopwords: [WinError 10051]
[nltk_data]     向一个无法连接的网络尝试了一个套接字操作。


BadZipFile: File is not a zip file

In [9]:
#接下来训练一个逻辑回归模型来把电影评论分类为正面和负面

In [10]:
# Split the cleaned documents into 25,000 training and 25,000 test documents.
# Positional slicing is used on purpose: label-based .loc slices include the
# end label, so df.loc[:25000] selected 25,001 rows and row 25000 ended up in
# both the training and the test set.
n_train = 25000
all_reviews = df['review'].values
all_sentiments = df['sentiment'].values
X_train, X_test = all_reviews[:n_train], all_reviews[n_train:]
y_train, y_test = all_sentiments[:n_train], all_sentiments[n_train:]

In [11]:
# Use 5-fold stratified cross-validation to find the best parameter set for
# the logistic regression model.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The parameter grid refers to `tokenizer`, which no visible cell defines;
# define the plain whitespace tokenizer here so the search can run.
def tokenizer(text):
    """Split ``text`` on whitespace without stemming."""
    return text.split()

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

param_grid = [
    {'vect__ngram_range':[(1,1)], 
    'vect__stop_words':[stop, None], 
    'vect__tokenizer':[tokenizer, tokenizer_porter], 
    'clf__penalty':['l1', 'l2'], 
    'clf__C':[1.0, 10.0, 100.0]}, 
    # Second grid: raw term frequencies (no idf weighting, no normalization).
    {'vect__ngram_range':[(1,1)], 
    'vect__stop_words':[stop, None], 
    'vect__tokenizer':[tokenizer, tokenizer_porter], 
    'vect__use_idf':[False], 'vect__norm':[None], 
    'clf__penalty':['l1','l2'], 
    'clf__C':[1.0, 10.0, 100.0]}]

# solver='liblinear' is set explicitly because the grid includes the 'l1'
# penalty, which the default solver of recent scikit-learn releases rejects.
lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0, solver='liblinear'))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)

NameError: name 'stop' is not defined

In [12]:
# Report the best parameter set and its cross-validation accuracy.
best_params = gs_lr_tfidf.best_params_
print('Best parameter set: %s ' % best_params)
print('\n')
cv_accuracy = gs_lr_tfidf.best_score_
print('CV Acurracy: %.3f' % cv_accuracy)

# Score the best estimator found by the search on the test documents.
clf = gs_lr_tfidf.best_estimator_
test_accuracy = clf.score(X_test, y_test)
print('\nTest Acurracy: %.3f' % test_accuracy)

NameError: name 'gs_lr_tfidf' is not defined

In [23]:
# Topic modeling assigns topics to unlabeled text documents; this cell uses
# Latent Dirichlet Allocation (LDA).

from sklearn.feature_extraction.text import CountVectorizer
review_texts = df['review'].values
count = CountVectorizer(stop_words='english', max_df=.1, max_features=5000)
X = count.fit_transform(review_texts)

# Fit the LDA estimator to the bag-of-words matrix.
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, random_state=123, learning_method='batch')
X_topics = lda.fit_transform(X)

print(lda.components_.shape)

(10, 5000)


In [24]:
# Print the n_top_words highest-weighted words of every LDA topic.
n_top_words = 5
# get_feature_names() is gone from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    # argsort() is ascending, so walk backwards from the end to take the
    # indices of the n_top_words largest weights, largest first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join([feature_names[i] for i in top_indices]))

Topic 1:
horror worst effects budget awful
Topic 2:
watched guy maybe ll wasn
Topic 3:
war american documentary history series
Topic 4:
audience cinema human feel art
Topic 5:
kids comedy episode school series
Topic 6:
woman house wife father horror
Topic 7:
role performance comedy music actor
Topic 8:
series western star action john
Topic 9:
book version original read novel
Topic 10:
action animation disney fight dvd


In [2]:
#创建目录以存储网络应用文件和数据

In [None]:
import pickle
import os
# Serialize the stop-word list and the trained classifier for the web app.
dest = os.path.join('movieclassifier', 'pkl_objects')
# exist_ok=True replaces the separate os.path.exists() check.
os.makedirs(dest, exist_ok=True)

# Open the files in `with` blocks so the handles are flushed and closed
# deterministically instead of being left open after pickle.dump().
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)