In [1]:
import pyprind

In [2]:
import pandas as pd

In [3]:
import os

In [4]:
pbar = pyprind.ProgBar(50000)

In [5]:
labels = {'pos':1, 'neg':0}

In [6]:
df = pd.DataFrame()

In [7]:
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path ='./aclImdb/%s/%s' % (s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:28


In [22]:
df.columns = ['review', 'sentiment']

In [23]:
import numpy as np

In [24]:
np.random.seed(0)

In [25]:
df = df.reindex(np.random.permutation(df.index))

In [26]:
df.to_csv('./movie_data.csv', index=False)

In [8]:
df = pd.read_csv('./movie_data.csv')

In [9]:
df.head(3)

Unnamed: 0,review,sentiment
0,"Election is a Chinese mob movie, or triads in ...",1
1,I was just watching a Forensic Files marathon ...,0
2,Police Story is a stunning series of set piece...,1


In [10]:
import numpy as np

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
count = CountVectorizer()

In [13]:
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'])

In [14]:
bag = count.fit_transform(docs)

In [15]:
print(count.vocabulary_)

{'shining': 2, 'and': 0, 'weather': 6, 'sun': 3, 'the': 5, 'is': 1, 'sweet': 4}


In [17]:
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

In [19]:
tfidf = TfidfTransformer()

In [20]:
np.set_printoptions(precision=2)

In [21]:
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [22]:
df.loc[0, 'review'][-50:]

'nd three more acting performances (including Yam).'

In [23]:
import re

In [24]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')
    return text

In [25]:
preprocessor(df.loc[0, 'review'][-50:])

'nd three more acting performances including yam '

In [26]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :):(:)'

In [27]:
df['review'] = df['review'].apply(preprocessor)

In [28]:
def tokenizer(text):
    return text.split()

In [29]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [30]:
from nltk.stem.porter import PorterStemmer

In [31]:
porter = PorterStemmer()

In [32]:
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [33]:
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [34]:
import nltk

In [35]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/heavenyu1982/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
from nltk.corpus import stopwords

In [37]:
stop = stopwords.words('english')

In [38]:
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [39]:
X_train = df.loc[:25000, 'review'].values

In [40]:
y_train = df.loc[:25000, 'sentiment'].values

In [41]:
X_test = df.loc[25000:, 'review'].values

In [42]:
y_test = df.loc[25000:, 'sentiment'].values

In [43]:
from sklearn.grid_search import GridSearchCV



In [44]:
from sklearn.pipeline import Pipeline

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

In [54]:
param_grid = [{'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer,
                                  tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer,
                                   tokenizer_porter],
               'vect__use_idf': [False],
               'vect__norm': [None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}
             ]

In [55]:
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf',
                      LogisticRegression(random_state=0))])

In [56]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5, verbose=1,
                           n_jobs=-1)

In [57]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 22.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 109.2min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 140.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__tokenizer': [<function tokenizer at 0x113033b70>, <function tokenizer_porter at 0x11518af28>], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',..., 'wasn', 'weren', 'won', 'wouldn'], None], 'vect__use_idf': [False], 'clf__penalty': ['l1', 'l2']}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1

In [63]:
print('Best parameter set: %s' % gs_lr_tfidf.best_params_)

Best parameter set: {'vect__tokenizer': <function tokenizer at 0x113033b70>, 'vect__stop_words': None, 'vect__ngram_range': (1, 1), 'clf__C': 10.0, 'clf__penalty': 'l2'}


In [64]:
print('CV Accuracy: %f' % gs_lr_tfidf.best_score_)

CV Accuracy: 0.898644


In [65]:
model = gs_lr_tfidf.best_estimator_

In [66]:
print('Test Accuacy: %f' % model.score(X_test, y_test))

Test Accuacy: 0.896360
