In [49]:
import pyprind
import pandas as pd
import os
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier

import nltk

### load data

In [5]:
pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './aclImdb/{}/{}'.format(s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as f:
                txt = f.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
            
df.columns = ['review', 'sentiment']

df.head()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:58


Unnamed: 0,review,sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1


In [8]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False)

df = pd.read_csv('./movie_data.csv') 

print(np.shape(df))
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,"The premise of this movie, of a comedian talk ...",0
1,I first remember bumping into this zaniness fr...,1
2,First of all I saw this movie without knowing ...,1
3,There must be an error. This movie belongs wit...,0
4,Alter Egos do not come funnier than the creati...,1


### transform to feature vectors

In [14]:
##############################
# ex
count = CountVectorizer()
docs = np.array(['The sun is shining', 'The weather is sweet', 'The sun is shining and weather is sweet'])
bag = count.fit_transform(docs)

count.vocabulary_

{'and': 0, 'is': 1, 'shining': 2, 'sun': 3, 'sweet': 4, 'the': 5, 'weather': 6}

In [15]:
bag.toarray()

array([[0, 1, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 1],
       [1, 2, 1, 1, 1, 1, 1]], dtype=int64)

In [16]:
tfidf = TfidfTransformer()
np.set_printoptions(precision=3)
tfidf.fit_transform(count.fit_transform(docs)).toarray()
############################

array([[0.   , 0.434, 0.558, 0.558, 0.   , 0.434, 0.   ],
       [0.   , 0.434, 0.   , 0.   , 0.558, 0.434, 0.558],
       [0.445, 0.525, 0.338, 0.338, 0.338, 0.263, 0.338]])

In [23]:
def preprocessor(s):
    s = re.sub('<[^>]*>', '', s)
    emojis = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', s)
    s = re.sub('[\W]+', ' ', s)
    s = s.lower() + ''.join(emojis).replace('-', '')
    
    return s

# test
preprocessor('</a>This is :) a =( test :-)')

'this is a test :)=(:)'

In [24]:
df['review'] = df['review'].apply(preprocessor)
df.head()

Unnamed: 0,review,sentiment
0,the premise of this movie of a comedian talk s...,0
1,i first remember bumping into this zaniness fr...,1
2,first of all i saw this movie without knowing ...,1
3,there must be an error this movie belongs with...,0
4,alter egos do not come funnier than the creati...,1


### tokenize, stem and remove stop-words

In [37]:
def tokenize(s):
    return s.split()

#test
test_l = tokenize('runners like running and thus they run')
test_l

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [38]:
def tokenize_and_stem(s):
    porter = nltk.stem.porter.PorterStemmer()
    return [porter.stem(word) for word in s.split()]

#test
test_l = tokenize_and_stem('runners like running and thus they run')
test_l

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/jj/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [32]:
stop = nltk.corpus.stopwords.words('english')

# test
[w for w in test_l if w not in stop]

['runner', 'like', 'run', 'thu', 'run']

### training

In [33]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [41]:
params = [{'vect__ngram_range': [(1,1)],
          'vect__stop_words': [stop, None],
          'vect__tokenizer': [tokenize, tokenize_and_stem],
          'clf__penalty': ['l1','l2'],
          'clf__C': [1.0, 10.0, 100.0]
          },
          {'vect__ngram_range': [(1,1)],
          'vect__stop_words': [stop, None],
          'vect__tokenizer': [tokenize, tokenize_and_stem],
          'vect__use_idf': [False],
          'vect__norm': [None],
          'clf__penalty': ['l1','l2'],
          'clf__C': [1.0, 10.0, 100.0]
         }]

lr_tfidf = Pipeline([('vect', TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)),
                    ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, params,
                          scoring='accuracy',
                          cv=5,
                          n_jobs=-1,
                          verbose=1)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 43.6min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 54.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='ac

In [42]:
gs_lr_tfidf.best_params_

{'clf__C': 10.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenize>}

In [44]:
# train accuracy
gs_lr_tfidf.best_score_

0.8922443102275909

In [46]:
# test accuracy
clf = gs_lr_tfidf.best_estimator_
clf.score(X_test, y_test)

0.89992

### online & out-of-core learning

In [48]:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

#test
next(stream_docs('./movie_data.csv'))

('"The premise of this movie, of a comedian talk show host running for president as an independent just to shake things up, is funny, entertaining, brilliant and even a bit inspiring. (thought about the west wing debate when Tom Dobbs leaves his podium, thought about Steven Colbert announcing his candidacy, good times) The first 15 - 20 minutes of this movie are therefore very very entertaining, the debate especially. When he eventually get\'s elected, it\'s a pity that is because of a computer glitch, you\'d want him to win fair (although that is unrealistic).<br /><br />But after that this movie goes completely downhill. I thought we\'d get a great movie like \'Dave\' (1993) in which we see how it would out if a comedian actually ran the country. Instead, the movie turns from comedy into a thriller, a romantic comedy and a drama and does none good. The computer glitch becomes the main storyline, which really sucks. Boy is this disappointing. I give it 3 stars just for the premise and

In [52]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

#test
get_minibatch(stream_docs('./movie_data.csv'), 5)

(['"The premise of this movie, of a comedian talk show host running for president as an independent just to shake things up, is funny, entertaining, brilliant and even a bit inspiring. (thought about the west wing debate when Tom Dobbs leaves his podium, thought about Steven Colbert announcing his candidacy, good times) The first 15 - 20 minutes of this movie are therefore very very entertaining, the debate especially. When he eventually get\'s elected, it\'s a pity that is because of a computer glitch, you\'d want him to win fair (although that is unrealistic).<br /><br />But after that this movie goes completely downhill. I thought we\'d get a great movie like \'Dave\' (1993) in which we see how it would out if a comedian actually ran the country. Instead, the movie turns from comedy into a thriller, a romantic comedy and a drama and does none good. The computer glitch becomes the main storyline, which really sucks. Boy is this disappointing. I give it 3 stars just for the premise an

In [53]:
vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21, 
                         preprocessor=None, 
                         tokenizer=tokenize_and_stem)
clf = SGDClassifier(loss='log', 
                    random_state=1, 
                    n_iter=1)
doc_stream = stream_docs('./movie_data.csv')

In [54]:
batches = 45
pbar = pyprind.ProgBar(batches)
classes = np.array([0,1])
for _ in range(batches):
    X_train, y_train = get_minibatch(doc_stream, 1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:46


In [59]:
# test accuracy
X_test, y_test = get_minibatch(doc_stream, 5000)
X_test = vect.transform(X_test)
clf.score(X_test, y_test)

0.8214