In [2]:
import numpy as np
import pandas as pd
import os

In [4]:
# parse files into single csv file

# labels = {'pos':1, 'neg':0}
# df = pd.DataFrame()
# for s in ('test', 'train'):
#     for l in ('pos', 'neg'):
#         path ='../../Downloads/aclImdb/%s/%s' % (s, l)
#         for file in os.listdir(path):
#             with open(os.path.join(path, file), 'r') as infile:
#                 txt = infile.read()
#                 df = df.append([[txt, labels[l]]], ignore_index=True)
# df.columns= ['review', 'sentiment']

# np.random.seed(0)
# df = df.sample(frac=1).reset_index(drop=True)
# df.to_csv('./movie_data.csv', index=False)

In [65]:
# Load the combined review dataset from the CSV path that the commented-out parsing cell above writes to.
df = pd.read_csv('./movie_data.csv')
# Preview the first three rows.
df.head(3)

Unnamed: 0,review,sentiment
0,"The premise of this movie, of a comedian talk ...",0
1,I first remember bumping into this zaniness fr...,1
2,First of all I saw this movie without knowing ...,1


In [66]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words demo: learn a vocabulary from three toy sentences and count word occurrences per sentence.
count = CountVectorizer() # ngram_range(min_kgrams, max_kgrams); (1,1) by default
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])
bag = count.fit_transform(docs)  # learn the vocabulary and build the count matrix in one step

print(count.vocabulary_)  # word -> column index
# NOTE(review): recent scikit-learn releases replace get_feature_names() with get_feature_names_out() - confirm against the installed version
print(count.get_feature_names())
print(bag.toarray())  # one row per document, one column per vocabulary word

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
['and', 'is', 'shining', 'sun', 'sweet', 'the', 'weather']
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [28]:
# tf-idf: increase weight of low freq words, and decrease weight of high freq words

# actual equation implemented by sklearn (with its default settings):
# tf = word_occurrence_count_in_curr_doc
# idf = ln((1 + n_docs) / (1 + n_docs_containing_word)) + 1
# tf-idf = tf * idf
# the tf-idf vector of each doc then goes through L2-normalization (not related to L2 regularization), i.e. it is divided by its Euclidean norm (the square root of the sum of its squared entries)

from sklearn.feature_extraction.text import TfidfTransformer

# Re-fit a fresh CountVectorizer on `docs` and rescale the raw counts to tf-idf weights.
count = CountVectorizer()
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)  # global NumPy print setting; it stays in effect for later cells too
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())
# NOTE(review): recent scikit-learn releases replace get_feature_names() with get_feature_names_out() - confirm against the installed version
print(count.get_feature_names())

[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]
['and', 'is', 'shining', 'sun', 'sweet', 'the', 'weather']


In [125]:
# cleaning text data

import re # regex
def preprocessor(text):
    """Clean one raw review while keeping its emoticons.

    Steps, in order:
      1. drop HTML tags (anything between ``<`` and the next ``>``);
      2. collect emoticons made of eyes (``:``, ``;`` or ``=``), an optional
         nose (``-``) and a mouth (``)``, ``(``, ``D`` or ``P``);
      3. remove those emoticons from the text;
      4. lowercase the text and collapse every run of non-word characters
         into a single space;
      5. append the collected emoticons, with the nose removed, after a space.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by a space and the space-joined emoticons.
        The separating space is always added, so the result ends with
        whitespace when no emoticon was found.
    """
    # Raw string: in a normal literal the backslash-paren sequences are invalid escapes.
    # The pattern is defined once so findall and sub cannot drift apart.
    emoticon_pattern = r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'

    text = re.sub(r'<[^>]*>', '', text)
    # Emoticons are captured before lowercasing so 'D' / 'P' mouths keep their case.
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)
    words = re.sub(r'\W+', ' ', text.lower())
    return words + ' ' + ' '.join(emoticons).replace('-', '')

preprocessor('is seven.<br /><br />Title (Brazil): Not Available :-) =( ;P 1/10!!!')

'is seven title brazil not available 1 10  :) =( ;P'

In [134]:
df['review'] = df['review'].apply(preprocessor)

In [67]:
df.loc[0, 'review']

"The premise of this movie, of a comedian talk show host running for president as an independent just to shake things up, is funny, entertaining, brilliant and even a bit inspiring. (thought about the west wing debate when Tom Dobbs leaves his podium, thought about Steven Colbert announcing his candidacy, good times) The first 15 - 20 minutes of this movie are therefore very very entertaining, the debate especially. When he eventually get's elected, it's a pity that is because of a computer glitch, you'd want him to win fair (although that is unrealistic).<br /><br />But after that this movie goes completely downhill. I thought we'd get a great movie like 'Dave' (1993) in which we see how it would out if a comedian actually ran the country. Instead, the movie turns from comedy into a thriller, a romantic comedy and a drama and does none good. The computer glitch becomes the main storyline, which really sucks. Boy is this disappointing. I give it 3 stars just for the premise and because

In [142]:
# Tokenizer/Word Splitter
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [139]:
# Word Stemming (using the Porter Stemmer) - similar to lemmatization, but may create non-words
# Porter Stemmer: https://tartarus.org/martin/PorterStemmer/def.txt
# Note: in practice, stemming and lemmatization have little impact on the performance of text classification 

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token.

    Uses the notebook-level ``porter`` stemmer instance.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [145]:
# Removing Stop Words (extremely common words, e.g. 'is', 'and')

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop-word list from the NLTK 'stopwords' corpus downloaded above.
stop = stopwords.words('english')
# Stem a sample sentence and drop stop words. The [-10:] slice keeps at most the last ten
# stems, which for this eight-word sentence is all of them.
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [156]:
from sklearn.model_selection import train_test_split
# Features are the review texts, targets are the 0/1 sentiment labels.
X = df['review'].values
y = df['sentiment'].values
# Hold out half of the reviews for the final evaluation. random_state pins the shuffle so
# the split, and every score computed from it, is the same after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [179]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Warning: this search takes a LONG time to finish, especially with tokenizer_porter; it would
# probably be faster to stem the corpus once, store it in a variable, and run 2 grid searches instead.
# Note: tokenizer_porter is left out of the grid below to save time.

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
lr_tfidf = Pipeline([
    ('vect', tfidf),
    # The grid below tries both 'l1' and 'l2' penalties. The solver is pinned to liblinear,
    # which handles both, so the search does not depend on the library's default solver.
    ('clf', LogisticRegression(random_state=0, solver='liblinear'))
])

# Two grids: the first uses tf-idf weighting with normalization (the vectorizer defaults),
# the second uses raw term frequencies with no idf and no normalization.
# The two settings are deliberately not criss-crossed.
param_grid = [
    {
        'vect__ngram_range': [(1,1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer], #[tokenizer, tokenizer_porter],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0]
    },
    {
        'vect__ngram_range': [(1,1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer], #[tokenizer, tokenizer_porter],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
        'vect__use_idf': [False],
        'vect__norm': [None],
    }
]

# Grid search over the logistic-regression pipeline: 5-fold CV, scored by accuracy,
# using every available core.
gs_lr_tfidf = GridSearchCV(
    lr_tfidf,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1
)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  7.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...nalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0], 'vect__use_idf': [False], 'vect__norm': [None]}],
       pre_dispatch='2*n_jobs', refit=True, return_tr

In [186]:
# Report the winning hyper-parameter combination and its mean cross-validated accuracy.
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
# This line prints best_score_, so it is labelled as a score rather than as a parameter set.
print('Best CV accuracy: %s ' % gs_lr_tfidf.best_score_)
# With refit=True, best_estimator_ is the best pipeline re-trained on the data passed to fit().
clf = gs_lr_tfidf.best_estimator_

# Best grid search results using:
# - regular tokenizer without Porter stemming, 
# - no stop-word library 
# - with tf-idfs & word normalization
# - logistic regression classifier that uses L2 regularization 
# - regularization strength C=10.0

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x115db49d8>} 
Best parameter set: 0.8958 


In [191]:
# To save time without running GridSearch again, rebuild the best lr_tfidf pipeline
# directly from the parameters the search reported.
clf = Pipeline([
    ('vect', TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None,
                             ngram_range=(1,1), stop_words=None, tokenizer=tokenizer)),
    # solver is pinned to liblinear, the solver of the estimator the grid search was run with,
    # so this shortcut fits the same model regardless of the library's default solver.
    ('clf', LogisticRegression(random_state=0, penalty='l2', C=10.0, solver='liblinear'))
])
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [192]:
# cross_val_score is imported from sklearn.model_selection, the module the earlier cells
# already import train_test_split and GridSearchCV from. sklearn.cross_validation is the
# deprecated old location of this function.
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the chosen pipeline on the training half only.
scores = cross_val_score(estimator=clf,
                         X=X_train,
                         y=y_train,
                         cv=5,
                         n_jobs=1)
print('Training CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
# NOTE: despite the label, this number is plain accuracy of the fitted pipeline on the
# held-out test split, not a cross-validated score.
print('Test CV accuracy: %.3f' % (clf.score(X_test, y_test)))

Training CV accuracy: 0.896 +/- 0.004
Test CV accuracy: 0.899


In [68]:
# Online algorithm

import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')

def tokenizer(text):
    """Clean a raw review and split it into tokens with stop words removed.

    Combines the preprocessor and the tokenizer in one step: HTML tags are
    dropped, emoticons are collected and removed, the text is lowercased and
    runs of non-word characters become single spaces, the emoticons are
    appended (without their ``-`` nose), and the result is split on whitespace.
    Tokens found in the notebook-level ``stop`` list are discarded.

    NOTE: this redefines the whitespace-only ``tokenizer`` from an earlier
    cell; whichever definition ran last is the one in effect.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The remaining tokens, in order.
    """
    # Raw string: in a normal literal the backslash-paren sequences are invalid escapes.
    # The pattern is defined once so findall and sub cannot drift apart.
    emoticon_pattern = r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'

    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)
    text = re.sub(r'\W+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


# creates a generator, i.e. read one line every time its called
# see: https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a ``review,sentiment`` CSV file.

    The file is read one line at a time, so it never has to fit in memory.
    The first line is treated as a header and skipped. Each remaining line is
    split at its LAST comma: everything before it is the review text, returned
    verbatim (any surrounding CSV quotes are kept), and everything after it is
    the integer label.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Yields
    ------
    tuple of (str, int)
        The review text and its label.

    Raises
    ------
    ValueError
        If the part after the last comma of a line is not an integer.
    """
    # Explicit encoding so reading does not depend on the platform's default locale.
    with open(path, 'r', encoding='utf-8') as handle:
        # Skip the header. The default keeps an empty file from raising StopIteration
        # inside the generator, which Python turns into a RuntimeError.
        next(handle, None)
        for line in handle:
            # Strip the newline first, so a final line with no trailing newline
            # is parsed the same way as every other line.
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            text, _, label = line.rpartition(',')
            yield text, int(label)

In [69]:
generator = stream_docs(path='./movie_data.csv')
next(generator)

('"The premise of this movie, of a comedian talk show host running for president as an independent just to shake things up, is funny, entertaining, brilliant and even a bit inspiring. (thought about the west wing debate when Tom Dobbs leaves his podium, thought about Steven Colbert announcing his candidacy, good times) The first 15 - 20 minutes of this movie are therefore very very entertaining, the debate especially. When he eventually get\'s elected, it\'s a pity that is because of a computer glitch, you\'d want him to win fair (although that is unrealistic).<br /><br />But after that this movie goes completely downhill. I thought we\'d get a great movie like \'Dave\' (1993) in which we see how it would out if a comedian actually ran the country. Instead, the movie turns from comedy into a thriller, a romantic comedy and a drama and does none good. The computer glitch becomes the main storyline, which really sucks. Boy is this disappointing. I give it 3 stars just for the premise and

In [70]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator of (text, label)
        Source of documents, e.g. the generator returned by ``stream_docs``.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)`` - two parallel lists of texts and labels. If the stream
        runs out part-way through, the documents collected so far are returned
        as a shorter batch instead of being thrown away. ``(None, None)`` is
        returned only when the stream is already exhausted and nothing could
        be read, so callers can keep testing ``if not docs`` to detect the end.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                return None, None
            break  # keep the partial final batch rather than discarding it
        docs.append(text)
        y.append(label)
    return docs, y

In [71]:
# HashingVectorizer does not require holding all text in memory
# Stochastic gradient descent considers only 1 random point while changing weights 
# unlike gradient descent which considers the whole training data. 
# As such SGD is much faster than gradient descent when dealing with large data sets
# https://towardsdatascience.com/how-to-make-sgd-classifier-perform-as-well-as-logistic-regression-using-parfit-cc10bca2d3c4

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Vectorizer for the streaming setup; it is only ever used through transform() below.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,  # width of the hashed feature space
    preprocessor=None,
    tokenizer=tokenizer  # whichever `tokenizer` is currently bound; intended to be the combined cleaning + stop-word version
)
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' - confirm against the installed version
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
# One generator over the CSV; the training loop and the evaluation cell both consume from it.
doc_stream = stream_docs(path='./movie_data.csv')

In [72]:
import pyprind # for progress bar
# Out-of-core training: feed the classifier 45 mini-batches of 1000 documents each.
# NOTE: X_train / y_train are reused here and overwrite the arrays from the earlier train/test split.
pbar = pyprind.ProgBar(45)  # one tick per mini-batch
classes= np.array([0,1])  # the full label set, passed to every partial_fit call
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:  # get_minibatch returns (None, None) once the stream is exhausted
        break
    X_train = vect.transform(X_train)  # raw texts -> hashed feature matrix
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:26


In [73]:
# Evaluate on the next 5000 documents of the same stream, i.e. documents the training loop has not consumed.
# NOTE: X_test / y_test overwrite the arrays from the earlier train/test split.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.869
