In [4]:
import tarfile

In [5]:
# Unpack the gzip-compressed IMDb archive. extractall() is called without a
# target path, so the contents land in the current working directory.
with tarfile.open('./data/aclImdb_v1.tar.gz', 'r:gz') as tar:
    tar.extractall()

In [6]:
!pip install pyprind



In [7]:
import pyprind
import pandas as pd
import os

basepath = 'aclImdb'

# Sub-directory name -> numeric sentiment label.
labels = {'pos': 1,
          'neg': 0}
pbar = pyprind.ProgBar(50000)

# Collect the rows in a plain list and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration (quadratic time), and DataFrame.append no longer exists in
# pandas >= 2.0.
records = []
for split in ('test', 'train'):
    for polarity in ('pos', 'neg'):
        review_dir = os.path.join(basepath, split, polarity)
        # sorted() makes the read order independent of the filesystem.
        for fname in sorted(os.listdir(review_dir)):
            with open(os.path.join(review_dir, fname), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            records.append((txt, labels[polarity]))
            pbar.update()

df = pd.DataFrame(records, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:59


In [8]:
import numpy as np

np.random.seed(42)

# Shuffle the rows: the loader visits the files class by class, so the frame
# starts out ordered by label.
df = df.reindex(np.random.permutation(df.index))
# Write to the location every later cell reads from ('./data/movie_data.csv').
# Saving to the working directory left those cells reading whatever copy
# happened to be in ./data.
df.to_csv('./data/movie_data.csv', index=False, encoding='utf-8')

In [9]:
# Reload the reviews from disk (this rebinds `df`) and preview the first ten rows.
df = pd.read_csv('./data/movie_data.csv', encoding='utf-8')
df.head(10)

Unnamed: 0,review,sentiment
0,I was taken to this film by a friend and was s...,1
1,This trash version of `Romeo and Juliet' passe...,1
2,"There is a lot to like in this film, despite i...",1
3,"People have often been uncomfortable with ""The...",1
4,I don't get this. The movie obviously has a pr...,0
5,I opted to see the film at the recent Dubai Fi...,0
6,Jamie Foxx does a fine job of impersonating th...,1
7,The oddly-named Vera-Ellen was to movie dancin...,0
8,"First of all, the entire script is mostly impr...",1
9,A great gangster film.Sam Mendes has directed ...,1


In [10]:
# Dimensions of the reloaded frame as (rows, columns).
df.shape

(50000, 2)

In [11]:
import sklearn

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Three-sentence toy corpus used to illustrate the bag-of-words representation.
count = CountVectorizer()
docs = np.array(['The sun is shining',
                'The weather is sweet',
                'The sun is shining, the weather is sweet and one and one is two'])

# Learn the vocabulary from the toy corpus and encode it as a term-count matrix.
bag = count.fit_transform(docs)

In [13]:
# Mapping from each learned term to its column index in the count matrix.
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [14]:
# Dense view of the counts: one row per toy document, one column per vocabulary term.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [15]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight raw term counts as tf-idf with smoothed idf and L2 row normalisation.
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
# Global NumPy print setting: it stays in effect for later cells as well.
np.set_printoptions(precision=2)

# Re-fit the count vectorizer on the toy corpus and show the resulting tf-idf matrix.
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [16]:
# Last 50 characters of the review at index label 1, before any cleaning.
df.loc[1, 'review'][-50:]

' to the first group, my vote is eight.<br /><br />'

In [17]:
import re

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
    return text

In [18]:
# Run the cleaner on the last 50 characters of the review at index label 1.
preprocessor(df.loc[1, 'review'][-50:])

' to the first group my vote is eight '

In [19]:
# Synthetic string mixing an HTML tag with several emoticon styles.
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :)'

In [20]:
# Clean every review; the 'review' column is overwritten with the processed text.
df['review'] = df['review'].apply(preprocessor)

In [21]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    tokens = text.split()
    return tokens

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [22]:
!pip install nltk



In [23]:
import nltk

from nltk.stem.porter import PorterStemmer

# One shared stemmer instance, reused by every call below.
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with the shared Porter stemmer."""
    return list(map(porter.stem, text.split()))

tokenizer_porter('runners like running thus they run')

['runner', 'like', 'run', 'thu', 'they', 'run']

In [24]:
# Make sure NLTK's stop-word corpus is available locally before it is imported below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jordansamek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
from nltk.corpus import stopwords

# English stop-word list; reused later as a candidate value in the grid search.
stop = stopwords.words('english')
# Stem a sample sentence, then drop every stem that appears in the stop-word list.
[w for w in tokenizer_porter('a runner likes running and runs a lor')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lor']

In [26]:
# Positional slicing gives a clean split: the first 25,000 rows train, the
# rest test. The previous `.loc[:25000]` slice is label-based and includes
# BOTH endpoints, so it produced 25,001 training rows and put row 25000 into
# the training and the test set at the same time.
n_train = 25000
X_train = df['review'].iloc[:n_train].values
X_test = df['review'].iloc[n_train:].values
y_train = df['sentiment'].iloc[:n_train].values
y_test = df['sentiment'].iloc[n_train:].values

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer that leaves the (already cleaned) text untouched: no accent
# stripping, no lower-casing, no extra preprocessing step.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Settings explored in both halves of the search.
shared_grid = {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}

param_grid = [
    # Full tf-idf weighting with the vectorizer's default idf/normalisation.
    dict(shared_grid),
    # Same search with idf and normalisation switched off (raw term frequencies).
    dict(shared_grid, vect__use_idf=[False], vect__norm=[None]),
]

lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=42, solver='liblinear')),
])

gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=42,
                                                           solver='liblinear'))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 10.0, 100.0],
                          'clf__penalty': ['l1', 'l2'],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you'v...
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd", 'your',
 

In [28]:
# Winning hyper-parameter combination found by the grid search.
print("Best parameter set: %s" % gs_lr_tfidf.best_params_)

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7fa0ce431dd0>}


In [29]:
# Cross-validation accuracy reached by that winning combination.
print("Accuracy score: %.3f" % gs_lr_tfidf.best_score_)

Accuracy score: 0.893


In [30]:
# Best pipeline selected by the grid search, scored on the held-out test split.
clf = gs_lr_tfidf.best_estimator_
print("Test accuracy: %.3f" % clf.score(X_test, y_test))

Test accuracy: 0.898


In [31]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a two-column review CSV.

    The file is read one physical line at a time, so the whole dataset never
    has to fit in memory. Each data line is expected to look like
    ``<review text>,<label>``. The text is returned exactly as it appears on
    the line, including any CSV quoting, and the label is converted to an
    integer.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded CSV file; its first line is a header.

    Yields
    ------
    tuple of (str, int)
        Raw review text and its label.

    Raises
    ------
    ValueError
        If the part after the last comma on a line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside the generator.
        next(csv_file, None)
        for line in csv_file:
            # Strip the newline explicitly instead of slicing fixed offsets,
            # so a final line without a trailing newline is parsed correctly.
            line = line.rstrip('\n')
            if not line:
                continue
            # The label is whatever follows the last comma; commas inside
            # the review text are left alone.
            text, _, label = line.rpartition(',')
            yield text, int(label)

In [32]:
# Pull the first (text, label) pair to sanity-check the generator.
next(stream_docs(path='./data/movie_data.csv'))

('"I was taken to this film by a friend and was sceptical about a Swedish film with subtitles. However, I thoroughly enjoyed every minute of this beautiful film. The unnecessary cruelty that man is capable of was portrayed confidently without overwhelming images - although animal lovers may have to shield their eyes for a brief couple of seconds somewhere during the first 10 minutes. A traditional story of humility versus brutality and hope versus tragedy was illustrated from a satisfyingly fresh angle using a spectrum of characters with very natural flaws and features. I particularly liked how the film managed to address multiple aspects of hypocritical human behaviour that concern bias, discrimination and sanctimonious pretence. An absolute gem of a film that I will promote to all who will listen."',
 1)

In [33]:
def get_minibatch(doc_stream, size):
    """Collect up to ``size`` ``(text, label)`` pairs from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` tuples; it is advanced in place.
    size : int
        Maximum number of documents to pull from the stream.

    Returns
    -------
    tuple
        ``(docs, y)``: a list of texts and a list of labels. If the stream
        runs dry part-way through, the documents gathered so far are returned
        as a shorter batch instead of being thrown away. ``(None, None)`` is
        returned only when the stream was already exhausted and nothing could
        be read.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Keep the "exhausted" signal callers test for, but do not discard a
        # partially filled batch.
        if not docs:
            return None, None
    return docs, y

In [34]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing vectorizer with 2**21 feature columns, tokenizing with the
# whitespace `tokenizer` defined earlier.
vect = HashingVectorizer(decode_error='ignore',
                        n_features=2**21,
                        preprocessor=None,
                        tokenizer=tokenizer)
# Rebinds `clf`, replacing the grid-search pipeline assigned to it earlier.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
clf = SGDClassifier(loss='log', random_state=42)
# One generator over the on-disk reviews; the following cells consume it in order.
doc_stream = stream_docs(path='./data/movie_data.csv')

In [35]:
# 45 mini-batches of 1000 documents each are fed to the classifier one at a time.
n_batches = 45
classes = np.array([0, 1])
pbar = pyprind.ProgBar(n_batches)

for _ in range(n_batches):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if X_train:
        X_train = vect.transform(X_train)
        # `classes` is passed on every call so the model knows all labels
        # even when a batch happens to contain only one of them.
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()
    else:
        # Stream exhausted before all batches were drawn.
        break

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:03


In [36]:
# Request the next 5000 documents from the same stream as the evaluation set.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print("Accuracy: %.3f" % clf.score(X_test, y_test))

Accuracy: 0.808


In [37]:
# Fold the evaluation documents into the model as one more incremental update.
clf = clf.partial_fit(X_test, y_test)

In [38]:
# Latent Dirichlet Allocation

# Bag-of-words for topic modelling: built-in English stop words removed,
# max_df=.1 as the document-frequency cut-off, vocabulary capped at 5000 terms.
# This rebinds `count`, replacing the toy-corpus vectorizer from earlier.
count = CountVectorizer(stop_words='english',
                       max_df=.1,
                       max_features=5000)

# Term-count matrix over the review column of `df`.
X = count.fit_transform(df['review'].values)

In [39]:
from sklearn.decomposition import LatentDirichletAllocation

# Ten-topic LDA model, fitted in batch mode with a fixed seed.
lda = LatentDirichletAllocation(n_components=10,
                               random_state=42,
                               learning_method='batch')

# Fit on the count matrix and keep the per-review topic weights
# (rows are reviews, columns are topics).
X_topics = lda.fit_transform(X)

In [40]:
# Topic-term matrix dimensions as (topics, vocabulary terms).
lda.components_.shape

(10, 5000)

In [41]:
n_top_words = 5
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    # argsort is ascending, so walk it backwards to get the heaviest terms first.
    top_term_ids = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join(feature_names[i] for i in top_term_ids))

Topic 1:
dvd tv video school watched
Topic 2:
role music performance actor play
Topic 3:
book novel version read murder
Topic 4:
horror gore sex thriller night
Topic 5:
guy worst stupid minutes game
Topic 6:
father wife mother family woman
Topic 7:
comedy series kids episode fun
Topic 8:
feel audience documentary different cinema
Topic 9:
war american men police country
Topic 10:
action effects budget special low


In [42]:
# Column 3 holds the weights of the topic printed as "Topic 4"; order the
# reviews by that weight, strongest first.
horror = X_topics[:, 3].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx + 1))
    # argsort yields row positions in X_topics, so look the review up by
    # position (.iloc) rather than by index label.
    print(df['review'].iloc[movie_idx][:300], '...')


Horror movie #1:
j s cardone directed a little known video nasty in 1982 called the slayer and since then has gone on to have a hand in a handful of feature films including the rubbish 2001 vampire movie the forsaken his latest feature film wicked little things boasts a plot that sounds decent as well as a creepy lo ...

Horror movie #2:
i m a big fan of lucio fulci many of his giallo and splatter flicks are amongst my favourites of all time but this made for tv movie is extremely sub par and not what i ve come to expect from the great italian director the film is neither interesting like some of fulci s more tame giallo s or gory l ...

Horror movie #3:
eye in the labyrinth is not your average giallo and to be honest i m not really sure that it really is a giallo but giallo or not despite some problems this is certainly a very interesting little film i m hesitant to call it a giallo because the film doesn t feature most of the things that make thes ...


# CH09 - Embedding a Machine Learning Model into a Web Application

In [2]:
import pickle 
import os

dest = os.path.join('movieclassifier', 'pkl_objects')
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(dest, exist_ok=True)

# `stop` and `clf` are created by the sentiment-analysis cells earlier in this
# notebook. They must exist in the running kernel, otherwise this cell fails
# with a NameError.
# `with` guarantees each file is flushed and closed once its dump finishes.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as f:
    pickle.dump(stop, f, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as f:
    pickle.dump(clf, f, protocol=4)

NameError: name 'stop' is not defined

In [3]:
import pickle
import re
import os
from vectorizer import vect

# Only unpickle files you created yourself: pickle.load can run arbitrary code.
# NOTE(review): this path is relative to the working directory, whereas the
# dump cell writes under 'movieclassifier/pkl_objects'. Presumably this cell
# is meant to run from inside 'movieclassifier'; confirm.
with open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb') as f:
    clf = pickle.load(f)

ModuleNotFoundError: No module named 'vectorizer'