In [18]:
import spacy
# for imdb data
import thinc.extra.datasets

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin 
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score 

import string
import time
import re

In [7]:
# Download and load the IMDB sentiment dataset.
print("loading imdb dataset ...")
# imdb() returns two collections; only the first one is used in this notebook.
# The second is bound to a descriptive throwaway name rather than `_`:
# assigning to `_` in IPython/Jupyter stops the kernel from updating its
# "last output" variables for the rest of the session.
train_data, _unused_split = thinc.extra.datasets.imdb()
# Each record has the form (body, label).

loading imdb dataset ...
Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz










In [49]:
# Prepare the data for training: unzip the (body, label) records into two
# parallel sequences, then split the first 500 reviews 80/20 into train/test
# with a fixed seed.
X, y = zip(*train_data)
X_train, X_test, y_train, y_test = train_test_split(
    X[:500],
    y[:500],
    test_size=0.2,
    random_state=123,
)

In [50]:
# Sanity check: sizes of the train/test texts and labels, in that order.
print(*(len(part) for part in (X_train, y_train, X_test, y_test)))

400 400 100 100


In [51]:
# Data cleaner: a custom scikit-learn transformer that normalises raw text
# before it reaches the vectorizer.
class CleanTextTransformer(TransformerMixin):
    """Stateless pipeline step that applies ``cleanText`` to every document."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each text in ``X``."""
        return list(map(cleanText, X))

    def get_params(self, deep=True):
        """Report no parameters: always returns an empty dict."""
        return {}

# Text normalisation applied to each document before vectorizing.
def cleanText(text):
    """Return a normalised, lower-cased copy of ``text``.

    Steps, in order:
      1. trim surrounding whitespace and turn newlines / carriage returns
         into single spaces;
      2. replace twitter-style @mentions (``@`` followed by 1-15 letters,
         digits or underscores) with ``@MENTION``;
      3. replace the HTML entities ``&amp;``, ``&gt;`` and ``&lt;`` with
         ``and``, ``>`` and ``<``;
      4. lowercase everything (so the mention placeholder ends up lowercase).
    """
    cleaned = text.strip()
    for line_break in ("\n", "\r"):
        cleaned = cleaned.replace(line_break, " ")

    cleaned = re.sub(r"@[a-z0-9_]{1,15}", "@MENTION", cleaned, flags=re.IGNORECASE)

    for entity, replacement in (("&amp;", "and"), ("&gt;", ">"), ("&lt;", "<")):
        cleaned = cleaned.replace(entity, replacement)

    return cleaned.lower()

In [52]:
# Vectorizer setup: a spaCy-based tokenizer parses each sentence into tokens
# (these could also be replaced by word vectors).

# Symbols we don't care about: every character in string.punctuation plus a
# few extra multi-character tokens.
punctuations = list(string.punctuation) + ["-----", "---", "...", "“", "”", "'ve"]

# spaCy English pipeline used by the tokenizer.
parser = spacy.load('en')
def tokenizeText(sentence):
    """Parse ``sentence`` with spaCy and return its filtered, lemmatised tokens.

    Each token is represented by its lowercased, stripped lemma; tokens whose
    lemma is the ``-PRON-`` placeholder keep their lowercased surface form
    instead. Stop words, punctuation symbols and whitespace-only tokens are
    dropped.
    """
    lemmas = []
    for tok in parser(sentence):
        if tok.lemma_ == "-PRON-":
            lemmas.append(tok.lower_)
        else:
            lemmas.append(tok.lemma_.lower().strip())

    # Empty / whitespace-only leftovers that must not reach the vectorizer.
    whitespace_junk = ("", " ", "\n", "\n\n")
    return [
        lemma
        for lemma in lemmas
        if lemma not in stopwords
        and lemma not in punctuations
        and lemma not in whitespace_junk
    ]

# Feature extractor: counts unigrams and bigrams over the tokens produced by
# the custom spaCy tokenizer.
vectorizer = CountVectorizer(
    tokenizer=tokenizeText,
    ngram_range=(1, 2),
)

In [53]:
# Model choice: a linear support-vector classifier with a fixed seed, verbose
# solver output and an iteration cap of 100.
classifier = LinearSVC(
    verbose=True,
    random_state=123,
    max_iter=100,
)

In [54]:
# Train the model: clean -> vectorize -> classify, then score on the test split.
# The reported time covers both fitting and predicting.
start_time = time.time()
pipe = Pipeline(steps=[
    ("cleaner", CleanTextTransformer()),
    ("vectorizer", vectorizer),
    ("classifier", classifier),
])
predict = pipe.fit(X_train, y_train).predict(X_test)
score = accuracy_score(y_test, predict)
end_time = time.time()
print(
    "Done in {} Seconds".format(end_time - start_time),
    "accuracy:{}".format(score),
    sep="\n",
)

[LibLinear]Done in 24.13919186592102 Seconds
accuracy:0.73


In [55]:
def printNMostInformative(vectorizer, clf, N):
    """Print the features with the most extreme coefficient values, per class.

    Features are paired with the coefficients in ``clf.coef_[0]`` and sorted
    ascending. The ``N`` lowest pairs are printed under "Class 1 best" and the
    ``N`` highest (largest first) under "Class 2 best".

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        clf: fitted linear model; only ``clf.coef_[0]`` is read.
        N: number of features to print for each class.
    """
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer its replacement and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Convert to plain float/str so the printed tuples look the same whatever
    # scalar types the vectorizer and classifier hand back.
    coefs_with_fns = sorted(
        (float(coef), str(name)) for coef, name in zip(clf.coef_[0], feature_names)
    )
    topClass1 = coefs_with_fns[:N]
    # Walk backwards from the end: the N largest coefficients, largest first.
    topClass2 = coefs_with_fns[:-(N + 1):-1]

    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)

In [56]:
# Show the ten most heavily weighted features for each class.
print(
    "----------------------------------------------------------------------------------------------",
    "Top 10 features used to predict: ",
    sep="\n",
)
printNMostInformative(vectorizer, classifier, N=10)

----------------------------------------------------------------------------------------------
Top 10 features used to predict: 
Class 1 best: 
(-0.18615388049541012, 'movie terrible')
(-0.18156791566801592, 'good effect')
(-0.18156378693270692, 'terrible')
(-0.18143862962493054, 'terrible good')
(-0.1810951234163003, 'effect')
(-0.13925327861965167, 'bad')
(-0.12333490454755958, 'waste')
(-0.12248637115474105, 'script')
(-0.11113828093686319, 'britney')
(-0.1045035715886689, 'hour')
Class 2 best: 
(0.19386932630171824, 'great')
(0.12236973060465213, 'wonderful')
(0.11351432783644279, 'excellent')
(0.10096004110604245, 'heart')
(0.0996951705892341, 'remember')
(0.09901801747389347, 'think')
(0.08765443300478477, 'know')
(0.08604219761411273, 'bring')
(0.08404097835541466, 'favorite')
(0.07970897384495353, 'love')


In [57]:
print("----------------------------------------------------------------------------------------------")
print("The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc")
# Let's see what the pipeline transforms the data into.
# This preview refits a vectorizer on only 30 samples, so it uses its own
# unfitted copy (same parameters) and its own pipeline name. Refitting the
# shared `vectorizer` would replace the vocabulary that `classifier`'s
# coefficients are aligned with, and rebinding `pipe` would discard the
# trained pipeline.
preview_vectorizer = CountVectorizer(**vectorizer.get_params(deep=False))
preview_pipe = Pipeline([
    ('cleanText', CleanTextTransformer()),
    ('vectorizer', preview_vectorizer),
])
transform = preview_pipe.fit_transform(X_train[:30], y_train[:30])

# Get the features that the vectorizer learned (its vocabulary).
# get_feature_names() no longer exists in recent scikit-learn releases;
# prefer its replacement and fall back for older installations.
if hasattr(preview_vectorizer, "get_feature_names_out"):
    vocab = preview_vectorizer.get_feature_names_out()
else:
    vocab = preview_vectorizer.get_feature_names()

# The transformed data is a sparse matrix: for row i, the slice
# indptr[i]:indptr[i+1] of `indices` holds the vocabulary columns present in
# the sample and the same slice of `data` holds how often each one occurs.
for i in range(len(X_train[:2])):
    row_start, row_stop = transform.indptr[i], transform.indptr[i + 1]
    indexIntoVocab = transform.indices[row_start:row_stop]
    numOccurences = transform.data[row_start:row_stop]
    # Plain str/int keep the printed tuples readable regardless of the array
    # scalar types involved.
    s = "".join(
        str((str(vocab[idx]), int(num)))
        for idx, num in zip(indexIntoVocab, numOccurences)
    )
    print("Sample {}: {}\n".format(i, s))

# The raw reviews behind the two samples printed above.
print(X_train[0])
print(X_train[1])

----------------------------------------------------------------------------------------------
The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc
Sample 0: ('lovable movie', 1)('sequel lovable', 1)('poor sequel', 1)('tv poor', 1)('make tv', 1)('movie make', 1)('agree movie', 1)('right agree', 1)('critic right', 1)('bad critic', 1)('movie bad', 1)('surprise movie', 1)('lady surprise', 1)('british lady', 1)('play british', 1)('collin play', 1)('sign joan', 1)('annie sign', 1)('hannigan annie', 1)('miss hannigan', 1)('warbuck miss', 1)('daddy warbuck', 1)('like daddy', 1)('role like', 1)('major role', 1)('play major', 1)('actor play', 1)('good actor', 1)('ridiculous good', 1)('hogbottom ridiculous', 1)('edwina hogbottom', 1)('lady edwina', 1)('collin lady', 1)('joan collin', 2)('add joan', 1)('movie add', 1)('carry movie', 1)('talent carry', 1)('think talent', 1)('know think', 1)('movie know', 1)('bit movie', 1)('pain bit', 1)('grow pain', 1

This astonishing waste of production money is filmic proof that the rich and famous can be just as stupid and wasteful as politicians. From a (silly) play by Tennessee Williams and directed (with a dead hand) by Joseph Losey and starring Taylor and Burton and Noel Coward - this project filmed in a spectacular cliff-top mountain island mansion in the Mediterranean must have seemed a sure fire winner when presented to Universal in 1967. The result is so absurd and tedious that it almost defies belief. Visually the film is spectacular but that is the force of nature that has allowed the setting and the fact that a real home is used instead of a set. The shrill antics of a screeching Taylor, Burton's half asleep wanderings, the loony dialog, Noel Coward laughing at himself, the ridiculous story and plot devices and the absurd costuming simply irritate the viewer. BOOM is a disgrace, a waste of money and talent and clear proof that lauded famous people can be idiots just like the rest of th