# Example 3. Sentiment training with IMBd data, out-of-core learning
---

### Import package and function definition

In [1]:
import re
import nltk
import pandas as pd
import numpy  as np

In [2]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text) # remove html flag, e.g. <br />
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower())+' '.join(emoticons).replace('-','')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

[nltk_data] Downloading package stopwords to /Users/Alpha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def stream_docs(path):
    with open(path, 'r') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label
# test:
tmp = stream_docs(path='../data/imbd.csv.train')
print next(tmp)
print next(tmp)

('"... but the trouble of this production is that it\'s very far from a good musical.<br /><br />Granted, one can\'t always expect the witty masters like Sondheim or Bernstein or Porter; yet the music of this piece makes even Andrew Lloyd Webber look witty. It\'s deadly dull and uninventive (with one or two exceptions) and just after I watched it I couldn\'t recall a single significant melody - which is rather tragic coming from someone who learned the whole Another Hundred People from three listenings.<br /><br />It is also strangely un-theatrical. It takes place on an incredibly large stage (one really has to feel sorry for those people in front rows who broke their necks in order to see something happening 50 meters on the right or 100 meters on the left) and does absolutely nothing with it. When there\'s supposed to be one person singing on-stage, that\'s just what you get - and the rest of the enormeous stage is empty. For me as an aspiring theatre director it was almost painful t

In [4]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try: 
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y
# test:
get_minibatch(stream_docs(path='../data/imbd.csv.train'), size=2)
get_minibatch(stream_docs(path='../data/imbd.csv.train'), size=2)

(['"... but the trouble of this production is that it\'s very far from a good musical.<br /><br />Granted, one can\'t always expect the witty masters like Sondheim or Bernstein or Porter; yet the music of this piece makes even Andrew Lloyd Webber look witty. It\'s deadly dull and uninventive (with one or two exceptions) and just after I watched it I couldn\'t recall a single significant melody - which is rather tragic coming from someone who learned the whole Another Hundred People from three listenings.<br /><br />It is also strangely un-theatrical. It takes place on an incredibly large stage (one really has to feel sorry for those people in front rows who broke their necks in order to see something happening 50 meters on the right or 100 meters on the left) and does absolutely nothing with it. When there\'s supposed to be one person singing on-stage, that\'s just what you get - and the rest of the enormeous stage is empty. For me as an aspiring theatre director it was almost painful 

In [5]:
np.random.seed(0)
df1 = pd.read_csv('../data/imbd.csv.train')
df2 = pd.read_csv('../data/imbd.csv.test')
df  = df2.append(df1, ignore_index=True)
df.reindex(np.random.permutation(df.index))
df.to_csv('../data/imbd.csv', index=False)
df.head()
len(df)

50000

### Train

In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [7]:
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='../data/imbd.csv')

In [8]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    #print _, X_train
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:49


In [9]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print 'Accuracy: %.3f'% clf.score(X_test, y_test)

Accuracy: 0.869


In [10]:
# Update the model with test data
clf.partial_fit(X_test, y_test)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=1, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=1, shuffle=True, verbose=0,
       warm_start=False)