# Example 2. Sentiment training with IMBd data, Grid Searching
---

### Import package and function definition

In [1]:
import re
import pandas as pd
import numpy  as np

In [2]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # remove html flag, e.g. <br />
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower())+' '.join(emoticons).replace('-','')
    return text
# test: 
preprocessor('</a>This :) is :( a test :-)!;')

'this is a test :) :( :)'

In [3]:
def tokenizer(text):
    return text.split()

In [4]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
# test:
tokenizer_porter('runners like running and thus they run')

[u'runner', 'like', u'run', 'and', u'thu', 'they', 'run']

### 1. Import IMBd movie review data

In [5]:
# Due to the huge size, keeps splitted train and test
df_train = pd.read_csv('../data/imbd.csv.train')
df_test  = pd.read_csv('../data/imbd.csv.test')

In [6]:
np.random.seed(0)
df = df_test.append(df_train, ignore_index=True)
df = df.reindex(np.random.permutation(df.index))
df.head()

Unnamed: 0,review,sentiment
11841,"Yep, this has got to be one of the lamest movi...",0
19602,"Gilmore Girls is one of the funniest, most cle...",1
45519,Good luck finding this film to even watch - it...,1
25747,Having watched 10 minutes of this movie I was ...,0
42642,I really hate most end of the world movies. Th...,0


In [7]:
df['review'] = df['review'].apply(preprocessor)
df.head()

Unnamed: 0,review,sentiment
11841,yep this has got to be one of the lamest movie...,0
19602,gilmore girls is one of the funniest most clev...,1
45519,good luck finding this film to even watch it s...,1
25747,having watched 10 minutes of this movie i was ...,0
42642,i really hate most end of the world movies the...,0


In [8]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test  = df.loc[25000:, 'review'].values
y_test  = df.loc[25000:, 'sentiment'].values

### 2. Training with grid-search of parameters

In [9]:
import nltk
from sklearn.model_selection         import GridSearchCV
from sklearn.model_selection         import cross_val_score
from sklearn.pipeline                import Pipeline
from sklearn.linear_model            import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/Alpha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### 2.1. Grid search 

In [11]:
stop = stopwords.words('english')
lr_tfidf = Pipeline([('vect', TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)),
                     ('clf',  LogisticRegression(random_state=0))])

In [12]:
param_grid = [ # defualt: use_idf=True, smooth_idf=True, norm=12
              {'vect__ngram_range': [(1,1)],
               'vect__stop_words' : [stop, None],
               'vect__tokenizer'  : [tokenizer, tokenizer_porter],
               'clf__penalty'     : ['l1','l2'],
               'clf__C'           : [1., 10., 100.]},
               # defualt: use_idf=False, smooth_idf=False, norm=None
              {'vect__ngram_range': [(1,1)],
               'vect__stop_words' : [stop, None],
               'vect__tokenizer'  : [tokenizer, tokenizer_porter],
               'vect__use_idf'    : [False],
               'vect__smooth_idf' : [False],
               'vect__norm'       : [None],
               'clf__penalty'     : ['l1','l2'],
               'clf__C'           : [1., 10., 100.]}]

In [13]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
# gs_lr_tfidf.fit(X_train, y_train)
# print 'Best parameter set: %s' % gs_lr_tfidf.best_param_
# print 'CV Accuracy: %.3f' %      gs_lr_tfidf.best_score_
# print 'Test Accuracy: %.3f'%     gs_lr_tfidf.best_estimator_.score(X_test, y_test)

## But it take too long about > 40mins....
## The best parameter set : 
## {'clf__C':10.0, 'vect__stop_words':None, 'clf__penalty':'l2', 'vect__tokenizer': <function tokenizer>}, 'vect__ngram_range': (1,1)}

#### 2.2. Training with grid-search results

In [14]:
params = {'clf__C':10.0, 
          'clf__penalty':'l2', 
          'vect__stop_words':None, 
          'vect__tokenizer':tokenizer, 
          'vect__ngram_range': (1,1)}
lr_tfidf.set_params(**params) # lr_tfidf.get_params()
lr_tfidf.fit(X_train, y_train)

Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [15]:
scores = cross_val_score(estimator=lr_tfidf, X=X_train, y=y_train, cv=10, n_jobs=1)
print 'CV Accuracy: %.3f' %  np.mean(scores)
print 'Test Accuracy: %.3f'% lr_tfidf.score(X_test,  y_test)

CV Accuracy: 0.892
Test Accuracy: 0.895


### 3. Conclusion
Here I give the example to train the IMBd movie review according to the basic NLP techniques as in [Example 1](example_01_basicNLP.ipynb). Before the training, the grid-search is used for parameter optimization. However, the searching algorithm is too slow and inefficient, although it gives the good performace in the prediction from data. Thus, in next example, [Example 3](example_03_outofcore), I am going to introduce an alternative searching method for optimization, so call "***out-of-core learning***". 