# Sentimental analysis 
The competition from Kaggel "*Bag of Words Meets Bags of Popcorn*"

In [110]:
import pandas as pd
from bs4 import BeautifulSoup

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.grid_search import RandomizedSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

from sklearn.externals import joblib

In [3]:
# you can download data from the page of competition https://www.kaggle.com/c/word2vec-nlp-tutorial/data

imbd_train = pd.read_csv('data/labeledTrainData.tsv', delimiter='\t')
imbd_test = pd.read_csv('data/testData.tsv', delimiter='\t')

In [4]:
# Number of the train and test datasets

print(imbd_train.shape)
print(imbd_test.shape)

(25000, 3)
(25000, 2)


In [5]:
# The training dataset consists balanced number of the positive and negative reviews

print(imbd_train[imbd_train.sentiment == 1].shape)

(12500, 3)


### Submission function

The helper for create submissions.

In [23]:
def make_submission(prediction, file_index):
    response = pd.DataFrame(data={'id': imbd_test.id, 'sentiment': prediction})
    response.to_csv('submissions/{}.csv'.format(file_index), index=False)
    
    print('The submission is ready {}.csv'.format(file_index))

### Create pipeline for finding the best parameters

In [119]:
pipeline = Pipeline([
    (
        'feature_processing', FeatureUnion(transformer_list=[
            ('words_processing', Pipeline([
                ('tfidf', TfidfVectorizer()),
            ])),
            ('characters_processing', Pipeline([
                ('tfidf', TfidfVectorizer(analyzer='char')),
            ]))
        ])
     ),
    ('lr', LogisticRegression(n_jobs=-1))
])

In [120]:
parameters = {
    'lr__C': (0.01, 0.1, 1, 10),
    'lr__penalty': ('l1', 'l2'),
    
    'feature_processing__words_processing__tfidf__min_df': (0, 1, 3, 5, 8),
    'feature_processing__words_processing__tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'feature_processing__words_processing__tfidf__max_features': (None, 9000, 12000, 14000),
    
    'feature_processing__characters_processing__tfidf__min_df': (0, 1, 3, 5, 8),
    'feature_processing__characters_processing__tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'feature_processing__characters_processing__tfidf__max_features': (None, 500, 1000, 1500)
}

### Grid search

In [79]:
cv = cross_validation.StratifiedShuffleSplit(imbd_train.sentiment, test_size=0.3, random_state=42)

In [80]:
# Submissions are judged on area under the ROC curve. 

grid = RandomizedSearchCV(pipeline, parameters, scoring='roc_auc', cv=cv, random_state=42)

In [82]:
%%time

grid.fit(imbd_train.review, imbd_train.sentiment);

CPU times: user 1h 58min 21s, sys: 5min 9s, total: 2h 3min 30s
Wall time: 2h


RandomizedSearchCV(cv=StratifiedShuffleSplit(labels=[1 1 ..., 0 1], n_iter=10, test_size=0.3, random_state=42),
          error_score='raise',
          estimator=Pipeline(steps=[('feature_processing', FeatureUnion(n_jobs=1,
       transformer_list=[('words_processing', Pipeline(steps=[('selecting', ItemSelector(key='review')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'feature_processing__words_processing__tfidf__max_features': (None, 9000, 12000, 14000), 'feature_processing__characters_processing__tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)), 'feature_processing__words_processing__tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)), 'feature_proc..., 1, 10), 'feature_processing__characters_processing_

In [84]:
grid.best_score_

0.9585418808888888

In [132]:
grid.best_params_

{'feature_processing__characters_processing__tfidf__max_features': 1000,
 'feature_processing__characters_processing__tfidf__min_df': 8,
 'feature_processing__characters_processing__tfidf__ngram_range': (1, 3),
 'feature_processing__words_processing__tfidf__max_features': 14000,
 'feature_processing__words_processing__tfidf__min_df': 5,
 'feature_processing__words_processing__tfidf__ngram_range': (1, 3),
 'lr__C': 10,
 'lr__penalty': 'l2'}

#### Set best parameters to the pipeline

In [123]:
pipeline.set_params(
    feature_processing__characters_processing__tfidf__max_features=1000,
    feature_processing__characters_processing__tfidf__min_df=8,
    feature_processing__characters_processing__tfidf__ngram_range=(1, 3),
    feature_processing__words_processing__tfidf__max_features=14000,
    feature_processing__words_processing__tfidf__min_df=5,
    feature_processing__words_processing__tfidf__ngram_range=(1, 3),
    lr__C=1,
    lr__penalty='l2'
);

### Use cross_val_score for investigate final model

In [126]:
%%time

cross_val_score(pipeline, imbd_train.review, imbd_train.sentiment, scoring='roc_auc', cv=3, n_jobs=-1)

CPU times: user 1.11 s, sys: 248 ms, total: 1.36 s
Wall time: 2min 3s


array([ 0.95655167,  0.95327619,  0.95599876])

### Create final prediction

In [130]:
%%time

pipeline.fit(imbd_train.review, imbd_train.sentiment);

CPU times: user 1min 50s, sys: 9.14 s, total: 1min 59s
Wall time: 1min 50s


Pipeline(steps=[('feature_processing', FeatureUnion(n_jobs=1,
       transformer_list=[('words_processing', Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1....ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [131]:
prediction = pipeline.predict_proba(imbd_test.review)[:,1]
make_submission(prediction, 'tfidf_lr_final')

The submission is ready tfidf_lr_final.csv


#### The score for the _testing_ dataset is **0.95893**

### Data analysis of the prediction model

Let's see what words have the biggest influence to the positive and negative reviews.

In [216]:
tfidf = TfidfVectorizer(max_features=14000, min_df=5, ngram_range=(1, 3))
tfidf.fit(imbd_train.review);

In [219]:
def display_important_features(feature_names, weights, n_top=30):
    sorted_features_indices = weights[0].argsort()[::-1]
    
    print('The most important "features" (words) for the first class (positive reviews): \n')
    most_important = sorted_features_indices[:n_top]
    print(",\n".join("{0}: {1:.4f}".format(feature_names[j], weights[0, j]) for j in most_important))

    print('\nThe most unimportant "features" (words) for the second class (negative reviews): \n')
    least_important = sorted_features_indices[-n_top:]
    print(",\n".join("{0}: {1:.4f}".format(feature_names[j], weights[0, j]) for j in least_important))

In [220]:
display_important_features(tfidf.get_feature_names(), pipeline.named_steps['lr'].coef_, 15)

The most important "features" (words) for the first class (positive reviews): 

great: 5.8698,
excellent: 4.8374,
wonderful: 4.0855,
perfect: 4.0584,
amazing: 3.7342,
best: 3.4412,
fun: 3.3535,
today: 3.3492,
the best: 3.1102,
loved: 2.9518,
superb: 2.8695,
favorite: 2.8563,
love: 2.8074,
bit: 2.8020,
brilliant: 2.7977

The most unimportant "features" (words) for the second class (negative reviews): 

annoying: -3.3996,
horrible: -3.4781,
nothing: -3.5249,
no: -3.6050,
poorly: -3.6126,
dull: -3.6754,
worse: -3.8829,
terrible: -4.0776,
waste: -4.5680,
poor: -4.7210,
the worst: -5.0524,
boring: -5.1172,
awful: -5.8512,
worst: -6.5875,
bad: -6.6976
