In [1]:
"""Build a sentiment analysis / polarity model

Sentiment analysis can be cast as a binary text classification problem,
that is, fitting a linear classifier on features extracted from the text
of the user messages so as to guess whether the opinion of the author is
positive or negative.

In this example we will use a movie review dataset.

"""

import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics



In [7]:
def test_classifier(clf):
    y_predicted = clf.predict(docs_test)
    print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))


## Load data

In [8]:
# Folder holding the movie review corpus, read with sklearn's load_files.
movie_reviews_data_folder = "data/movie_reviews/txt_sentoken/"
dataset = load_files(movie_reviews_data_folder, shuffle=False)
print("n_samples: %d" % len(dataset.data))

# Fixed seed so that the train/test split -- and therefore every score
# reported further down -- is identical after "Restart Kernel -> Run All".
# (The previous random_state=None produced a different split on each run.)
SPLIT_RANDOM_STATE = 42

# split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25,
    random_state=SPLIT_RANDOM_STATE)

n_samples: 2000


In [29]:
# Baseline model: TF-IDF features fed into a linear SVM, with the
# vectorizer's document-frequency cut-offs set to min_df=3 / max_df=0.95.
text_clf = Pipeline(steps=[
    ('vect', TfidfVectorizer(max_df=0.95, min_df=3)),
    ('clf', LinearSVC()),
])
text_clf.fit(docs_train, y_train)

# Report precision / recall / F1 on the held-out documents.
test_classifier(text_clf)

             precision    recall  f1-score   support

        neg       0.86      0.84      0.85       263
        pos       0.83      0.85      0.84       237

avg / total       0.84      0.84      0.84       500



In [16]:
# Search grid for the pipeline; the "vect__" prefix routes each setting to
# the 'vect' (TfidfVectorizer) step.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    vect__use_idf=(True, False),         # with / without IDF re-weighting
)

In [32]:
# Try every combination in `parameters` on the training documents;
# fit() returns the search object itself, so it can be chained.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters).fit(
    docs_train, y_train)

In [18]:
# Show the winning value of each tuned parameter, in alphabetical order.
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print("%s: %r" % (param_name, best_value))

vect__ngram_range: (1, 2)
vect__use_idf: True


In [34]:
# Refit using the settings the grid search printed as best
# (ngram_range=(1, 2), use_idf=True), keeping the baseline's
# min_df / max_df cut-offs. Update these values if the search result changes.
text_clf = Pipeline(steps=[
    ('vect', TfidfVectorizer(min_df=3,
                             max_df=0.95,
                             use_idf=True,
                             ngram_range=(1, 2))),
    ('clf', LinearSVC()),
])
text_clf.fit(docs_train, y_train)

# Same held-out evaluation as for the baseline, for a direct comparison.
test_classifier(text_clf)

             precision    recall  f1-score   support

        neg       0.88      0.86      0.87       263
        pos       0.85      0.87      0.86       237

avg / total       0.87      0.87      0.87       500

