In [1]:
"""Build a sentiment analysis / polarity model

Sentiment analysis can be casted as a binary text classification problem,
that is fitting a linear classifier on features extracted from the text
of the user messages so as to guess wether the opinion of the author is
positive or negative.

In this examples we will use a movie review dataset.

"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD

import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics


if __name__ == "__main__":
    # NOTE: we put the following in a 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel
    # that is used when n_jobs != 1 in GridSearchCV

    # the training data folder must be passed as first argument
    movie_reviews_data_folder = "/home/students/klugeh01/cs420/sklearn/scikit-learn/doc/tutorial/text_analytics/data/languages/paragraphs"
    dataset = load_files(movie_reviews_data_folder, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # split the dataset in training and test set:
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(analyzer='char',lowercase=True,ngram_range=(1,3))),
        ('svm', LinearSVC(C=1000,max_iter=1000)),
    ])

    # TASK: Build a grid search to find out whether unigrams or bigrams are
    # more useful.
    # Fit the pipeline on the training set using grid search for the parameters
    from sklearn.grid_search import GridSearchCV
    parameters = {'vect__min_df': (1,2,3),
                  'vect__max_df': (.27,.28,.29)
                 }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train, y_train)

    

    # TASK: print the cross-validated scores for the each parameters set
    # explored by the grid search
    #print(grid_search.grid_scores_)
    best_parameters, score, _ = max(grid_search.grid_scores_, key=lambda x: x[1])
    for param_name in sorted(parameters.keys()):
         print("%s: %r" % (param_name, best_parameters[param_name]))

    # TASK: Predict the outcome on the testing set and store it in a variable
    # named y_predicted
    y_predicted = grid_search.predict(docs_test)

    # Print the classification report
    print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))

    # Print and plot the confusion matrix
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)

    import matplotlib.pyplot as plt
    plt.matshow(cm)
    plt.show()


n_samples: 883
vect__max_df: 0.27
vect__min_df: 1
             precision    recall  f1-score   support

         ar       1.00      1.00      1.00         6
         de       1.00      0.96      0.98        23
         en       1.00      1.00      1.00        33
         es       1.00      1.00      1.00        36
         fr       0.95      1.00      0.98        21
         it       1.00      1.00      1.00        17
         ja       1.00      1.00      1.00        16
         nl       1.00      1.00      1.00        14
         pl       1.00      1.00      1.00         5
         pt       1.00      1.00      1.00        28
         ru       1.00      1.00      1.00        22

avg / total       1.00      1.00      1.00       221

[[ 6  0  0  0  0  0  0  0  0  0  0]
 [ 0 22  0  0  1  0  0  0  0  0  0]
 [ 0  0 33  0  0  0  0  0  0  0  0]
 [ 0  0  0 36  0  0  0  0  0  0  0]
 [ 0  0  0  0 21  0  0  0  0  0  0]
 [ 0  0  0  0  0 17  0  0  0  0  0]
 [ 0  0  0  0  0  0 16  0  0  0  0]
 [ 0  

RuntimeError: Invalid DISPLAY variable

In [None]:
print(classification_report(y_test, predicted, target_names=dataset.target_names))

plt.imshow(np.log(confusion_matrix(twenty_test, predicted)), cmap = 'Blues', interpolation = 'nearest')
plt.grid(False)
plt.ylabel('true')
plt.xlabel('predicted')

In [None]:

import numpy as np
import random
import pandas as pd
import seaborn as sbn
sbn.set()
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

#probably some of these imports are unnecessary

dataset = fetch_20newsgroups(shuffle=True, random_state=None)

twenty_train, twenty_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

clf = Pipeline([
            ("vec", TfidfVectorizer(analyzer='char', lowercase=True)),
            ("svm", LinearSVC(C=1000, max_iter=10000))
        ])

parameters = {"vec__ngram_range": [(1,1),(1,2),(1,3),(1,4)],
              "vec__min_df": (2, 4, 6, 8),
              "vec__max_df": (.05, .1, .15, .2, .25, .3, .35, .4)
                 }

grid_clf = GridSearchCV(clf, parameters, n_jobs=-1)
    
grid_clf.fit(twenty_train, y_train)

predicted = grid_clf.predict(twenty_test)

print(classification_report(y_test, predicted, target_names=y_test.target_names))

plt.imshow(np.log(confusion_matrix(twenty_test.target, predicted)), cmap = 'Blues', interpolation = 'nearest')
plt.grid(False)
plt.ylabel('true')
plt.xlabel('predicted')

best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
print(score)