# Jonathan Halverson
# Sunday, April 17, 2016
# NLP and sentiment analysis 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Apply the 'halverson' plotting style to all figures in this notebook.
# NOTE(review): presumably a personal Matplotlib style sheet that must be
# installed locally -- confirm before running on another machine.
plt.style.use('halverson')

In [None]:
# Toy corpus: three short documents used to illustrate the bag-of-words model.
docs = np.array([
    'The sun is shining.',
    'The weather is sweet.',
    'The sun is shining and the weather is sweet.',
])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a CountVectorizer (default settings) on `docs`; `bag` holds the
# transformed documents and is converted to a dense array for display below.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [None]:
# List the learned vocabulary in alphabetical order as "<column index> <word>".
# `.items()` and the single-argument `print(...)` form produce the same output
# on Python 2 and Python 3, unlike `.iteritems()` and the print statement.
for word, i in sorted(count.vocabulary_.items()):
    print("%d %s" % (i, word))

Below, the bag-of-words feature vectors are displayed:

In [None]:
# Show the sparse count matrix as a dense array (one row per document).
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(bag.toarray())

In [None]:
# Repeat the encoding with both unigrams and bigrams (ngram_range=(1, 2)),
# then show the vocabulary mapping and the dense count matrix.
# Parenthesised single-argument print works on both Python 2 and Python 3.
count2 = CountVectorizer(ngram_range=(1, 2))
bag2 = count2.fit_transform(docs)
print(count2.vocabulary_)
print(bag2.toarray())

In [None]:
# List the unigram/bigram vocabulary alphabetically as "<column index> <term>".
# `.items()` and the single-argument `print(...)` form produce the same output
# on Python 2 and Python 3, unlike `.iteritems()` and the print statement.
for word, i in sorted(count2.vocabulary_.items()):
    print("%d %s" % (i, word))

### Term frequency-inverse document frequency

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw counts in `bag` with tf-idf and show the dense result.
tfidf = TfidfTransformer()
# Affects every NumPy array printed from here on, not only `tbl`.
np.set_printoptions(precision=2)
tbl = tfidf.fit_transform(bag).toarray()
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(tbl)

Let's check that the feature vectors are normalized:

In [None]:
# Euclidean norm of every row of `tbl`; each value should be 1 if the feature
# vectors are normalized. Iterating over the rows (instead of the hard-coded
# indices 0, 1, 2) keeps the check valid if documents are added to `docs`.
print([np.linalg.norm(row) for row in tbl])

# Sentiment analysis of IMDB movie reviews

Data obtained from: http://ai.stanford.edu/~amaas/data/sentiment/

In [None]:
import os
# Load every review of the IMDB data set into one DataFrame with the columns
# 'review' (raw text) and 'sentiment' (1 = positive, 0 = negative).
# The 'train' folders are read before the 'test' folders, so the training
# reviews occupy the first rows of the frame.
labels = {'pos': 1, 'neg': 0}
imdb_dir = '/Users/jhalverson/Downloads/aclImdb'

# Collect the rows in a plain list and build the DataFrame once at the end:
# calling DataFrame.append inside the loop copies the whole frame for every
# file (quadratic in the number of reviews), and the method no longer exists
# in pandas 2.0.
rows = []
for subset in ('train', 'test'):
    for sentiment in ('pos', 'neg'):
        path = os.path.join(imdb_dir, subset, sentiment)
        for fname in os.listdir(path):
            with open(os.path.join(path, fname), 'r') as infile:
                rows.append([infile.read(), labels[sentiment]])

df = pd.DataFrame(rows, columns=['review', 'sentiment'])
df.head()

In [None]:
# Display the raw text of the first review (row 0, column 0 = 'review').
df.iloc[0, 0]

In [None]:
import re
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer

# Matches every character that is not an ASCII letter; compiled once because
# the cleaning functions are called for every review.
_NON_LETTERS = re.compile("[^a-zA-Z]")


def _review_tokens(raw_review):
    """Return the lower-cased alphabetic words of a raw HTML review as a list.

    The markup is stripped with BeautifulSoup, every non-letter character is
    replaced by a space, and the text is lower-cased and split on whitespace.
    """
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    return _NON_LETTERS.sub(" ", review_text).lower().split()


def review_to_words(raw_review):
    """Clean a raw review and return its words joined by single spaces.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML markup.

    Returns
    -------
    str
        Lower-cased words containing only the letters a-z, separated by
        single spaces.
    """
    return " ".join(_review_tokens(raw_review))


def review_to_words_porter(raw_review):
    """Clean a raw review and return its Porter-stemmed words as one string.

    Identical to ``review_to_words`` except that every word is reduced to its
    stem with NLTK's ``PorterStemmer`` before the words are joined.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML markup.

    Returns
    -------
    str
        Stemmed words separated by single spaces.
    """
    porter = PorterStemmer()
    return " ".join(porter.stem(word) for word in _review_tokens(raw_review))

Let's apply the tokenizers to the first review:

In [None]:
# Cleaned version of the first review.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(review_to_words(df.iloc[0, 0]))

In [None]:
# Cleaned and Porter-stemmed version of the first review.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(review_to_words_porter(df.iloc[0, 0]))

The Porter stemmer helps to reduce the number of unique words. Other stemmers may be used.

### Prepare data and build model

In [None]:
# Split by position: the first 25000 rows are used for training and the
# remaining rows for testing. Column 0 holds the review text, column 1 the label.
X_train, X_test = df.iloc[:25000, 0].values, df.iloc[25000:, 0].values
y_train, y_test = df.iloc[:25000, 1].values, df.iloc[25000:, 1].values

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
stops = stopwords.words("english")


# A vectorizer `tokenizer` must return a sequence of tokens. The cleaning
# functions return one space-joined string; passing them directly makes the
# vectorizer iterate over that string character by character, so the features
# become single letters instead of words. These wrappers split the cleaned
# text back into a list of words.
def tokenize(raw_review):
    """Clean a raw review and return its words as a list of tokens."""
    return review_to_words(raw_review).split()


def tokenize_porter(raw_review):
    """Clean and Porter-stem a raw review and return its words as a list of tokens."""
    return review_to_words_porter(raw_review).split()


# Lower-casing and preprocessing are switched off in the vectorizer because the
# tokenizers above already strip the markup and lower-case the text.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None, max_features=100)

# Settings shared by both grids.
base_grid = {'vect__ngram_range': [(1, 1)], 'vect__stop_words': [stops, None],
             'vect__tokenizer': [tokenize, tokenize_porter],
             'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}
# First grid: the vectorizer's default weighting.
# Second grid: idf re-weighting and normalization switched off.
param_grid = [base_grid,
              dict(base_grid, **{'vect__use_idf': [False], 'vect__norm': [None]})]

lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression())])
# 5-fold cross-validated search over both grids, scored by accuracy, using all cores.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validated accuracy.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(gs_lr_tfidf.best_params_)

Next we use the best classifier to compute the accuracy of the model on the test data:

In [None]:
# Score the best pipeline found by the grid search on the training and test data.
clf = gs_lr_tfidf.best_estimator_
# Formatting into one string keeps the output identical on Python 2 and
# Python 3 (the print statement is a syntax error on Python 3).
print('Accuracy (train): %s' % clf.score(X_train, y_train))
print('Accuracy (test): %s' % clf.score(X_test, y_test))

We see that this relatively simple approach leads to good accuracy at predicting the sentiment of movie reviews. One could extend this work by considering spelling correction, word associations, n-grams beyond (1, 1), and different grammars. A popular technique for learning word representations from text is word2vec.

For very large data sets or online processing, one may consider out-of-core methods such as `SGDClassifier` combined with a `HashingVectorizer`.