# Jonathan Halverson
# Sunday, April 17, 2016
# NLP and sentiment analysis 

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('halverson')

In [20]:
docs = np.array(['The sun is shining.', 'The weather is sweet.', 'The sun is shining and the weather is sweet.'])

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
bag = count.fit_transform(docs)

In [22]:
for word, i in sorted(count.vocabulary_.iteritems()):
    print i, word

0 and
1 is
2 shining
3 sun
4 sweet
5 the
6 weather


Below the bag of words feature vectors are displayed:

In [23]:
print bag.toarray()

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [24]:
count2 = CountVectorizer(ngram_range=(1,2))
bag2 = count2.fit_transform(docs)
print count2.vocabulary_
print bag2.toarray()

{u'the sun': 11, u'and': 0, u'the weather': 12, u'shining and': 6, u'sun': 7, u'is': 2, u'sun is': 8, u'and the': 1, u'weather is': 14, u'weather': 13, u'sweet': 9, u'the': 10, u'is sweet': 4, u'is shining': 3, u'shining': 5}
[[0 0 1 1 0 1 0 1 1 0 1 1 0 0 0]
 [0 0 1 0 1 0 0 0 0 1 1 0 1 1 1]
 [1 1 2 1 1 1 1 1 1 1 2 1 1 1 1]]


In [25]:
for word, i in sorted(count2.vocabulary_.iteritems()):
    print i, word

0 and
1 and the
2 is
3 is shining
4 is sweet
5 shining
6 shining and
7 sun
8 sun is
9 sweet
10 the
11 the sun
12 the weather
13 weather
14 weather is


### Term frequency-inverse document frequency

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
tbl = tfidf.fit_transform(bag).toarray()
print tbl

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


Let's check that the feature vectors are normalized:

In [27]:
print [np.linalg.norm(tbl[i]) for i in [0, 1, 2]]

[1.0000000000000002, 1.0000000000000002, 1.0]


# Sentiment analysis of IMDB movie reviews

Data obtained from: http://ai.stanford.edu/~amaas/data/sentiment/

In [28]:
import os
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()
for s in ('train', 'test'):
    for l in ('pos', 'neg'):
        path = '/Users/jhalverson/Downloads/aclImdb/' + s + '/' + l
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
df.columns = ['review', 'sentiment']
df.head()

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [29]:
df.iloc[0, 0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [30]:
import re
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer

# use the partial module to remove duplicate code from these two methods
def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    return " ".join(words)

def review_to_words_porter(raw_review):
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    porter = PorterStemmer()
    return " ".join(porter.stem(word) for word in words)

Let's apply the tokenizers to the first review:

In [31]:
print review_to_words(df.iloc[0, 0])

bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my years in the teaching profession lead me to believe that bromwell high s satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i m here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn t


In [32]:
print review_to_words_porter(df.iloc[0, 0])

bromwel high is a cartoon comedi it ran at the same time as some other program about school life such as teacher my year in the teach profess lead me to believ that bromwel high s satir is much closer to realiti than is teacher the scrambl to surviv financi the insight student who can see right through their pathet teacher pomp the petti of the whole situat all remind me of the school i knew and their student when i saw the episod in which a student repeatedli tri to burn down the school i immedi recal at high a classic line inspector i m here to sack one of your teacher student welcom to bromwel high i expect that mani adult of my age think that bromwel high is far fetch what a piti that it isn t


The Porter stemmer helps to reduce the number of unique words. Other stemmers may be used.

### Prepare data and build model

In [33]:
X_train = df.iloc[:25000, 0].values
y_train = df.iloc[:25000, 1].values
X_test = df.iloc[25000:, 0].values
y_test = df.iloc[25000:, 1].values

In [34]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
stops = stopwords.words("english")

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None, max_features=7500)
param_grid = [{'vect__ngram_range': [(1, 3)], 'vect__stop_words': [stops, None],
               'vect__tokenizer': [review_to_words, review_to_words_porter],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 3)], 'vect__stop_words': [stops, None],
               'vect__tokenizer': [review_to_words, review_to_words_porter],
               'clf__C': [1.0, 10.0, 100.0],
               'vect__use_idf': [False], 'vect__norm': [None]}]
lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression())])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 131.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=7500, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 3)], 'vect__tokenizer': [<function review_to_words at 0x116ce0938>, <function review_to_words_porter at 0x116ce0aa0>], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yo...on review_to_words_porter at 0x116ce0aa0>], 'vect__use_idf': [False], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1

In [35]:
print gs_lr_tfidf.best_params_

{'vect__ngram_range': (1, 3), 'vect__tokenizer': <function review_to_words at 0x116ce0938>, 'clf__C': 10.0, 'vect__stop_words': None}


Next we use the best classifier to compute the accuracy of the model on the test data:

In [36]:
clf = gs_lr_tfidf.best_estimator_
print 'Accuracy (train):', clf.score(X_train, y_train)
print 'Accuracy (test):', clf.score(X_test, y_test)

Accuracy (train): 0.88676
Accuracy (test): 0.85452


We see that this relatively simple approach leads to a good accuracy at predicting the sentiment of movie reviews. One could extend this work by considering spell correcting, associations, n-grams beyond (1, 1) and using different grammars. A popular library for text analysis is word2vec.

For very large data sets or online processing one may consider using out-of-core methods such as SGDClassifier with a Hashing vectorizer.