### Supervised, sentiment classifer 

In [1]:
import nltk 
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [2]:
# nltk.download_shell()
# install punkt
# install wordnet 
# install stopwords 

### Load data

In [3]:
## setup setming and stop words 
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')

## you can also import your own stopword list from a file 
## from http://www.lextek.com/manuals/onix/stopwords1.html
#  stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [31]:
# load the reviews
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
positive_reviews = BeautifulSoup(open('data/positive.review').read(),'lxml')
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('data/negative.review').read(),'lxml')
negative_reviews = negative_reviews.findAll('review_text')

In [34]:
# there are more positive reviews than negative reviews
# so let's take a random sample so we have balanced classes
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

### Create our own tokenizer

- Here we built the tokenizer and coutvectornizer by ourselves. 
- You can also just use (from sklearn.feature_extraction.text import CountVectorizer)

In [35]:
def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens

# now let's create our input matrices
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

In [36]:
## see how tokenizer works 
s = 'i am a dog, i love dog, i hate dog.'
test = my_tokenizer(s)
test

['dog', 'love', 'dog', 'hate', 'dog']

In [37]:
# create a word-to-index map so that we can create our word-frequency vectors later
# let's also save the tokenized versions so we don't have to tokenize again later
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)              ## list of list of tokens from each review
    for token in tokens:
        if token not in word_index_map:            ## check if string is in dict keys 
            word_index_map[token] = current_index  ## if it is a new word, add it in 
            current_index += 1
            
for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [38]:
## see how token to vector works 
tokens_to_vector(positive_tokenized[0],1)   ## it return an np array with weights for each 
                                            ## token and the label in the last column

array([ 0.1,  0.1,  0.1, ...,  0. ,  0. ,  1. ])

In [39]:
## N is total number of our sample 
N = len(positive_tokenized) + len(negative_tokenized)
# create data matrix, it is N by D+1 matrix, the last column is the label 
data = np.zeros((N, len(word_index_map) + 1))

## populate the data matrix, first D columns are token weights and the last column
## is the label, which is y 
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

### Run training model 

- Here, you can also use sklearn's train test split function 

In [61]:
# shuffle the data and create train/test splits
# try it multiple times!
np.random.shuffle(data)

X = data[:,:-1]
Y = data[:,-1]

# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print ("Classification rate:", model.score(Xtest, Ytest))

Classification rate: 0.73


In [62]:
# let's look at the weights for each word
# try it with different threshold values!

## basically print out the coefficient 
threshold = 0.5
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print (word, weight)

doe -1.00399035607
n't -1.8337964508
got -0.522858283334
used 0.970138434043
wa -1.22844061761
thing -0.886798265534
first -0.696443610195
use 1.5977817017
would -0.746891304266
time -0.691433904607
small 0.620001363941
good 1.84791058138
even -0.821505657775
ha 0.601765583697
month -0.644060450518
love 0.9434789592
price 2.21187285848
could -0.533452355634
great 3.51559489587
get -1.13178842648
unit -0.655115825176
buy -0.883206933315
highly 0.851376414431
easy 1.36289010044
back -1.49552251254
little 0.768537093789
well 0.962787909159
sound 0.94960200689
perfect 0.857698782345
recommend 0.582425260976
returned -0.685497076112
need 0.591166382566
support -0.795699717634
quality 1.28582714231
excellent 1.23723995839
working -0.519232188527
worked -0.780555669843
've 0.530529403781
week -0.540690995688
cable 0.610543117268
like 0.562175300597
lot 0.533482679923
speaker 0.777110105281
item -0.948495084741
money -0.947202996709
best 0.96136333176
try -0.541867648287
pretty 0.590752311103
