In [1]:
%matplotlib inline
import pandas as pd
from bs4 import BeautifulSoup

# Load the labelled training reviews. The file is tab-separated with a header
# row; quoting=3 is csv.QUOTE_NONE, so double quotes are read as ordinary
# characters instead of field quoting.
train = pd.read_csv(
    'labeledTrainData.tsv',
    header=0,
    sep='\t',
    quoting=3,
)

# Parse the markup of the first review so its plain text can be used below.
first_review_markup = train["review"][0]
ex1 = BeautifulSoup(first_review_markup, 'html.parser')

# Uncomment to inspect the data:
#print train.info
#print train["review"][0]
#print ex1.get_text()

For the sake of simplicity we're going to trim down the reviews to just letters, ignoring symbols and numbers for now. To do this we'll use the regex library re and then we'll tokenise the reviews.

In [2]:
import re

# Keep letters only: every character that is not a-z or A-Z is replaced by a
# space, so digits and punctuation (including apostrophes) become separators.
non_letter = re.compile("[^a-zA-Z]")
letters_only = non_letter.sub(" ", ex1.get_text())
print(letters_only[:50] + '\n')

# Tokenise: lower-case the text, then split on runs of whitespace.
lower_case = letters_only.lower()
words = lower_case.split()

print(words[:20])

 With all this stuff going down at the moment with

[u'with', u'all', u'this', u'stuff', u'going', u'down', u'at', u'the', u'moment', u'with', u'mj', u'i', u've', u'started', u'listening', u'to', u'his', u'music', u'watching', u'the']


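Now we'll get rid of stopwords (very common words such as "the" or "with" that carry little meaning), using an English stopword list read from a local file named `english` (the NLTK download is left commented out in the cell below).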

In [23]:
#import nltk
#nltk.download()  # Download text data sets, including stop words

# Load the stop-word list from the local file 'english' (presumably a copy of
# NLTK's English stop-word list -- TODO confirm) into a set of whole words.
# Keeping the raw file contents as a single string would turn
# `w in stopwords` into a substring test, silently dropping ordinary words
# that merely occur inside a stop word (e.g. "gain" inside "against",
# "hat" inside "that"). The `with` block also closes the file handle.
# NOTE(review): fragments left over from contractions (e.g. "ve") are now
# removed only if the list itself contains them.
with open('english') as stopword_file:
    stopwords = set(stopword_file.read().split())

# Remove stop words from "words"
words = [w for w in words if w not in stopwords]
print(words[:30])

[u'stuff', u'going', u'moment', u'mj', u'started', u'listening', u'music', u'watching', u'odd', u'documentary', u'watched', u'wiz', u'watched', u'moonwalker', u'maybe', u'want', u'get', u'certain', u'insight', u'guy', u'thought', u'really', u'cool', u'eighties', u'maybe', u'make', u'mind', u'whether', u'guilty', u'innocent']


In [30]:
def cleaner(raw, stop_words=None):
    """Turn one raw review into a string of lower-case, non-stop-word tokens.

    Steps: strip markup with BeautifulSoup, replace every non-letter
    character with a space, lower-case, split on whitespace, drop stop
    words and re-join the remaining tokens with single spaces.

    Args:
        raw: The review text, possibly containing HTML markup.
        stop_words: Optional words to drop. Either an iterable of words or
            a single whitespace-separated string. Defaults to the
            notebook-level ``stopwords`` variable.

    Returns:
        The cleaned review as one space-separated string (empty when no
        token survives).
    """
    if stop_words is None:
        stop_words = stopwords
    # A plain string (e.g. the raw contents of a stop-word file) must be
    # split into words first; otherwise `w in stop_words` is a substring
    # test and drops ordinary words such as "hat" (inside "that").
    # hasattr(..., "split") recognises both byte and unicode strings.
    if hasattr(stop_words, "split"):
        stop_words = stop_words.split()
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    text = BeautifulSoup(raw, 'html.parser').get_text()
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(w for w in tokens if w not in stop_words)

# Example: compare the start of a raw review with its cleaned version
#print train["review"][0][:100]
#print cleaner(train["review"][0])[:100]

Now we'll get a big list of the cleaned up reviews:

In [5]:
# Clean every training review, keeping the order of train["review"].
clean_reviews = [cleaner(review) for review in train["review"]]


To do some machine learning we need to convert the reviews to a numerical format. We'll use the Bag of Words technique: build a vocabulary of the 5000 most frequent words across the reviews and count how often each of them occurs in every review.

scikit-learn's bag-of-words counter is `CountVectorizer`.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Size of the bag-of-words vocabulary (the most frequent terms are kept).
VOCAB_SIZE = 5000

# Word-level analysis; tokenizer, preprocessor and stop_words are passed
# explicitly as None.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=VOCAB_SIZE,
)

# Learn the vocabulary and count the words of every review in one call,
# then expand the sparse result into a dense numpy array.
train_counts_sparse = vectorizer.fit_transform(clean_reviews)
train_data_features = train_counts_sparse.toarray()


We can also count the number of times each word appears in all the reviews. Here's the first 5:

In [22]:
import numpy as np

# Total count of each vocabulary word over all training reviews
# (column-wise sum of the document-term array).
dist = train_data_features.sum(axis=0)

# Show the count followed by the word for the first five vocabulary
# entries; slicing both sequences first avoids pairing up the whole
# vocabulary just to look at five items.
vocab_head = vectorizer.get_feature_names()[:5]
for tag, count in zip(vocab_head, dist[:5]):
    print("%s %s" % (count, tag))

187 abandoned
125 abc
108 abilities
454 ability
1259 able


## Random Forest Algorithm

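Now we're going to do some machine learning: we'll train a Random Forest (RF) classifier on the bag-of-words features to predict each review's sentiment.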

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that the fitted forest, and therefore the predictions written
# to the submission file, are identical on every Restart & Run All.
RF_SEED = 42

# Random forest with 100 trees.
clf = RandomForestClassifier(n_estimators=100, random_state=RF_SEED)

# Fit on the bag-of-words counts, using the "sentiment" column as labels.
clf = clf.fit(train_data_features, train["sentiment"])

In [33]:
# Read the unlabelled test reviews with the same parsing options as the
# training file.
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)

# Verify that there are 25,000 rows and 2 columns
print(test.shape)

# Clean the test reviews one at a time, reporting progress as we go.
num_reviews = len(test["review"])
clean_test_reviews = []

print("Cleaning and parsing the test set movie reviews...\n")
for position, raw_review in enumerate(test["review"], 1):
    # Progress message on every 1000th review, printed before cleaning it.
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_test_reviews.append(cleaner(raw_review))

# Bag of words for the test set as a dense numpy array. transform() (not
# fit_transform()) re-uses the vocabulary already held by the vectorizer.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Use the random forest to make sentiment label predictions
result = clf.predict(test_data_features)

# Pair each review id with its predicted sentiment.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

# Write the comma-separated submission file without an index column;
# quoting=3 is csv.QUOTE_NONE.
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3)

(25000, 2)
Cleaning and parsing the test set movie reviews...

Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000

