In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# library for extracting text from HTML files
from bs4 import BeautifulSoup   

# library for stemming, removing stopwords, etc.,
'''
Make sure to run at command line

python -m nltk.downloader all-corpora

to grab all corpus, including stopwords
 
Or run for much faster download just 

nltk.download("stopwords") 

in python once to grab just stopwords

'''
import nltk
nltk.download("stopwords") 
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.porter import *
stemmer = PorterStemmer()

# import regular expressions for tokenization
import re

# import scikit learn BoW transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# import training data
import pandas as pd       
train = pd.read_csv("labeledTrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)

In [3]:
# remove document from HTML page, parse, remove stop words, and stem
def review_to_words(raw_review):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    
    # Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    
    # Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    
    # In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    
    # Remove stop words 
    meaningful_words = [w for w in words if not w in stops]   
    
    # Stem the word list - no built in functionality in scikit, although you can directly import the nlkt stemmer
    stemmed_words = [stemmer.stem(word) for word in meaningful_words]
    
    # Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( stemmed_words )) 

In [4]:
# Get the number of reviews based on the dataframe column size
num_reviews = train["review"].size

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in xrange( 0, num_reviews ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print "Review %d of %d\n" % ( i+1, num_reviews )   
    
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_train_reviews.append( review_to_words( train["review"][i] ) )



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))


Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000



In [5]:
## convert cleaned, stopword removed, stemmed dataset to BoW features
def transform_to_BoW(clean_train_reviews):
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.  Keep only top 5000 most commonly occuring words
    vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 5000) 

    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of 
    # strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Numpy arrays are easy to work with, so convert the result to an 
    # array
    train_data_features = train_data_features.toarray()
    
    # Take a look at the words in the vocabulary
    vocab = vectorizer.get_feature_names()

    return train_data_features

In [6]:
# run function to grab BoW representation of input
train_data_features = transform_to_BoW(clean_train_reviews)

# grab labels
labels = np.asarray(train['sentiment'])

In [23]:
# split dataset into training and testing sets
X_train = train_data_features[:20000,:]
y_train = labels[:20000]

X_test = train_data_features[20000:,:]
y_test = labels[20000:]

In [24]:
# load in classifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

In [25]:
# fit classifier to training data
clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)

In [26]:
# print scores on training and testing sets
print 'accuracy on training set is ' + str(clf.score(X_train, y_train))  
print 'accuracy on testing set is ' + str(clf.score(X_test, y_test))  

accuracy on training set is 0.83845
accuracy on testing set is 0.8342
