In [1]:
import pandas as pd
from bs4 import BeautifulSoup 
import re
import nltk
#nltk.download()
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
#helper
def clean_review(raw_review, porter_stem = False):
    """Turn one raw review into a space-separated string of cleaned words.

    Steps, in order:
      1. strip HTML markup with BeautifulSoup,
      2. replace every character that is not an ASCII letter with a space,
      3. lowercase and split on whitespace,
      4. drop NLTK's English stopwords,
      5. optionally reduce each remaining word with the Porter stemmer.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML.
    porter_stem : bool, default False
        When True, apply ``PorterStemmer().stem`` to every kept word.

    Returns
    -------
    str
        The kept words joined by single spaces ("" if nothing survives).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns on every call), so the same
    # notebook could clean text differently on different machines.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # Keep letters only; digits and punctuation become word separators.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()

    # Build the stopword set once and cache it on the function object; the
    # list never changes between calls and this function runs once per review.
    stops = getattr(clean_review, "_stops", None)
    if stops is None:
        stops = clean_review._stops = frozenset(stopwords.words("english"))
    good_words = [w for w in words if w not in stops]

    if porter_stem:
        ps = PorterStemmer()
        good_words = [ps.stem(w) for w in good_words]

    return " ".join(good_words)

In [3]:
#train model using training set
def train(porter_stem = False, path = "labeledTrainData.tsv", max_features = 5000,
          n_estimators = 100, random_state = None):
    """Fit a bag-of-words vectorizer and a random forest on the labeled reviews.

    Parameters
    ----------
    porter_stem : bool, default False
        Forwarded to ``clean_review`` for every training review.
    path : str, default "labeledTrainData.tsv"
        Tab-separated file with a header row; the "review" and "sentiment"
        columns are read from it.
    max_features : int, default 5000
        Vocabulary size limit passed to ``CountVectorizer``.
    n_estimators : int, default 100
        Number of trees passed to ``RandomForestClassifier``.
    random_state : int or None, default None
        Seed passed to ``RandomForestClassifier``. None keeps the previous
        unseeded behavior; pass an int for repeatable forests.

    Returns
    -------
    (vectorizer, forest)
        The fitted ``CountVectorizer`` and the fitted ``RandomForestClassifier``.
    """
    # quoting=3 is csv.QUOTE_NONE: double quotes in the file are kept as data.
    # The frame gets its own name so it no longer shadows this function.
    labeled = pd.read_csv(path, header=0, delimiter = "\t", quoting=3)

    reviews = labeled["review"]
    num_reviews = reviews.size
    clean_train_reviews = []

    # Iterate the column positionally instead of looking rows up by label
    # with a counter, which only works when the index is exactly 0..n-1.
    for i, raw_review in enumerate(reviews):
        if i % 5000 == 0:
            print("processing review", i, "/", num_reviews)
        clean_train_reviews.append(clean_review(raw_review, porter_stem))

    # Reviews are already cleaned, so the vectorizer's own tokenizer,
    # preprocessor and stopword handling are left at None.
    vectorizer = CountVectorizer(analyzer = "word",
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = None,
                                 max_features = max_features)

    print("finding features...")
    # One row per review, one column per vocabulary term (at most
    # max_features columns); converted from sparse to a dense numpy array.
    train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

    forest = RandomForestClassifier(n_estimators = n_estimators,
                                    random_state = random_state)

    print("training forest...")
    forest = forest.fit(train_data_features, labeled["sentiment"])

    print("done")

    return vectorizer, forest

In [4]:
# Stemming switch for this run; the prediction and output cells reuse it.
porter_stem = False
vectorizer, forest = train(porter_stem=porter_stem)

processing review 0 / 25000
processing review 5000 / 25000
processing review 10000 / 25000
processing review 15000 / 25000
processing review 20000 / 25000
finding features...
training forest...
done


In [5]:
#applying model to test data

#read test data
# quoting=3 is csv.QUOTE_NONE, so quote characters in the file stay in the data.
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)

def test_fn(porter_stem = False):
    """Predict a sentiment label for every review in the global ``test`` frame.

    Relies on three notebook-level names being defined before it is called:
    ``test`` (DataFrame with a "review" column), ``vectorizer`` (the fitted
    CountVectorizer) and ``forest`` (the fitted classifier).

    Parameters
    ----------
    porter_stem : bool, default False
        Forwarded to ``clean_review``; should match the value used in training
        so test words are normalized the same way as the vocabulary.

    Returns
    -------
    The array returned by ``forest.predict``, one label per test review.
    """
    raw_reviews = test["review"]
    num_reviews = len(raw_reviews)
    clean_test_reviews = []

    # Walk the column positionally: looking rows up by label with a counter
    # breaks (KeyError or wrong row) if ``test`` has anything other than the
    # default 0..n-1 index, e.g. after filtering or sampling.
    for i, raw_review in enumerate(raw_reviews):
        if i % 5000 == 0:
            print("processing review", i, "/", num_reviews)
        clean_test_reviews.append(clean_review(raw_review, porter_stem))

    # transform only (no fit): reuse the vocabulary learned during training
    # so the columns line up with what the forest was trained on.
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    # Use the random forest to make sentiment label predictions
    result = forest.predict(test_data_features)

    return result

In [6]:
# Predict one label per test review, using the same stemming flag as training.
result = test_fn(porter_stem)

# Pair every test id with its predicted sentiment.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

# Write the submission file; the name records whether stemming was used.
# quoting=3 is csv.QUOTE_NONE: fields are written without added quotes.
output.to_csv(
    "Bag_of_Words_Porter_model.csv" if porter_stem else "Bag_of_Words_model.csv",
    index=False,
    quoting=3,
)

processing review 0 / 25000
processing review 5000 / 25000
processing review 10000 / 25000
processing review 15000 / 25000
processing review 20000 / 25000
