In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Load the tab-separated train/test splits. quoting=3 is csv.QUOTE_NONE, so
# quote characters inside the text are kept as ordinary characters.
trainData = pd.read_csv("sentence-sentiment-analysis/train.tsv", header=0, delimiter="\t", quoting=3)
testData = pd.read_csv("sentence-sentiment-analysis/test.tsv", header=0, delimiter="\t", quoting=3)
# Display the first training sentence as a quick sanity check of the load.
trainData["text"][0]

"Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"

In [16]:
# data cleaning and text preprocessing
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, building it once."""
    cached = getattr(_english_stopwords, "_cached", None)
    if cached is None:
        # Build the set on first use only; review_to_words is called once per
        # document, so rebuilding it every time is wasted work.
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cached = cached
    return cached


def review_to_words(raw_review):
    """Normalise one raw text into a space-separated string of content words.

    Steps: strip markup with BeautifulSoup, replace every character that is
    not an ASCII letter with a space, lower-case, split on whitespace, and
    drop English stop words.

    Args:
        raw_review: The raw text (may contain HTML markup).

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between environments.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    stops = _english_stopwords()
    return " ".join(w for w in words if w not in stops)

In [31]:
# Bag-of-words vectorizer shared by train() (which fits it) and test() (which
# reuses the fitted vocabulary); the vocabulary is capped at 5000 terms.
vectorizer = CountVectorizer(
    max_features=5000,
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
)
def train(n_estimators=100, random_state=42):
    """Fit the bag-of-words vectorizer and a random forest on the training set.

    Cleans every entry of ``trainData["text"]`` with ``review_to_words``,
    fits the module-level ``vectorizer`` on the cleaned text (the vectorizer
    is fitted in place), and trains a ``RandomForestClassifier`` against
    ``trainData["polarity"]``.

    Args:
        n_estimators: Number of trees in the forest.
        random_state: Seed handed to the forest so repeated runs build the
            same model; pass ``None`` for an unseeded forest.

    Returns:
        The fitted classifier.
    """
    raw_reviews = trainData["text"]
    train_num_reviews = len(raw_reviews)
    clean_train_reviews = []
    # Iterate over the values instead of indexing by label, so this keeps
    # working when the frame's index is not 0..n-1 (e.g. after filtering).
    for position, raw_review in enumerate(raw_reviews, start=1):
        if position % 1000 == 0:
            print("Review %d of %d\n" % (position, train_num_reviews))
        clean_train_reviews.append(review_to_words(raw_review))
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    train_data_features = train_data_features.toarray()
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    forest = forest.fit(train_data_features, trainData["polarity"])
    return forest


In [36]:
def test(forest, output_path="16340157_lyh.tsv", sep=","):
    """Predict polarity for the test set and write the predictions to disk.

    Cleans every entry of ``testData["text"]`` with ``review_to_words``,
    transforms the cleaned text with the module-level ``vectorizer`` (only
    ``transform`` is called here, so the vectorizer must already be fitted),
    and writes one row per test entry with columns ``id`` (taken from
    ``testData["line_num"]``) and ``polarity`` (the prediction).

    Args:
        forest: A fitted classifier exposing ``predict``.
        output_path: Destination file for the predictions.
        sep: Field separator used in the output file.

    Returns:
        None. The result is the file written to ``output_path``.
    """
    raw_reviews = testData["text"]
    num_reviews = len(raw_reviews)
    clean_test_reviews = []

    # Iterate over the values instead of indexing by label, so this keeps
    # working when the frame's index is not 0..n-1 (e.g. after filtering).
    for position, raw_review in enumerate(raw_reviews, start=1):
        if position % 1000 == 0:
            print("Review %d of %d\n" % (position, num_reviews))
        clean_test_reviews.append(review_to_words(raw_review))

    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()
    result = forest.predict(test_data_features)
    output = pd.DataFrame(data={"id": testData["line_num"], "polarity": result})
    # NOTE(review): with the defaults this writes comma-separated values to a
    # file named *.tsv; pass sep="\t" if a real tab-separated file is expected
    # -- confirm the required submission format.
    output.to_csv(output_path, sep=sep, index=False, quoting=3)


In [33]:
# Fit the vectorizer and the random forest on the training data; train()
# prints a progress line every 1000 reviews.
forest = train()

Review 1000 of 10026

Review 2000 of 10026

Review 3000 of 10026

Review 4000 of 10026

Review 5000 of 10026

Review 6000 of 10026

Review 7000 of 10026

Review 8000 of 10026

Review 9000 of 10026

Review 10000 of 10026



In [37]:
# Predict polarity for the test set with the fitted forest and write the
# predictions to "16340157_lyh.tsv".
test(forest)

Review 1000 of 4850

Review 2000 of 4850

Review 3000 of 4850

Review 4000 of 4850

