In [33]:
from nltk.util import ngrams
from nltk.corpus import stopwords as sw
import re
import pickle

If using a pre-assembled dataset from `saved_sets`, skip to "Train and Test" cell.

# Fetch Raw Data

In [34]:
orderFile = open("order.pkl","rb")
randomOrder = pickle.load(orderFile)
orderFile.close()

positiveTrainFile = open("aclImdb/train_pos.txt","r")
negativeTrainFile = open("aclImdb/train_neg.txt","r")
positiveTestFile = open("aclImdb/test_pos.txt","r")
negativeTestFile = open("aclImdb/test_neg.txt","r")

positiveTrainText = re.sub("(  +)|(<.*>)", " ", re.sub(":|,|;|/|<br >|\"|\.|\(|\)|\*", "", positiveTrainFile.read().lower()))
negativeTrainText = re.sub("(  +)|(<.*>)", " ", re.sub(":|,|;|/|<br >|\"|\.|\(|\)|\*", "", negativeTrainFile.read().lower()))
positiveTestText = re.sub("(  +)|(<.*>)", " ", re.sub(":|,|;|/|<br >|\"|\.|\(|\)|\*", "", positiveTestFile.read().lower()))
negativeTestText = re.sub("(  +)|(<.*>)", " ", re.sub(":|,|;|/|<br >|\"|\.|\(|\)|\*", "", negativeTestFile.read().lower()))

positiveTrainFile.close()
negativeTrainFile.close()
positiveTestFile.close()
negativeTestFile.close()

# Feature Engineer and Format

In [35]:
trainLines = ["1 | " + line for line in positiveTrainText.split("\n")[:-1]] + ["-1 | " + line for line in negativeTrainText.split("\n")[:-1]]
testLines = ["1 | " + line for line in positiveTestText.split("\n")[:-1]] + ["-1 | " + line for line in negativeTestText.split("\n")[:-1]]
stopwords = sw.words('english')
nGramN = 1
binaryFeatures = True
removeStopwords = False
addLengthFeature = True
addICount = False
addExclaimCount = False
addFilmCount = False

for lineSet in [trainLines, testLines]:
    for i,line in enumerate(lineSet):
        features = line.split(" | ")[1].split(" ")
        nGrams = []
        for j in range(1,nGramN + 1):
            nGrams = nGrams + list(ngrams(features, j))
        separator = "_"
        nGrams = [separator.join(gram) for gram in nGrams]
        if removeStopwords:
            nGrams = [gram for gram in nGrams if any([word not in stopwords for word in gram.split(separator)])]
        features = nGrams
        if binaryFeatures:
            if addICount:
                nI = features.count("i")
                features.append("numberOfI:" + str(nI))
            if addExclaimCount:
                nExclaim = 0
                for feature in features:
                    if "!" in feature:
                        nExclaim += 1
                features.append("numberOfExclaim:" + str(nExclaim))
            if addFilmCount:
                nFilm = features.count("film")
                features.insert(0,"numberOfFilm:" + str(nFilm))
        line = line.split(" | ")[0] + " | " + " ".join(features)
        if not binaryFeatures:
            counts = {}
            for gram in nGrams:
                if gram in counts:
                    counts[gram] += 1
                else:
                    counts[gram] = 1
            nonBinaryFeatures = [ f'{gram}:{counts[gram]}' for gram in counts]
            features = nonBinaryFeatures
        if addLengthFeature:
            l = len(features)
            features.append("length:" + str(l))
        line = line.split(" | ")[0] + " | " + " ".join(features)
        if lineSet == trainLines:
            trainLines[i] = line
        elif lineSet == testLines:
            testLines[i] = line

tmp = []
for i in randomOrder:
    tmp.append(trainLines[i])
trainLines = tmp
tmp = []
for i in randomOrder:
    tmp.append(testLines[i])
testLines = tmp

trainFile = open("train.vw","w")
testFile = open("test.vw","w")

trainFile.write("\n".join(trainLines))
testFile.write("\n".join(testLines))

trainFile.close()
testFile.close()

# Train and Test

The parameters set below yield the best accuracy I've been able to get. It uses bigrams and mostly binary features. Noticing that negative reviews tend to be shorter than positive reviews, I added a `length` feature that holds the length of the review. This didn't improve accuracy by much (only 0.32%), but I kept it anyway. I also found that accuracy improved up until the 7th pass.

In [36]:
trainDataPath = "train.vw"
testDataPath = "test.vw"
nGramLength = 2
nPasses = 7

!rm sentiment.model
!rm .cache
!vw --random_seed 1 --ngram {nGramLength} --l2 0 --cache --final_regressor sentiment.model --loss_function logistic --passes {nPasses} < {trainDataPath} &> /dev/null
!vw --testonly -i sentiment.model --predictions predictions.txt --binary  < {testDataPath}
with open("predictions.txt","r") as predictionFile:
    predictions = predictionFile.read().split("\n")
with open(testDataPath,"r") as testFile:
    labels = [line.split(" | ")[0] for line in testFile.read().split("\n")]
nCorrect = 0
for i,prediction in enumerate(predictions[:-1]):
    if prediction == labels[i]:
        nCorrect += 1
accuracy = nCorrect / len(predictions)
print("Accuracy:", accuracy)


[info] Generating 2-grams for all namespaces.
only testing
predictions = predictions.txt
using no cache
Reading datafile = stdin
num sources = 1
Num weight bits = 18
learning rate = 0.5
initial_t = 175000
power_t = 0.5
Enabled reductions: gd, scorer-identity, binary, count_label
Input label = simple
Output pred = scalar
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0  -1.0000  -1.0000       34
0.000000 0.000000            2            2.0   1.0000   1.0000      536
0.000000 0.000000            4            4.0  -1.0000  -1.0000      268
0.000000 0.000000            8            8.0   1.0000   1.0000      202
0.062500 0.125000           16           16.0   1.0000   1.0000      304
0.062500 0.062500           32           32.0  -1.0000   1.0000      186
0.140625 0.218750           64           64.0   1.0000   1.0000      108
0.132812 0.125000    

Accuracy:
- (BASELINE) Binary, n=1, data-ID=8: 0.8529258829646814
- (BIGRAM) Binary, n=2 (CLI arg), data-ID=8: 0.8636054557817687
- (BIGRAM) Binary, n=2 (NLTK), data-ID=9: 0.8425262989480421
- (WORDCOUNT) Binary, n=1, data-ID=10: 0.8561257549698013
- (NGRAM_COUNT) Binary, n=2 (NLTK), data-ID=11: 0.8452861885524579
- (STOPWORDS REMOVED) Binary, n=1, data-ID=12: 0.8516859325626975
- (STOPWORDS REMOVED BIGRAM) Binary, n=2 (CLI arg), data-ID=13: 0.8542458301667933
- (BIGRAM x7) Binary, n=2 (CLI arg), data-ID=14, passes=7: 0.8636054557817687
- (BIGRAM WITH LENGTH FEATURE x7) Binary, n=2 (CLI arg), data-ID=15, passes=7: 0.8668053277868886

`data-ID` attribute above refers to a version of the train/test sets in the `saved_sets` directory