# Thumbs up? Sentiment Classification using Machine Learning Techniques
by Bo Pang, Lillian Lee and Shivakumar Vaithyanathan

In [47]:
%reload_ext autoreload
%autoreload 2

Read all files

In [1]:
from file_reader import FileReader

negPath = 'mix20_rand700_tokens_cleaned/tokens/neg/'
posPath = 'mix20_rand700_tokens_cleaned/tokens/pos/'

fileReader = FileReader()

negatives = fileReader.getTexts(negPath)
positives = fileReader.getTexts(posPath)
allTexts = negatives + positives

print('Negative:', len(negatives))
print('Positive:', len(positives))
print('Total:', len(allTexts))

N = len(negatives)

Negative: 700
Positive: 700
Total: 1400


In [2]:
nFold = 3
nPerFold = int(N/nFold)
print(nPerFold)

233


Negation for unigram feature

In [81]:
from features import TextNegator

textNegator = TextNegator()

negatedNegatives = textNegator.getNegated(negatives)
negatedPositives = textNegator.getNegated(positives)

In [4]:
print(len(negatedNegatives))

700


Get unigrams of negated texts

In [5]:
from features import UnigramFeature

In [6]:
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from k_fold import KFoldBatcher

In [7]:
from sklearn.model_selection import KFold

kfold = KFold(nFold)

In [9]:
results = {'features':[], 'nFeatures': [],'nb': [], 'svm': []}

nbAccuracy = 0
svmAccuracy = 0
nFeatures = 0
i = 0
for trainIndex, testIndex in kfold.split(negatives):
    unigramFeature = UnigramFeature()
    unigramFeature.process([negatedNegatives[index] for index in trainIndex] + [negatedPositives[index] for index in trainIndex])
    nFeatures += len(unigramFeature.unigrams)
    
    featuresNegative = unigramFeature.get(negatedNegatives, type='freq')
    featuresPositive = unigramFeature.get(negatedPositives, type='freq')
    
    kfoldBatcher = KFoldBatcher(nFold, featuresNegative, featuresPositive)
    trainX = kfoldBatcher.getTrainX(i)
    trainY = kfoldBatcher.getTrainY(i)
    
    testX = kfoldBatcher.getTestX(i)
    testY = kfoldBatcher.getTestY(i)
    
    nb = BernoulliNB()
    nb.fit(trainX, trainY)
    nbAccuracy += accuracy_score(nb.predict(testX), testY)

    svm = LinearSVC()
    svm.fit(trainX, trainY)
    svmAccuracy += accuracy_score(svm.predict(testX), testY)
    
    i += 1
    
nbAccuracy /= nFold
svmAccuracy /= nFold
nFeatures = int(nFeatures/nFold)
results['features'].append('unigrams')
results['nFeatures'].append(nFeatures)
results['nb'].append(nbAccuracy)
results['svm'].append(svmAccuracy)

print('Features: unigrams frequency')
print('Number of Features:', nFeatures)
print('Naive Bayes Accuracy:', nbAccuracy)
print('SVM Accuracy:', svmAccuracy)

Features: unigrams frequency
Number of Features: 15479
Naive Bayes Accuracy: 0.761802575107
SVM Accuracy: 0.786123032904


In [11]:
unigramFeaturesNegative = []
unigramFeaturesPositive = []

nbAccuracy = 0
svmAccuracy = 0
nFeatures = 0
i = 0
for trainIndex, testIndex in kfold.split(negatives):
    unigramFeature = UnigramFeature()
    unigramFeature.process([negatedNegatives[index] for index in trainIndex] + [negatedPositives[index] for index in trainIndex])
    nFeatures += len(unigramFeature.unigrams)
    
    featuresNegative = unigramFeature.get(negatedNegatives, type='pres')
    featuresPositive = unigramFeature.get(negatedPositives, type='pres')
    
    unigramFeaturesNegative.append(featuresNegative)
    unigramFeaturesPositive.append(featuresPositive)

    kfoldBatcher = KFoldBatcher(nFold, featuresNegative, featuresPositive)
    
    trainX = kfoldBatcher.getTrainX(i)
    trainY = kfoldBatcher.getTrainY(i)
    
    testX = kfoldBatcher.getTestX(i)
    testY = kfoldBatcher.getTestY(i)
    
    nb = BernoulliNB()
    nb.fit(trainX, trainY)
    nbAccuracy += accuracy_score(nb.predict(testX), testY)

    svm = LinearSVC()
    svm.fit(trainX, trainY)
    svmAccuracy += accuracy_score(svm.predict(testX), testY)
    
    i += 1
    
nbAccuracy /= nFold
svmAccuracy /= nFold
nFeatures = int(nFeatures/nFold)
results['features'].append('unigrams')
results['nFeatures'].append(nFeatures)
results['nb'].append(nbAccuracy)
results['svm'].append(svmAccuracy)

print('Features: unigrams presence')
print('Number of Features:', nFeatures)
print('Naive Bayes Accuracy:', nbAccuracy)
print('SVM Accuracy:', svmAccuracy)

Features: unigrams presence
Number of Features: 15479
Naive Bayes Accuracy: 0.761802575107
SVM Accuracy: 0.804721030043


Bigrams

In [12]:
from features import BigramFeature

In [21]:
bigramFeaturesNegative = []
bigramFeaturesPositive = []

nbAccuracy = 0
svmAccuracy = 0
nFeatures = 0
i = 0
for trainIndex, testIndex in kfold.split(negatives):
    bigramFeature = BigramFeature()
    bigramFeature.process([negatives[index] for index in trainIndex] + [positives[index] for index in trainIndex])
    nFeatures += len(bigramFeature.bigrams)
    
    featuresNegative = bigramFeature.get(negatives)
    featuresPositive = bigramFeature.get(positives)
    
    bigramFeaturesNegative.append(featuresNegative)
    bigramFeaturesPositive.append(featuresPositive)

    kfoldBatcher = KFoldBatcher(nFold, featuresNegative, featuresPositive)
    
    trainX = kfoldBatcher.getTrainX(i)
    trainY = kfoldBatcher.getTrainY(i)
    
    testX = kfoldBatcher.getTestX(i)
    testY = kfoldBatcher.getTestY(i)
    
    nb = BernoulliNB()
    nb.fit(trainX, trainY)
    nbAccuracy += accuracy_score(nb.predict(testX), testY)

    svm = LinearSVC()
    svm.fit(trainX, trainY)
    svmAccuracy += accuracy_score(svm.predict(testX), testY)
    
    i += 1
    
nbAccuracy /= nFold
svmAccuracy /= nFold
nFeatures = int(nFeatures/nFold)
results['features'].append('bigrams')
results['nFeatures'].append(nFeatures)
results['nb'].append(nbAccuracy)
results['svm'].append(svmAccuracy)

print('Features: bigrams')
print('Number of Features:', nFeatures)
print('Naive Bayes Accuracy:', nbAccuracy)
print('SVM Accuracy:', svmAccuracy)

Features: bigrams
Number of Features: 0
Naive Bayes Accuracy: 0.0
SVM Accuracy: 0.0


Unigrams and Bigrams

In [18]:
bigramFeaturesNegative = []
bigramFeaturesPositive = []

nbAccuracy = 0
svmAccuracy = 0
nFeatures = 0
for i in range(nFold):
    print(i)
    featuresNegative = np.concatenate((unigramFeaturesNegative[i], bigramFeaturesNegative[i]), axis=1)
    featuresPositive = np.concatenate((unigramFeaturesPositive[i], bigramFeaturesPositive[i]), axis=1)

    kfoldBatcher = KFoldBatcher(nFold, featuresNegative, featuresPositive)
    
    trainX = kfoldBatcher.getTrainX(i)
    trainY = kfoldBatcher.getTrainY(i)
    
    testX = kfoldBatcher.getTestX(i)
    testY = kfoldBatcher.getTestY(i)
    
    nb = BernoulliNB()
    nb.fit(trainX, trainY)
    nbAccuracy += accuracy_score(nb.predict(testX), testY)

    svm = LinearSVC()
    svm.fit(trainX, trainY)
    svmAccuracy += accuracy_score(svm.predict(testX), testY)
    
nbAccuracy /= nFold
svmAccuracy /= nFold
nFeatures = int(nFeatures/nFold)
results['features'].append('unigrams+bigrams')
results['nFeatures'].append(nFeatures)
results['nb'].append(nbAccuracy)
results['svm'].append(svmAccuracy)

print('Features: unigrams+bigrams presence')
print('Number of Features:', nFeatures)
print('Naive Bayes Accuracy:', nbAccuracy)
print('SVM Accuracy:', svmAccuracy)

0


IndexError: list index out of range

Adjectives

In [34]:
from features import UnigramPOSFeature
from features import POSTagger

posTagger = POSTagger()
posNegatives = posTagger.getPOS(negatives)
posPositives = posTagger.getPOS(positives)

In [23]:
from features import AdjectiveFeature

In [41]:
nbAccuracy = 0
svmAccuracy = 0
nFeatures = 0
i = 0
for trainIndex, testIndex in kfold.split(negatives):
    adjFeature = AdjectiveFeature()
    textsTrain = [negatives[index] for index in trainIndex] + [positives[index] for index in trainIndex]
    posTextsTrain = [posNegatives[index] for index in trainIndex] + [posPositives[index] for index in trainIndex]
    adjFeature.process(textsTrain, posTextsTrain)
    nFeatures += len(adjFeature.adjectives)
    
    featuresNegative = adjFeature.get(negatives)
    featuresPositive = adjFeature.get(positives)

    kfoldBatcher = KFoldBatcher(nFold, featuresNegative, featuresPositive)
    
    trainX = kfoldBatcher.getTrainX(i)
    trainY = kfoldBatcher.getTrainY(i)
    
    testX = kfoldBatcher.getTestX(i)
    testY = kfoldBatcher.getTestY(i)
    
    nb = BernoulliNB()
    nb.fit(trainX, trainY)
    nbAccuracy += accuracy_score(nb.predict(testX), testY)

    svm = LinearSVC()
    svm.fit(trainX, trainY)
    svmAccuracy += accuracy_score(svm.predict(testX), testY)
    
    i += 1
    
nbAccuracy /= nFold
svmAccuracy /= nFold
nFeatures = int(nFeatures/nFold)
results['features'].append('adjectives')
results['nFeatures'].append(nFeatures)
results['nb'].append(nbAccuracy)
results['svm'].append(svmAccuracy)

print('Features: adjectives')
print('Number of Features:', nFeatures)
print('Naive Bayes Accuracy:', nbAccuracy)
print('SVM Accuracy:', svmAccuracy)

Features: bigrams
Number of Features: 12320
Naive Bayes Accuracy: 0.776108726753
SVM Accuracy: 0.760371959943


Unigrams + POS

In [89]:
nbAccuracy = 0
svmAccuracy = 0
nFeatures = 0
i = 0
for trainIndex, testIndex in kfold.split(negatives):
    unigramPOSFeature = UnigramPOSFeature()
    negatedTextsTrain = [negatedNegatives[index] for index in trainIndex] + [negatedPositives[index] for index in trainIndex]
    posTextsTrain = [posNegatives[index] for index in trainIndex] + [posPositives[index] for index in trainIndex]
    unigramPOSFeature.process(negatedTextsTrain, posTextsTrain)
    nFeatures += len(unigramPOSFeature.unigrams)
    
    featuresNegative = unigramPOSFeature.get(negatedNegatives, posNegatives)
    featuresPositive = unigramPOSFeature.get(negatedPositives, posPositives)

    kfoldBatcher = KFoldBatcher(nFold, featuresNegative, featuresPositive)
    
    trainX = kfoldBatcher.getTrainX(i)
    trainY = kfoldBatcher.getTrainY(i)
    
    testX = kfoldBatcher.getTestX(i)
    testY = kfoldBatcher.getTestY(i)
    
    nb = BernoulliNB()
    nb.fit(trainX, trainY)
    nbAccuracy += accuracy_score(nb.predict(testX), testY)

    svm = LinearSVC()
    svm.fit(trainX, trainY)
    svmAccuracy += accuracy_score(svm.predict(testX), testY)
    
    i += 1
    
nbAccuracy /= nFold
svmAccuracy /= nFold
nFeatures = int(nFeatures/nFold)
results['features'].append('unigrams+POS')
results['nFeatures'].append(nFeatures)
results['nb'].append(nbAccuracy)
results['svm'].append(svmAccuracy)

print('Features: unigrams+POS')
print('Number of Features:', nFeatures)
print('Naive Bayes Accuracy:', nbAccuracy)
print('SVM Accuracy:', svmAccuracy)

Features: unigrams+POS
Number of Features: 22
Naive Bayes Accuracy: 0.585836909871
SVM Accuracy: 0.581545064378


In [38]:
print(unigramPOSFeature.unigrams)

['.--.', 'html--NN', '!--.', 'com--NN', 'com/--NN', 'michael%20redman--NN', 'com/ukcritic--JJ', '?--.', '|--NN', ')--)', 'bloom--NN', 'org/ejahiel--NN', '"--NN', ')--n--)', '"--n--VBP', 'edu/~jpeck1/--NN', '"--n--NN', 'com/~mmapes/--NN', 'com/page/teenagemoviecritic--JJ', '"--VB', '*--NN', 'net/~drsuess/--NN', 'com/film/--NN']


Top 2633 Unigrams

In [43]:
nbAccuracy = 0
svmAccuracy = 0
nFeatures = 0
for i in range(nFold):
    featuresNegative = unigramFeaturesNegative[i][:2633]
    featuresPositive = unigramFeaturesPositive[i][:2633]

    kfoldBatcher = KFoldBatcher(nFold, featuresNegative, featuresPositive)
    
    trainX = kfoldBatcher.getTrainX(i)
    trainY = kfoldBatcher.getTrainY(i)
    
    testX = kfoldBatcher.getTestX(i)
    testY = kfoldBatcher.getTestY(i)
    
    nb = BernoulliNB()
    nb.fit(trainX, trainY)
    nbAccuracy += accuracy_score(nb.predict(testX), testY)

    svm = LinearSVC()
    svm.fit(trainX, trainY)
    svmAccuracy += accuracy_score(svm.predict(testX), testY)
    
nbAccuracy /= nFold
svmAccuracy /= nFold
nFeatures = 2633
results['features'].append('top 2633 unigrams')
results['nFeatures'].append(nFeatures)
results['nb'].append(nbAccuracy)
results['svm'].append(svmAccuracy)

print('Features: top 2633 unigrams')
print('Number of Features:', nFeatures)
print('Naive Bayes Accuracy:', nbAccuracy)
print('SVM Accuracy:', svmAccuracy)

Features: unigrams+bigrams presence
Number of Features: 0
Naive Bayes Accuracy: 0.761802575107
SVM Accuracy: 0.804721030043


Unigrams + position

In [82]:
from features import PositionTagger

positionTagger = PositionTagger()
positionNegatives = positionTagger.getPositions(negatedNegatives)
positionPositives = positionTagger.getPositions(negatedPositives)

In [83]:
from features import UnigramPositionFeature

In [88]:
nbAccuracy = 0
svmAccuracy = 0
nFeatures = 0
i = 0
for trainIndex, testIndex in kfold.split(negatives):
    unigramPositionFeature = UnigramPositionFeature()
    negatedTextsTrain = [negatedNegatives[index] for index in trainIndex] + [negatedPositives[index] for index in trainIndex]
    positionTextsTrain = [positionNegatives[index] for index in trainIndex] + [positionPositives[index] for index in trainIndex]
    print(len(negatedTextsTrain))
    unigramPositionFeature.process(negatedTextsTrain, positionTextsTrain)
    nFeatures += len(unigramPositionFeature.unigrams)
    
    featuresNegative = unigramPositionFeature.get(negatedNegatives, positionNegatives)
    featuresPositive = unigramPositionFeature.get(negatedPositives, positionPositives)

    kfoldBatcher = KFoldBatcher(nFold, featuresNegative, featuresPositive)
    
    trainX = kfoldBatcher.getTrainX(i)
    trainY = kfoldBatcher.getTrainY(i)
    
    testX = kfoldBatcher.getTestX(i)
    testY = kfoldBatcher.getTestY(i)
    
    nb = BernoulliNB()
    nb.fit(trainX, trainY)
    nbAccuracy += accuracy_score(nb.predict(testX), testY)

    svm = LinearSVC()
    svm.fit(trainX, trainY)
    svmAccuracy += accuracy_score(svm.predict(testX), testY)
    
    i += 1
    
nbAccuracy /= nFold
svmAccuracy /= nFold
nFeatures = int(nFeatures/nFold)
results['features'].append('unigrams+bigrams')
results['nFeatures'].append(nFeatures)
results['nb'].append(nbAccuracy)
results['svm'].append(svmAccuracy)

print('Features: unigrams+position')
print('Number of Features:', nFeatures)
print('Naive Bayes Accuracy:', nbAccuracy)
print('SVM Accuracy:', svmAccuracy)

932
934
934
Features: unigrams+position
Number of Features: 60
Naive Bayes Accuracy: 0.582975679542
SVM Accuracy: 0.590844062947
