# Thumbs up? Sentiment Classification using Machine Learning Techniques
by Bo Pang, Lillian Lee and Shivakumar Vaithyanathan

Read all negative and positive text files

In [1]:
from file_reader import FileReader

# Paths handed to FileReader.getTexts for the negative and positive classes.
negPath = 'tokens/neg/'
posPath = 'tokens/pos/'

fileReader = FileReader()
negatives = fileReader.getTexts(negPath)
positives = fileReader.getTexts(posPath)
allTexts = negatives + positives

# Number of negative texts; later cells use it to size the cross-validation folds.
N = len(negatives)

print('Negative:', N)
print('Positive:', len(positives))
print('Total:', len(allTexts))

Negative: 700
Positive: 700
Total: 1400


Apply negation to the texts used for the unigram features

In [2]:
from features import TextNegator

textNegator = TextNegator()

# Negate each class separately (negatives first, then positives) so the two
# sets stay distinguishable for labelling later on.
negatedNegatives, negatedPositives = (
    textNegator.getNegated(texts) for texts in (negatives, positives)
)

In [3]:
# Print how many negated negative texts were produced
# (expected to equal len(negatives) -- TODO confirm getNegated is one-to-one).
print(len(negatedNegatives))

700


Get unigrams of negated texts

In [4]:
from features import UnigramFeature

unigramFeature = UnigramFeature()

# Unigram features are processed over the negated texts of both classes,
# negatives first and positives second.
negatedTexts = negatedNegatives + negatedPositives
unigramFeature.process(negatedTexts)

In [5]:
from features import BigramFeature

# Bigram features are processed over the original texts (allTexts), not the
# negated ones that the unigram features use.
bigramFeature = BigramFeature()
bigramFeature.process(allTexts)

In [6]:
# Cross-validation layout: number of folds, and how many texts of each class
# go into a single fold (any remainder of N / nFold is left out).
nFold = 3
nPerFold = N // nFold
print(nPerFold)

233


In [34]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [35]:
# Extract the 'freq' unigram features for each class, negatives first.
featuresNegative, featuresPositive = (
    unigramFeature.get(docs, type='freq')
    for docs in (negatedNegatives, negatedPositives)
)

In [37]:
# Inspect the 'freq' unigram feature matrix computed for the negated negative texts.
print(featuresNegative)

[[ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 ..., 
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]]


In [36]:
from sklearn.metrics import accuracy_score

# Build nFold equally sized folds. Fold i holds the i-th slice of nPerFold
# negative rows followed by the matching slice of positive rows, so every fold
# contains the same number of examples from each class. Rows beyond
# nPerFold * nFold are not used.
batches = {'batch': [], 'X': [], 'y': []}
for i in range(nFold):
    start, stop = nPerFold * i, nPerFold * (i + 1)
    batches['batch'].append(i)
    batches['X'].append(np.concatenate((featuresNegative[start:stop], featuresPositive[start:stop])))
    # Labels follow the row order above: 0 for negative, 1 for positive.
    batches['y'].append(np.append(np.zeros(nPerFold), np.ones(nPerFold)))

batches['batch'] = np.array(batches['batch'])
batches['X'] = np.array(batches['X'])
batches['y'] = np.array(batches['y'])

# Per-fold accuracies for each classifier; the averages are reported below.
accuracies = {'nb': [], 'svm': []}
for i in range(nFold):
    # Train on every fold except fold i, test on fold i.
    trainMask = batches['batch'] != i
    # Concatenating along the first axis stacks the selected folds into one
    # 2-D sample matrix and one 1-D label vector.
    trainX = np.concatenate(batches['X'][trainMask])
    trainY = np.concatenate(batches['y'][trainMask])

    testX = batches['X'][i]
    testY = batches['y'][i]

    nb = MultinomialNB()
    nb.fit(trainX, trainY)
    # accuracy_score expects (y_true, y_pred) in that order.
    accuracies['nb'].append(accuracy_score(testY, nb.predict(testX)))

    # NOTE(review): SVC() defaults to an RBF kernel; if the goal is to mirror
    # the linear SVM setup of the referenced paper, consider kernel='linear'
    # -- confirm intent before changing.
    svm = SVC()
    svm.fit(trainX, trainY)
    accuracies['svm'].append(accuracy_score(testY, svm.predict(testX)))

# Mean accuracy over the folds.
nbAccuracy = sum(accuracies['nb']) / nFold
print(nbAccuracy)
svmAccuracy = sum(accuracies['svm']) / nFold
print(svmAccuracy)

0.551502145923
0.555793991416


In [10]:
# Ad-hoc check: report whether any entry of the last training matrix equals zero.
print('pwet' if 0 in trainX else 'oryat')

oryat
