In [1]:
from glob import glob
from collections import Counter, defaultdict
import os
from nltk.tokenize import sent_tokenize, word_tokenize

# Loading data #

In [2]:
# Score cut-offs used when loading the reviews: a score at or above pos_thresh
# is labelled positive, a score at or below neg_thresh is labelled negative,
# and anything strictly in between is left out.
pos_thresh, neg_thresh = 7, 3

The next cell reads the reviews from the directory "train-imdb". You can download the data [here](https://github.com/jacobeisenstein/gt-nlp-class/releases/download/imbd-fall-2015/imbd-data.tgz).

In [3]:
# Build one bag-of-words Counter per review (x) and a parallel list of
# labels (y): +1 for positive reviews, -1 for negative ones.
x = [] # list of counters
y = [] # list of ints
skipped_lines = 0 # lines dropped because tokenization raised an error
# glob returns files in arbitrary order; sort so that document indices
# (e.g. x[10]) refer to the same review on every machine and every run
for filename in sorted(glob('train-imdb/*.txt')):
    basename = os.path.basename(filename)
    # the score is the integer between the first underscore and the next dot
    score = int(basename.split('_')[1].split('.')[0])
    if neg_thresh < score < pos_thresh:
        continue # scores strictly between the thresholds are left out
    counts = Counter()
    with open(filename) as fin:
        for line in fin:
            try:
                for sent in sent_tokenize(line):
                    for word in word_tokenize(sent):
                        counts[word.lower()] += 1 #try it without downcasing!
            except Exception:
                # best effort: give up on the rest of a line the tokenizer
                # cannot handle, but keep a tally so the loss is visible and
                # let KeyboardInterrupt / SystemExit through
                skipped_lines += 1
    # reviews that produced no tokens at all are dropped, keeping x and y aligned
    if len(counts) > 0:
        x.append(counts)
        y.append(1 if score >= pos_thresh else -1)

# Making predictions #

The following function returns the total number of times the features (here, words) listed in `feats` occur in a document.

In [4]:
def score_doc(counts, feats):
    """Return the total count in ``counts`` of the features listed in ``feats``.

    counts: mapping from feature to count, looked up as ``counts[feat]``.
        The documents built in this notebook are Counters, so a feature
        that never occurs contributes 0.
    feats: iterable of features to add up; a feature listed more than once
        is counted once per listing.
    """
    return sum(counts[feat] for feat in feats)

In [5]:
# Spot check: how often do these three positive words occur in the document at index 10?
score_doc(
    x[10],
    ['excellent', 'striking', 'brilliant'],
)

1

In [6]:
# make a prediction, using two word lists
def makePreds(counts,pos_words,neg_words):
    """Predict a label for every document in ``counts``.

    counts: iterable of documents, each a mapping from word to count
    pos_words: words whose occurrences count as evidence for the positive class
    neg_words: words whose occurrences count as evidence for the negative class

    Returns a list with one entry per document: 1 when the positive words
    occur more often than the negative ones, -1 when they occur less often,
    and 0 when the two totals are equal (no decision).
    """
    preds = []
    # iterate over the argument, not the notebook-level list x, so the
    # function works on whatever collection of documents the caller passes
    for doc in counts:
        pos_score = score_doc(doc, pos_words)
        neg_score = score_doc(doc, neg_words)
        if pos_score > neg_score:
            preds.append(1)
        elif pos_score < neg_score:
            preds.append(-1)
        else:
            preds.append(0) # don't know!
    return preds

In [7]:
# Classify every loaded document with two small hand-picked word lists
preds = makePreds(
    x,
    ['excellent', 'striking', 'brilliant'],
    ['boring', 'bad', 'terrible'],
)

In [8]:
# accuracy: share of documents whose prediction equals the label. Labels are
# only ever +1 or -1, so a "don't know" prediction (0) always counts as an error.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print('accuracy %s' % (sum(pred == label for pred, label in zip(preds, y)) / float(len(y))))

accuracy 0.287167891464


In [14]:
# share of documents for which the classifier abstained (prediction 0).
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print('no prediction %s' % (sum(pred == 0 for pred in preds) / float(len(preds))))

no prediction 0.624646693047


# Precision and recall #

In [15]:
# true positives: documents predicted positive whose label is positive as well
true_pos = len([1 for pred, label in zip(preds, y) if pred == label and pred == 1])

In [16]:
# number of documents the classifier predicted to be positive
pred_pos = len([pred for pred in preds if pred == 1])

In [17]:
# precision(+): of the documents predicted positive, the share that really are.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print('precision(+) %s' % (true_pos / float(pred_pos)))

precision(+) 0.9


In [18]:
# number of documents whose label is positive
tot_pos = len([label for label in y if label == 1])

In [19]:
# recall(+): of the documents labelled positive, the share predicted positive.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print('recall(+) %s' % (true_pos / float(tot_pos)))

recall(+) 0.154858299595


In [20]:
# true negatives: documents predicted negative whose label is negative as well
true_neg = len([1 for pred, label in zip(preds, y) if pred == label and pred == -1])

In [21]:
# number of documents the classifier predicted to be negative
pred_neg = len([pred for pred in preds if pred == -1])

In [22]:
# precision(-): of the documents predicted negative, the share that really are.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print('precision(-) %s' % (true_neg / float(pred_neg)))

precision(-) 0.718623481781


In [23]:
# number of documents whose label is negative
tot_neg = len([label for label in y if label == -1])

In [24]:
# recall(-): of the documents labelled negative, the share predicted negative.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print('recall(-) %s' % (true_neg / float(tot_neg)))

recall(-) 0.454545454545
