### Imports

In [1]:
import random
import nltk
import csv
from nltk.corpus import stopwords
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

### Read in the Train and Test .tsv from the directory

In [2]:
train = open("final_train.tsv", 'r')

In [3]:
test = open("final_test.tsv", 'r')

## Start cleaning the data

In [4]:
# loop over lines in the file and use the first limit of them
phrasedata = []
for x in train:
    # ignore the first line starting with Phrase and read all lines
    if (not x.startswith('Phrase')):
        x = x.strip()
        phrasedata.append(x.split('\t')[2:4])

In [5]:
phrasedata[:15]

[['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
  '1'],
 ['A series of escapades demonstrating the adage that what is good for the goose',
  '2'],
 ['A series', '2'],
 ['A', '2'],
 ['series', '2'],
 ['of escapades demonstrating the adage that what is good for the goose', '2'],
 ['of', '2'],
 ['escapades demonstrating the adage that what is good for the goose', '2'],
 ['escapades', '2'],
 ['demonstrating the adage that what is good for the goose', '2'],
 ['demonstrating the adage', '2'],
 ['demonstrating', '2'],
 ['the adage', '2'],
 ['the', '2'],
 ['adage', '2']]

In [6]:
 # pick a random sample of length limit because of phrase overlapping sequences
random.shuffle(phrasedata)
phraselist = phrasedata[:5000]

In [7]:
print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')

Read 156060 phrases, using 5000 random phrases


## Tokenize and Lower

In [8]:
# create list of phrase documents as (list of words, label)
phrasedocs = []

# add all the phrases

# each phrase has a list of tokens and the sentiment label (from 0 to 4)
# bin to only 3 categories for better performance
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))

In [9]:
# lowercase - each phrase is a pair consisting of a token list and a label
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append (lowerphrase)
# print a few
for phrase in docs[:10]:
    print (phrase)

(['the', 'boys', "'"], 2)
(['turbulent', 'times'], 2)
(['pandering'], 1)
(['of', 'the', 'year'], 2)
(['while', 'somewhat', 'less', 'than', 'it', 'might', 'have', 'been', ',', 'the', 'film', 'is', 'a', 'good', 'one', ',', 'and', 'you', "'ve", 'got', 'to', 'hand', 'it', 'to', 'director', 'george', 'clooney', 'for', 'biting', 'off', 'such', 'a', 'big', 'job', 'the', 'first', 'time', 'out', '.'], 3)
(['that', 'would', 'make', 'this', 'a', 'moving', 'experience', 'for', 'people', 'who', 'have', "n't", 'read', 'the', 'book'], 3)
(["'s", 'a', 'sharp', 'movie', 'about', 'otherwise', 'dull', 'subjects'], 3)
(['flaws', ',', 'but', 'also', 'stretches', 'of', 'impact', 'and', 'moments', 'of', 'awe'], 2)
(['you', "'re", 'depressed', 'about', 'anything', 'before', 'watching', 'this', 'film'], 2)
(['plot-lines'], 2)


## Create a list of words and word features

In [10]:
# continue as usual to get all words and create word features
all_words_list = [word for (sent,cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))

7379


In [13]:
# get the 1500 most frequently appearing keywords in the corpus
word_items = all_words.most_common(1500)
word_features = [word for (word,count) in word_items]

In [16]:
for x in word_items[:25]:
    print(x[0], "\t", x[1])

the 	 1573
, 	 1302
a 	 1113
and 	 1049
of 	 1029
to 	 690
. 	 551
's 	 504
is 	 441
in 	 417
that 	 391
it 	 369
as 	 272
with 	 253
for 	 241
its 	 233
an 	 206
film 	 190
movie 	 190
you 	 172
but 	 170
this 	 167
on 	 155
be 	 145
his 	 125


## Create stopword list and remove stopwords

In [41]:
stopwords = nltk.corpus.stopwords.words('english')

In [42]:
stopwords.extend([',', '.',"'s", "'", '--', '``', '...', '`', '-rrb-', '-lrb-', ':'])

In [43]:
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

In [44]:
negationwords.extend(['ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'])

In [45]:
newstopwords = [word for word in stopwords if word not in negationwords]
new_all_words_list = [word for (sent,cat) in docs for word in sent if word not in newstopwords]

## Create new wordlist and features with the stopwords removed

In [46]:
# continue to define a new all words dictionary, get the 1500 most common as new_word_features
new_all_words = nltk.FreqDist(new_all_words_list)
new_word_items = new_all_words.most_common(1500)

In [47]:
new_word_features = [word for (word,count) in new_word_items]

In [49]:
for x in new_word_items[:25]:
    print(x[0], "\t\t", x[1])

film 		 190
movie 		 190
not 		 113
n't 		 106
like 		 103
one 		 102
story 		 73
comedy 		 62
much 		 62
good 		 61
characters 		 60
time 		 54
funny 		 54
make 		 53
love 		 49
even 		 49
never 		 48
us 		 48
way 		 46
work 		 46
no 		 44
little 		 43
audience 		 40
enough 		 39
movies 		 39


## Create Bigram Features

In [50]:
finder = BigramCollocationFinder.from_words(all_words_list)
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)

## Document Features Function

In [51]:
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

## Negation Features Function

In [52]:
def NOT_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = False
        features['V_NOT{}'.format(word)] = False
    # go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
        else:
            features['V_{}'.format(word)] = (word in word_features)
    return features

## Part of Speech Features Function

In [53]:
def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

## Cross Validation Function

In [54]:
def cross_validation_PRF(num_folds, featuresets, labels):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    # for the number of labels - start the totals lists with zeroes
    num_labels = len(labels)
    total_precision_list = [0] * num_labels
    total_recall_list = [0] * num_labels
    total_F1_list = [0] * num_labels

    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round to produce the gold and predicted labels
        goldlist = []
        predictedlist = []
        for (features, label) in test_this_round:
            goldlist.append(label)
            predictedlist.append(classifier.classify(features))

        # computes evaluation measures for this fold and
        #   returns list of measures for each label
        print('Fold', i)
        (precision_list, recall_list, F1_list) \
                  = eval_measures(goldlist, predictedlist, labels)
        for i in range(num_labels):
            # for each label, add the 3 measures to the 3 lists of totals
            total_precision_list[i] += precision_list[i]
            total_recall_list[i] += recall_list[i]
            total_F1_list[i] += F1_list[i]

    # find precision, recall and F measure averaged over all rounds for all labels
    # compute averages from the totals lists
    precision_list = [tot/num_folds for tot in total_precision_list]
    recall_list = [tot/num_folds for tot in total_recall_list]
    F1_list = [tot/num_folds for tot in total_F1_list]
    # the evaluation measures in a table with one row per label
    print('\nAverage Precision\tRecall\t\tF1 \tPer Label')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))
    
    # print macro average over all labels - treats each label equally
    print('\nMacro Average Precision\tRecall\t\tF1 \tOver All Labels')
    print('\t', "{:10.3f}".format(sum(precision_list)/num_labels), \
          "{:10.3f}".format(sum(recall_list)/num_labels), \
          "{:10.3f}".format(sum(F1_list)/num_labels))

    # for micro averaging, weight the scores for each label by the number of items
    #    this is better for labels with imbalance
    # first intialize a dictionary for label counts and then count them
    label_counts = {}
    for lab in labels:
      label_counts[lab] = 0 
    # count the labels
    for (doc, lab) in featuresets:
      label_counts[lab] += 1
    # make weights compared to the number of documents in featuresets
    num_docs = len(featuresets)
    label_weights = [(label_counts[lab] / num_docs) for lab in labels]
    print('\nLabel Counts', label_counts)
    #print('Label weights', label_weights)
    # print macro average over all labels
    print('Micro Average Precision\tRecall\t\tF1 \tOver All Labels')
    precision = sum([a * b for a,b in zip(precision_list, label_weights)])
    recall = sum([a * b for a,b in zip(recall_list, label_weights)])
    F1 = sum([a * b for a,b in zip(F1_list, label_weights)])
    print( '\t', "{:10.3f}".format(precision), \
      "{:10.3f}".format(recall), "{:10.3f}".format(F1))
    

## Evaluation Measures Function

In [56]:
def eval_measures(gold, predicted, labels):
    
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []

    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        # for small numbers, guard against dividing by zero in computing measures
        if (TP == 0) or (FP == 0) or (FN == 0):
          recall_list.append (0)
          precision_list.append (0)
          F1_list.append(0)
        else:
          recall = TP / (TP + FP)
          precision = TP / (TP + FN)
          recall_list.append(recall)
          precision_list.append(precision)
          F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    return (precision_list, recall_list, F1_list)

## Bigram Document Features Function

In [57]:
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [58]:
  # feature sets from feature definition functions above
featuresets = [(document_features(d, word_features), c) for (d, c) in docs]
negfeaturesets = [(NOT_features(d, word_features, negationwords), c) for (d, c) in docs]
bigramfeaturesets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in docs]
POSfeaturesets = [(POS_features(d, new_word_features), c) for (d, c) in docs]
# train classifier and show performance in cross-validation
# make a list of labels
label_list = [c for (d,c) in docs]
labels = list(set(label_list))    # gets only unique labels
num_folds = 10

In [68]:
featuresets[:1]

[({'V_the': True,
   'V_,': False,
   'V_a': False,
   'V_and': False,
   'V_of': False,
   'V_to': False,
   'V_.': False,
   "V_'s": False,
   'V_is': False,
   'V_in': False,
   'V_that': False,
   'V_it': False,
   'V_as': False,
   'V_with': False,
   'V_for': False,
   'V_its': False,
   'V_an': False,
   'V_film': False,
   'V_movie': False,
   'V_you': False,
   'V_but': False,
   'V_this': False,
   'V_on': False,
   'V_be': False,
   'V_his': False,
   'V_more': False,
   'V_has': False,
   'V_--': False,
   'V_by': False,
   'V_not': False,
   'V_than': False,
   'V_at': False,
   "V_n't": False,
   "V_'": True,
   'V_all': False,
   'V_like': False,
   'V_one': False,
   'V_about': False,
   'V_or': False,
   'V_from': False,
   'V_have': False,
   'V_``': False,
   'V_are': False,
   'V_-rrb-': False,
   'V_so': False,
   'V_most': False,
   'V_-lrb-': False,
   'V_`': False,
   'V_story': False,
   'V_out': False,
   'V_their': False,
   'V_comedy': False,
   'V_much': Fa

In [69]:
negfeaturesets[:1]

[({'V_the': True,
   'V_NOTthe': False,
   'V_,': False,
   'V_NOT,': False,
   'V_a': False,
   'V_NOTa': False,
   'V_and': False,
   'V_NOTand': False,
   'V_of': False,
   'V_NOTof': False,
   'V_to': False,
   'V_NOTto': False,
   'V_.': False,
   'V_NOT.': False,
   "V_'s": False,
   "V_NOT's": False,
   'V_is': False,
   'V_NOTis': False,
   'V_in': False,
   'V_NOTin': False,
   'V_that': False,
   'V_NOTthat': False,
   'V_it': False,
   'V_NOTit': False,
   'V_as': False,
   'V_NOTas': False,
   'V_with': False,
   'V_NOTwith': False,
   'V_for': False,
   'V_NOTfor': False,
   'V_its': False,
   'V_NOTits': False,
   'V_an': False,
   'V_NOTan': False,
   'V_film': False,
   'V_NOTfilm': False,
   'V_movie': False,
   'V_NOTmovie': False,
   'V_you': False,
   'V_NOTyou': False,
   'V_but': False,
   'V_NOTbut': False,
   'V_this': False,
   'V_NOTthis': False,
   'V_on': False,
   'V_NOTon': False,
   'V_be': False,
   'V_NOTbe': False,
   'V_his': False,
   'V_NOThis': Fal

In [70]:
bigramfeaturesets[:1]

[({'V_the': True,
   'V_,': False,
   'V_a': False,
   'V_and': False,
   'V_of': False,
   'V_to': False,
   'V_.': False,
   "V_'s": False,
   'V_is': False,
   'V_in': False,
   'V_that': False,
   'V_it': False,
   'V_as': False,
   'V_with': False,
   'V_for': False,
   'V_its': False,
   'V_an': False,
   'V_film': False,
   'V_movie': False,
   'V_you': False,
   'V_but': False,
   'V_this': False,
   'V_on': False,
   'V_be': False,
   'V_his': False,
   'V_more': False,
   'V_has': False,
   'V_--': False,
   'V_by': False,
   'V_not': False,
   'V_than': False,
   'V_at': False,
   "V_n't": False,
   "V_'": True,
   'V_all': False,
   'V_like': False,
   'V_one': False,
   'V_about': False,
   'V_or': False,
   'V_from': False,
   'V_have': False,
   'V_``': False,
   'V_are': False,
   'V_-rrb-': False,
   'V_so': False,
   'V_most': False,
   'V_-lrb-': False,
   'V_`': False,
   'V_story': False,
   'V_out': False,
   'V_their': False,
   'V_comedy': False,
   'V_much': Fa

In [71]:
POSfeaturesets[:1]

[({'contains(film)': False,
   'contains(movie)': False,
   'contains(not)': False,
   "contains(n't)": False,
   'contains(like)': False,
   'contains(one)': False,
   'contains(story)': False,
   'contains(comedy)': False,
   'contains(much)': False,
   'contains(good)': False,
   'contains(characters)': False,
   'contains(time)': False,
   'contains(funny)': False,
   'contains(make)': False,
   'contains(love)': False,
   'contains(even)': False,
   'contains(never)': False,
   'contains(us)': False,
   'contains(way)': False,
   'contains(work)': False,
   'contains(no)': False,
   'contains(little)': False,
   'contains(audience)': False,
   'contains(enough)': False,
   'contains(movies)': False,
   'contains(director)': False,
   'contains(people)': False,
   'contains(films)': False,
   'contains(plot)': False,
   'contains(could)': False,
   'contains(life)': False,
   'contains(new)': False,
   'contains(something)': False,
   'contains(action)': False,
   'contains(many)':

In [74]:
#Unigram or Bag-of-Words (BOW) features; this is the baseline:
print("\nOriginal Featureset")
cross_validation_PRF(num_folds, featuresets, labels)
#Bigrams
print("\nBigrams Featureset")
cross_validation_PRF(num_folds, bigramfeaturesets, labels)
#Negation -- SHOULD BE CREATED
print("\nNegated Featureset")
cross_validation_PRF(num_folds, negfeaturesets, labels)
#POS features -- SHOULD BE CREATED
print("\nPOS Featureset")
cross_validation_PRF(num_folds, POSfeaturesets, labels)


Original Featureset
Each fold size: 500
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

Average Precision	Recall		F1 	Per Label
0 	      0.137      0.145      0.139
1 	      0.218      0.333      0.262
2 	      0.825      0.612      0.702
3 	      0.257      0.430      0.321
4 	      0.197      0.308      0.236

Macro Average Precision	Recall		F1 	Over All Labels
	      0.327      0.366      0.332

Label Counts {0: 226, 1: 895, 2: 2491, 3: 1102, 4: 286}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.524      0.483      0.487

Bigrams Featureset
Each fold size: 500
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

Average Precision	Recall		F1 	Per Label
0 	      0.137      0.145      0.139
1 	      0.218      0.333      0.262
2 	      0.825      0.612      0.702
3 	      0.257      0.430      0.321
4 	      0.197      0.308      0.236

Macro Average Precision	Recall		F1 	Over All Labels
	      0.327      0.366      0.332

Label 