In [2]:
 # James D. Harris - IST 664 Final Project

In [111]:
import os
import sys
import random
import time
import pandas
import re

import nltk
from nltk.corpus import subjectivity
from nltk.corpus import sentence_polarity
from nltk.corpus import stopwords
from nltk.corpus.reader.api import *
from nltk.tokenize import *
from nltk.collocations import *
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

In [126]:
# Set directory path

dirPath = 'C:/Users/Jack Harris/! Syracuse Python/IST 664/kagglemoviereviews/corpus'

In [16]:
# Function: define features

def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [23]:
# Function: cross-validation

def cross_validation_PRF(num_folds, featuresets, labels):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    # for the number of labels - start the totals lists with zeroes
    num_labels = len(labels)
    total_precision_list = [0] * num_labels
    total_recall_list = [0] * num_labels
    total_F1_list = [0] * num_labels

    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round to produce the gold and predicted labels
        goldlist = []
        predictedlist = []
        for (features, label) in test_this_round:
            goldlist.append(label)
            predictedlist.append(classifier.classify(features))

        # computes evaluation measures for this fold and
        #   returns list of measures for each label
        print('Fold', i)
        (precision_list, recall_list, F1_list) \
                  = eval_measures(goldlist, predictedlist, labels)
        # take off triple string to print precision, recall and F1 for each fold
        '''
        print('\tPrecision\tRecall\t\tF1')
        # print measures for each label
        for i, lab in enumerate(labels):
            print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
              "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))
        '''
        # for each label add to the sums in the total lists
        for i in range(num_labels):
            # for each label, add the 3 measures to the 3 lists of totals
            total_precision_list[i] += precision_list[i]
            total_recall_list[i] += recall_list[i]
            total_F1_list[i] += F1_list[i]

    # find precision, recall and F measure averaged over all rounds for all labels
    # compute averages from the totals lists
    precision_list = [tot/num_folds for tot in total_precision_list]
    recall_list = [tot/num_folds for tot in total_recall_list]
    F1_list = [tot/num_folds for tot in total_F1_list]
    # the evaluation measures in a table with one row per label
    print('\nAverage Precision\tRecall\t\tF1 \tPer Label')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))
    
    # print macro average over all labels - treats each label equally
    print('\nMacro Average Precision\tRecall\t\tF1 \tOver All Labels')
    print('\t', "{:10.3f}".format(sum(precision_list)/num_labels), \
          "{:10.3f}".format(sum(recall_list)/num_labels), \
          "{:10.3f}".format(sum(F1_list)/num_labels))

    # for micro averaging, weight the scores for each label by the number of items
    #    this is better for labels with imbalance
    # first intialize a dictionary for label counts and then count them
    label_counts = {}
    for lab in labels:
      label_counts[lab] = 0 
    # count the labels
    for (doc, lab) in featuresets:
      label_counts[lab] += 1
    # make weights compared to the number of documents in featuresets
    num_docs = len(featuresets)
    label_weights = [(label_counts[lab] / num_docs) for lab in labels]
    print('\nLabel Counts', label_counts)
    #print('Label weights', label_weights)
    # print macro average over all labels
    print('Micro Average Precision\tRecall\t\tF1 \tOver All Labels')
    precision = sum([a * b for a,b in zip(precision_list, label_weights)])
    recall = sum([a * b for a,b in zip(recall_list, label_weights)])
    F1 = sum([a * b for a,b in zip(F1_list, label_weights)])
    print( '\t', "{:10.3f}".format(precision), \
      "{:10.3f}".format(recall), "{:10.3f}".format(F1))

In [24]:
# Function: compute precision, recall, and F1

def eval_measures(gold, predicted, labels):
    
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []

    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        # for small numbers, guard against dividing by zero in computing measures
        if (TP == 0) or (FP == 0) or (FN == 0):
          recall_list.append (0)
          precision_list.append (0)
          F1_list.append(0)
        else:
          recall = TP / (TP + FP)
          precision = TP / (TP + FN)
          recall_list.append(recall)
          precision_list.append(precision)
          F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    return (precision_list, recall_list, F1_list)

In [31]:
# Function: define bigram features

def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [35]:
# Function: write featuresets to CSV file

def writeFeatureSets(featuresets, outpath):
    # Open outpath for writing
    f = open(outpath, 'w')
    # Get the feature names from the feature dictionary in the first featureset
    featurenames = featuresets[0][0].keys()
    # Create the first list of the file as comma-separated feature names
    #   with the word class as the last feature name
    featurenameline = ''
    for featurename in featurenames:
        # Replace forbidden characters with text abbreviations
        featurename = featurename.replace(',','CM')
        featurename = featurename.replace("'","DQ")
        featurename = featurename.replace('"','QU')
        featurenameline += featurename + ','
        featurenameline += 'class'
    # Write this as the first line in the csv file
    f.write(featurenameline)
    f.write('\n')
    # Convert each feature set to a line in the file with comma separated feature values,
    # each feature value is converted to a string 
    #   for booleans this is the words true and false
    #   for numbers, this is the string with the number
    for featureset in featuresets:
        featureline = ''
        for key in featurenames:
            try:
                featureline += str(featureset[0].get(key, []))+','
            except KeyError:
                continue
        featureline += str(featureset[1])
        # Write each feature set values to the file
        f.write(featureline)
        f.write('\n')
    f.close()

In [60]:
# Function: define sentiment lexicon features

def readSubjectivity(path):
	flexicon = open(path, 'r')
	# initialize an empty dictionary
	sldict = { }
	for line in flexicon:
		fields = line.split()   # default is to split on whitespace
		# split each field on the '=' and keep the second part as the value
		strength = fields[0].split("=")[1]
		word = fields[2].split("=")[1]
		posTag = fields[3].split("=")[1]
		stemmed = fields[4].split("=")[1]
		polarity = fields[5].split("=")[1]
		if (stemmed == 'y'):
			isStemmed = True
		else:
			isStemmed = False
		# put a dictionary entry with the word as the keyword
		#     and a list of the other values
		sldict[word] = [strength, posTag, isStemmed, polarity]
	return sldict

In [64]:
# Function: define sentiment lexicon features

def SL_features(document, word_features, SL):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    # count variables for the 4 classes of subjectivity
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    for word in document_words:
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
            features['positivecount'] = weakPos + (2 * strongPos)
            features['negativecount'] = weakNeg + (2 * strongNeg)      
    return features

In [79]:
# Function: retrieve poslist and neglist from LIWC dictionary

def read_words():
  poslist = []
  neglist = []

  flexicon = open('M:/Education/Syracuse/IST 664 - Natural Language Processing/Final Project/kagglemoviereviews/SentimentLexicons/liwcdic2007.dic', encoding='latin1')
  # read all LIWC words from file
  wordlines = [line.strip() for line in flexicon]
  # each line has a word or a stem followed by * and numbers of the word classes it is in
  # word class 126 is positive emotion and 127 is negative emotion
  for line in wordlines:
    if not line == '':
      items = line.split()
      word = items[0]
      classes = items[1:]
      for c in classes:
        if c == '126':
          poslist.append( word )
        if c == '127':
          neglist.append( word )
  return (poslist, neglist)

poslist, neglist = read_words()

In [91]:
# Function: define LIWC sentiment lexicon features

def liwc_features(doc, word_features,poslist,neglist):
  doc_words = set(doc)
  features = {}
  for word in word_features:
    features['contains({})'.format(word)] = (word in doc_words)
  pos = 0
  neg = 0
  for word in doc_words:
    if isPresent(word,poslist):
      pos += 1
    if isPresent(word,neglist):
      neg += 1
    features['positivecount'] = pos
    features['negativecount'] = neg
  if 'positivecount' not in features:
    features['positivecount']=0
  if 'negativecount' not in features:
    features['negativecount']=0  
  return features

In [97]:
# Function: define a combination of SL and LIWC lexicons

def SL_liwc_features(doc, word_features, SL,poslist,neglist):
  document_words = set(doc)
  features = {}
  for word in word_features:
    features['contains({})'.format(word)] = (word in document_words)
  # count variables for the 4 classes of subjectivity
  weakPos = 0
  strongPos = 0
  weakNeg = 0
  strongNeg = 0
  for word in document_words:
    if isPresent(word,poslist):
      strongPos += 1
    elif isPresent(word,neglist):
      strongNeg += 1
    elif word in SL:
      strength, posTag, isStemmed, polarity = SL[word]
      if strength == 'weaksubj' and polarity == 'positive':
        weakPos += 1
      if strength == 'strongsubj' and polarity == 'positive':
        strongPos += 1
      if strength == 'weaksubj' and polarity == 'negative':
        weakNeg += 1
      if strength == 'strongsubj' and polarity == 'negative':
        strongNeg += 1
    features['positivecount'] = weakPos + (2 * strongPos)
    features['negativecount'] = weakNeg + (2 * strongNeg)
  
  if 'positivecount' not in features:
    features['positivecount']=0
  if 'negativecount' not in features:
    features['negativecount']=0      
  return features

In [100]:
# Function: define part-of-speech tagging features

def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [107]:
# Function: Representing negation

def NOT_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = False
        features['V_NOT{}'.format(word)] = False
    # go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
        else:
            features['V_{}'.format(word)] = (word in word_features)
    return features

negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

In [114]:
# Function: Bing Liu's Opinion Lexicon

dirPath = 'C:/Users/Jack Harris/! Syracuse Python/IST 664/kagglemoviereviews'

def read_opinionlexicon():
    POSITIVE_REVIEWS = 'C:/Users/Jack Harris/! Syracuse Python/IST 664/kagglemoviereviews/SentimentLexicons/rt-polarity-pos.txt'
    NEGATIVE_REVIEWS = 'C:/Users/Jack Harris/! Syracuse Python/IST 664/kagglemoviereviews/SentimentLexicons/rt-polarity-pos.txt'
    
    pos_features = []
    neg_features = []
    for line in open(POSITIVE_REVIEWS, 'r').readlines()[35:]:
        pos_words = re.findall(r"[\w']+|[.,!?;]", line.rstrip())
        pos_features.append(pos_words[0])
        
    for line in open(NEGATIVE_REVIEWS, 'r').readlines()[35:]:
        neg_words = re.findall(r"[\w']+|[.,!?;]", line.rstrip())
        neg_features.append(neg_words[0])
  
    return pos_features,neg_features

poslist2,neglist2 = read_opinionlexicon()

In [28]:
# I could not get this code to work when encapsulated in the processkaggle() function
# Featureset1: Bag of words / unigram (baseline)

vocab_size = 500
limit = int(10000)
os.chdir(dirPath)
f = open('./train.tsv', 'r')
phrasedata=[]
for line in f:
    if (not line.startswith('Phrase')):
        line = line.strip()
        phrasedata.append(line.split('\t')[2:4])
random.shuffle(phrasedata)
phraselist = phrasedata[:limit]
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))
word_items = all_words.most_common(vocab_size)
word_features = [word for (word, count) in word_items]
featuresets = [(document_features(d, word_features), c) for (d,c) in docs]

10227


In [26]:
label_list = [c for (d,c) in docs]
labels = list(set(label_list))
num_folds = 5

In [32]:
start = time.time()
cross_validation_PRF(num_folds, featuresets, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.219      0.174      0.194
1 	      0.210      0.358      0.265
2 	      0.815      0.616      0.702
3 	      0.240      0.407      0.302
4 	      0.175      0.254      0.207

Macro Average Precision	Recall		F1 	Over All Labels
	      0.332      0.362      0.334

Label Counts {0: 427, 1: 1744, 2: 5121, 3: 2158, 4: 550}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.525      0.487      0.490
125.82843351364136  seconds elapsed.


In [34]:
# Naive Bayes for featuresets(baseline)

train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.524

In [37]:
# Featureset2: bigram

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(all_words_list)
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)
# print(bigram_features[:50])
featuresets2 = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featuresets2, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.219      0.174      0.194
1 	      0.210      0.358      0.265
2 	      0.815      0.616      0.702
3 	      0.240      0.407      0.302
4 	      0.175      0.254      0.207

Macro Average Precision	Recall		F1 	Over All Labels
	      0.332      0.362      0.334

Label Counts {0: 427, 1: 1744, 2: 5121, 3: 2158, 4: 550}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.525      0.487      0.490
256.7317645549774  seconds elapsed.


In [39]:
# Naive Bayes for featuresets2(bigrams)

train_set, test_set = featuresets2[1000:], featuresets2[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.524

In [66]:
# Featureset3: Sentiment Lexicon

SLpath = "M:/Education/Syracuse/IST 664 - Natural Language Processing/Final Project/kagglemoviereviews/SentimentLexicons/subjclueslen1-HLTEMNLP05.tff"
SL = readSubjectivity(SLpath)

featureset3 = [(SL_features(d, word_features, SL), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset3, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.258      0.180      0.212
1 	      0.272      0.369      0.313
2 	      0.744      0.661      0.700
3 	      0.354      0.427      0.387
4 	      0.234      0.240      0.236

Macro Average Precision	Recall		F1 	Over All Labels
	      0.373      0.375      0.370

Label Counts {0: 427, 1: 1744, 2: 5121, 3: 2158, 4: 550}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.529      0.516      0.519
121.68763709068298  seconds elapsed.


In [67]:
# Naive Bayes for featureset3 (sentiment lexicon)

train_set, test_set = featureset3[1000:], featureset3[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.532

In [93]:
# Featureset4 : LIWC

featureset4 = [(liwc_features(d, word_features,poslist,neglist), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset4, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.230      0.182      0.203
1 	      0.220      0.374      0.276
2 	      0.813      0.631      0.711
3 	      0.270      0.429      0.331
4 	      0.214      0.251      0.230

Macro Average Precision	Recall		F1 	Over All Labels
	      0.349      0.374      0.350

Label Counts {0: 427, 1: 1744, 2: 5121, 3: 2158, 4: 550}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.535      0.503      0.505
121.92599964141846  seconds elapsed.


In [94]:
# Naive Bayes for featureset4 (LIWC sentiment lexicon)

train_set, test_set = featureset4[1000:], featureset4[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.534

In [98]:
# Featureset5: Combination SL and LIWC

featureset5 = [(SL_liwc_features(d, word_features, SL, poslist, neglist), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset5, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.239      0.184      0.207
1 	      0.235      0.384      0.291
2 	      0.804      0.639      0.712
3 	      0.291      0.438      0.349
4 	      0.218      0.247      0.230

Macro Average Precision	Recall		F1 	Over All Labels
	      0.357      0.378      0.358

Label Counts {0: 427, 1: 1744, 2: 5121, 3: 2158, 4: 550}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.538      0.510      0.512
121.13510990142822  seconds elapsed.


In [99]:
# Naive Bayes for featureset5 (combination SL and LIWC)

train_set, test_set = featureset5[1000:], featureset5[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.535

In [102]:
# Featureset 6: Part-of-speech tagging

featureset6 = [(POS_features(d, word_features), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset6, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.287      0.160      0.205
1 	      0.213      0.351      0.265
2 	      0.794      0.624      0.699
3 	      0.218      0.398      0.282
4 	      0.190      0.223      0.204

Macro Average Precision	Recall		F1 	Over All Labels
	      0.340      0.351      0.331

Label Counts {0: 427, 1: 1744, 2: 5121, 3: 2158, 4: 550}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.513      0.486      0.485
124.73035550117493  seconds elapsed.


In [104]:
# Naive Bayes for featureset6 (part-of-speech tagging)

train_set, test_set = featureset6[1000:], featureset6[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.514

In [108]:
# Featureset 7: Representing negation

featureset7 = [(NOT_features(d, word_features, negationwords), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset7, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.492      0.139      0.217
1 	      0.216      0.364      0.271
2 	      0.673      0.685      0.679
3 	      0.274      0.422      0.332
4 	      0.344      0.186      0.241

Macro Average Precision	Recall		F1 	Over All Labels
	      0.400      0.359      0.348

Label Counts {0: 427, 1: 1744, 2: 5121, 3: 2158, 4: 550}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.481      0.522      0.489
262.31035470962524  seconds elapsed.


In [109]:
# Naive Bayes for featureset7 (representing negation)

train_set, test_set = featureset7[1000:], featureset7[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.505

In [118]:
# Featureset 8: Using Bing Liu's Opinion Lexicon, obtained at:
#   https://www.cs.uic.edu/~liub/

featureset8 = [(liwc_features(d, word_features,poslist2,neglist2), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset8, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.264      0.160      0.199
1 	      0.220      0.349      0.269
2 	      0.787      0.622      0.695
3 	      0.223      0.394      0.285
4 	      0.204      0.232      0.216

Macro Average Precision	Recall		F1 	Over All Labels
	      0.340      0.351      0.333

Label Counts {0: 427, 1: 1744, 2: 5121, 3: 2158, 4: 550}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.512      0.484      0.485
120.99084901809692  seconds elapsed.


In [119]:
# Naive Bayes for featureset8 (Bing Liu's Opinion Lexicon)

train_set, test_set = featureset8[1000:], featureset8[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.513

In [127]:
# Featureset1: Bag of words / unigram (baseline)

vocab_size = 1000
limit = int(10000)
os.chdir(dirPath)
f = open('./train.tsv', 'r')
phrasedata=[]
for line in f:
    if (not line.startswith('Phrase')):
        line = line.strip()
        phrasedata.append(line.split('\t')[2:4])
random.shuffle(phrasedata)
phraselist = phrasedata[:limit]
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))
word_items = all_words.most_common(vocab_size)
word_features = [word for (word, count) in word_items]
featuresets = [(document_features(d, word_features), c) for (d,c) in docs]

10445


In [128]:
# Featureset 1: Bag of words / unigram (baseline), vocabulary size 1000

start = time.time()
cross_validation_PRF(num_folds, featuresets, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.285      0.186      0.224
1 	      0.231      0.367      0.283
2 	      0.811      0.629      0.708
3 	      0.253      0.425      0.317
4 	      0.154      0.247      0.189

Macro Average Precision	Recall		F1 	Over All Labels
	      0.347      0.371      0.344

Label Counts {0: 439, 1: 1711, 2: 5150, 3: 2103, 4: 597}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.532      0.499      0.501
254.00768756866455  seconds elapsed.


In [129]:
# Naive Bayes for featuresets(baseline), vocab 1000

train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.532

In [130]:
# Featureset2: bigram, vocab 1000

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(all_words_list)
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)
# print(bigram_features[:50])
featuresets2 = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featuresets2, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.285      0.186      0.224
1 	      0.231      0.367      0.283
2 	      0.811      0.629      0.708
3 	      0.253      0.425      0.317
4 	      0.154      0.247      0.189

Macro Average Precision	Recall		F1 	Over All Labels
	      0.347      0.371      0.344

Label Counts {0: 439, 1: 1711, 2: 5150, 3: 2103, 4: 597}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.532      0.499      0.501
406.28346014022827  seconds elapsed.


In [131]:
# Naive Bayes for featuresets2(bigrams), vocab 1000

train_set, test_set = featuresets2[1000:], featuresets2[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.532

In [132]:
# Featureset3: Sentiment Lexicon, vocab 1000

SLpath = "M:/Education/Syracuse/IST 664 - Natural Language Processing/Final Project/kagglemoviereviews/SentimentLexicons/subjclueslen1-HLTEMNLP05.tff"
SL = readSubjectivity(SLpath)

featureset3 = [(SL_features(d, word_features, SL), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset3, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.306      0.195      0.237
1 	      0.266      0.362      0.307
2 	      0.754      0.671      0.711
3 	      0.352      0.429      0.387
4 	      0.242      0.266      0.253

Macro Average Precision	Recall		F1 	Over All Labels
	      0.384      0.385      0.379

Label Counts {0: 439, 1: 1711, 2: 5150, 3: 2103, 4: 597}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.536      0.522      0.525
271.8794481754303  seconds elapsed.


In [133]:
# Naive Bayes for featureset3 (sentiment lexicon), vocab 1000

train_set, test_set = featureset3[1000:], featureset3[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.537

In [134]:
# Featureset4 : LIWC, vocab 1000

featureset4 = [(liwc_features(d, word_features,poslist,neglist), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset4, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.292      0.196      0.233
1 	      0.223      0.368      0.277
2 	      0.806      0.642      0.714
3 	      0.292      0.439      0.351
4 	      0.198      0.265      0.226

Macro Average Precision	Recall		F1 	Over All Labels
	      0.362      0.382      0.360

Label Counts {0: 439, 1: 1711, 2: 5150, 3: 2103, 4: 597}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.539      0.510      0.513
282.2764194011688  seconds elapsed.


In [135]:
# Naive Bayes for featureset4 (LIWC sentiment lexicon), vocab 1000

train_set, test_set = featureset4[1000:], featureset4[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.535

In [136]:
# Featureset5: Combination SL and LIWC, vocab 1000

featureset5 = [(SL_liwc_features(d, word_features, SL, poslist, neglist), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset5, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.294      0.197      0.235
1 	      0.238      0.390      0.295
2 	      0.806      0.650      0.720
3 	      0.307      0.443      0.363
4 	      0.217      0.276      0.242

Macro Average Precision	Recall		F1 	Over All Labels
	      0.372      0.391      0.371

Label Counts {0: 439, 1: 1711, 2: 5150, 3: 2103, 4: 597}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.546      0.520      0.522
273.61738204956055  seconds elapsed.


In [137]:
# Naive Bayes for featureset5 (combination SL and LIWC), vocab 1000

train_set, test_set = featureset5[1000:], featureset5[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.541

In [138]:
# Featureset 6: Part-of-speech tagging, vocab 1000

featureset6 = [(POS_features(d, word_features), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset6, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.328      0.170      0.223
1 	      0.220      0.349      0.270
2 	      0.799      0.636      0.708
3 	      0.228      0.416      0.295
4 	      0.179      0.234      0.202

Macro Average Precision	Recall		F1 	Over All Labels
	      0.351      0.361      0.340

Label Counts {0: 439, 1: 1711, 2: 5150, 3: 2103, 4: 597}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.522      0.496      0.495
275.3955705165863  seconds elapsed.


In [139]:
# Naive Bayes for featureset6 (part-of-speech tagging), vocab 1000

train_set, test_set = featureset6[1000:], featureset6[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.518

In [140]:
# Featureset 7: Representing negation, vocab 1000

featureset7 = [(NOT_features(d, word_features, negationwords), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset7, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.459      0.160      0.236
1 	      0.256      0.391      0.309
2 	      0.725      0.680      0.702
3 	      0.260      0.419      0.320
4 	      0.309      0.224      0.259

Macro Average Precision	Recall		F1 	Over All Labels
	      0.402      0.375      0.365

Label Counts {0: 439, 1: 1711, 2: 5150, 3: 2103, 4: 597}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.510      0.526      0.508
563.2704789638519  seconds elapsed.


In [141]:
# Naive Bayes for featureset7 (representing negation), vocab 1000

train_set, test_set = featureset7[1000:], featureset7[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.492

In [142]:
# Featureset 8: Using Bing Liu's Opinion Lexicon, vocab 1000

featureset8 = [(liwc_features(d, word_features,poslist2,neglist2), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset8, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.322      0.171      0.222
1 	      0.211      0.346      0.262
2 	      0.789      0.633      0.702
3 	      0.240      0.405      0.301
4 	      0.187      0.240      0.209

Macro Average Precision	Recall		F1 	Over All Labels
	      0.350      0.359      0.339

Label Counts {0: 439, 1: 1711, 2: 5150, 3: 2103, 4: 597}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.518      0.492      0.492
262.74332761764526  seconds elapsed.


In [143]:
# Naive Bayes for featureset8 (Bing Liu's Opinion Lexicon), vocab 1000

train_set, test_set = featureset8[1000:], featureset8[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.512

In [146]:
# Featureset1: Bag of words / unigram (baseline), vocab 500, 10 folds

vocab_size = 500
limit = int(10000)
os.chdir(dirPath)
f = open('./train.tsv', 'r')
phrasedata=[]
for line in f:
    if (not line.startswith('Phrase')):
        line = line.strip()
        phrasedata.append(line.split('\t')[2:4])
random.shuffle(phrasedata)
phraselist = phrasedata[:limit]
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))
word_items = all_words.most_common(vocab_size)
word_features = [word for (word, count) in word_items]
featuresets = [(document_features(d, word_features), c) for (d,c) in docs]

num_folds = 10
start = time.time()
cross_validation_PRF(num_folds, featuresets, labels)
end = time.time()
print(end-start, " seconds elapsed.")

10338
Each fold size: 1000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

Average Precision	Recall		F1 	Per Label
0 	      0.282      0.196      0.230
1 	      0.182      0.349      0.239
2 	      0.822      0.623      0.709
3 	      0.254      0.387      0.307
4 	      0.197      0.356      0.252

Macro Average Precision	Recall		F1 	Over All Labels
	      0.348      0.382      0.347

Label Counts {0: 442, 1: 1710, 2: 5087, 3: 2130, 4: 631}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.528      0.490      0.493
191.06860423088074  seconds elapsed.


In [147]:
# Naive Bayes for featuresets(baseline), vocab 500, 10 folds

train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.529

In [144]:
# Featureset1: Bag of words / unigram (baseline), vocab 1000, 10 folds

vocab_size = 1000
limit = int(10000)
os.chdir(dirPath)
f = open('./train.tsv', 'r')
phrasedata=[]
for line in f:
    if (not line.startswith('Phrase')):
        line = line.strip()
        phrasedata.append(line.split('\t')[2:4])
random.shuffle(phrasedata)
phraselist = phrasedata[:limit]
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))
word_items = all_words.most_common(vocab_size)
word_features = [word for (word, count) in word_items]
featuresets = [(document_features(d, word_features), c) for (d,c) in docs]

num_folds = 10
start = time.time()
cross_validation_PRF(num_folds, featuresets, labels)
end = time.time()
print(end-start, " seconds elapsed.")

10340
Each fold size: 1000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

Average Precision	Recall		F1 	Per Label
0 	      0.239      0.180      0.204
1 	      0.251      0.370      0.298
2 	      0.822      0.634      0.716
3 	      0.267      0.452      0.336
4 	      0.182      0.288      0.220

Macro Average Precision	Recall		F1 	Over All Labels
	      0.352      0.385      0.355

Label Counts {0: 434, 1: 1745, 2: 5076, 3: 2160, 4: 585}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.540      0.509      0.510
409.29962730407715  seconds elapsed.


In [145]:
# Naive Bayes for featuresets(baseline), vocab 1000, 10 folds

train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.538

In [148]:
# Featureset1: Bag of words / unigram (baseline), vocab 1500, 10 folds

vocab_size = 1500
limit = int(10000)
os.chdir(dirPath)
f = open('./train.tsv', 'r')
phrasedata=[]
for line in f:
    if (not line.startswith('Phrase')):
        line = line.strip()
        phrasedata.append(line.split('\t')[2:4])
random.shuffle(phrasedata)
phraselist = phrasedata[:limit]
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))
word_items = all_words.most_common(vocab_size)
word_features = [word for (word, count) in word_items]
featuresets = [(document_features(d, word_features), c) for (d,c) in docs]

num_folds = 10
start = time.time()
cross_validation_PRF(num_folds, featuresets, labels)
end = time.time()
print(end-start, " seconds elapsed.")

10417
Each fold size: 1000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

Average Precision	Recall		F1 	Per Label
0 	      0.194      0.157      0.172
1 	      0.260      0.396      0.313
2 	      0.816      0.619      0.704
3 	      0.254      0.435      0.320
4 	      0.217      0.300      0.250

Macro Average Precision	Recall		F1 	Over All Labels
	      0.348      0.381      0.352

Label Counts {0: 425, 1: 1858, 2: 4980, 3: 2144, 4: 593}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.530      0.500      0.499
594.736902475357  seconds elapsed.


In [149]:
# Naive Bayes for featuresets(baseline), vocab 1500, 10 folds

train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.552