In [1]:
import os
import sys
import random
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.collocations import *
from nltk.metrics import ConfusionMatrix
import re

## Data Preprocessing 
- remove stopwords from NLTK corpus and some negation words 
- lowercase all words
- used regular expression to remove all non-alphabetes

In [2]:
stopwords = nltk.corpus.stopwords.words('english')
modStopwords = [word for word in stopwords if word not in ['not', 'no', 'can', 'don', 't']]

def preProcessdata(data):
    wordList = re.split('\s+', data.lower())
    punctuation = re.compile(r'[-.?!/\%@,":;()|0-9]')
    wordList = [punctuation.sub("", word) for word in wordList]
    finalWordList = []
    for word in wordList:
        if word not in modStopwords:
            finalWordList.append(word)
    res = " ".join(finalWordList)
    return res


In [3]:
def generateWords(data):
    wordDict = []
    for (words, sentiment) in data:
        possible_words = [x for x in words if len(x) >= 3]
        wordDict.extend(possible_words)
    return wordDict


def generateWordsnormal(data):
    wordDict = []
    for (words, sentiment) in data:
        wordDict.extend(words)
    return wordDict


def generateallWords(data):
    wordDict = []
    for id, words in data:
        wordDict.extend(words)
    return wordDict

## Define Featuresets
The main idea of this section is to use unigram and bigram feature to train and test a Naive Bayes Classifier 
- Use bag-of-word method (unigram feature) to find the top 200 most frequent words, and define them as wordFeatures 
- Also define a bigram feature. I will use BoW model as the baseline for model accuracy, and see what methods can help improve model accuracy comparing BoW model
- For unigram feature, label each keywrod in wrodFeatures as 'V_keyword' also labeled bigram features as V_{}, B_{} _{}.
- In addition to BOW and bigram featureset, combine two featureset  'combinedfeatureset' as a new featureset to test out if can improve model accuracy

In [49]:
def bagofWords(wordList):
    wordlist = nltk.FreqDist(wordList)
    wordFeatures = [w for (w, c) in wordlist.most_common(200)]
    return wordFeatures


def unigram_features(data, wordFeatures):
    documentWords = set(data)
    features = {}
    for word in wordFeatures:
        features['V_{}'.format(word)] = (word in documentWords)
    return features

def bigram_finder(wordList):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(wordList, window_size=3)
    bigram_features = finder.nbest(bigram_measures.chi_sq, 3000)
    return bigram_features[:500]

def bigram_features(data, wordFeatures, bigramFeatures):
    document_words = set(data)
    document_bigrams = nltk.bigrams(data)
    features = {}
    for word in wordFeatures:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigramFeatures:
        features['B_{}{})'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)
    return features





In [64]:
def generateCSV(featuresets, filePath):
    filepath='/Users/jenniferchen/Desktop'
    f = open(filePath, 'w')
    featurenames = featuresets[0][0].keys()
    featurenameline = ''
    for featurename in featurenames:
        featurename = featurename.replace(',', 'CM')
        featurename = featurename.replace("'", "DQ")
        featurename = featurename.replace('"', 'QU')
        featurenameline += featurename + ','
    featurenameline += 'class'
    f.write(featurenameline)
    f.write('\n')
    for featureset in featuresets:
        featureline = ''
        for key in featurenames:
            featureline += str(featureset[0][key]) + ','
        if featureset[1] == 0:
            featureline += str("neg")
        elif featureset[1] == 1:
            featureline += str("sneg")
        elif featureset[1] == 2:
            featureline += str("neu")
        elif featureset[1] == 3:
            featureline += str("spos")
        elif featureset[1] == 4:
            featureline += str("pos")
        f.write(featureline)
        f.write('\n')
    f.close()

In [38]:
def combined_document_features(document, word_features, bigram_features, SL):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in document_words:
        posword = 0
        neutword = 0
        negword = 0
        for word in document_words:
            if word in SL[0]:
                posword += 1
            if word in SL[1]:
                neutword += 1
            if word in SL[2]:
                negword += 1
            features['positivecount'] = posword
            features['neutralcount'] = neutword
            features['negativecount'] = negword
        for word in word_features:
            features['V_{}'.format(word)] = False
            features['V_NOT{}'.format(word)] = False
        for bigram in bigram_features:
            features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)
        return features

- Use Naive Bayes Classifier to train on the featuresets defined above.

In [53]:
def generateAccuracy(featuresets):
    training_size = int(0.1 * len(featuresets))
    test_set = featuresets[:training_size]
    training_set = featuresets[training_size:]
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    print("Accuracy : ")
    print(nltk.classify.accuracy(classifier, test_set))
    generateMatrix(classifier, test_set)
    print("")

## Subjectivity Lexicon

In [51]:
def readSubjectivity(filePath):
    flexicon = open(filePath, 'r')
    slex = {}
    for line in flexicon:
        fields = line.split()
        strength = fields[0].split("=")[1]
        word = fields[2].split("=")[1]
        posTag = fields[3].split("=")[1]
        stemmed = fields[4].split("=")[1]
        polarity = fields[5].split("=")[1]
        if (stemmed == 'y'):
            isStemmed = True
        else:
            isStemmed = False
        slex[word] = [strength, posTag, isStemmed, polarity]
    return slex


SLpath = "/Users/jenniferchen/Desktop/subjclueslen1-HLTEMNLP05.tff"
SL = readSubjectivity(SLpath)


def SL_features(document, word_features, SL):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    for word in document_words:
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
            features['positivecount'] = weakPos + (2 * strongPos)
            features['negativecount'] = weakNeg + (2 * strongNeg)
    if 'positivecount' not in features:
        features['positivecount'] = 0
    if 'negativecount' not in features:
        features['negativecount'] = 0
    return features

In [52]:
def processkaggle(dirPath, limitStr):
    limit = 10000
    #os.chdir(dirPath)
    f = open('./train.tsv', 'r')
    sentenceData = []
    for line in f:
        if (not line.startswith('Phrase')):
            line = line.strip()
            sentenceData.append(line.split('\t')[2:4])
    random.shuffle(sentenceData)
    sentenceList = sentenceData[:limit]
    print('Read', len(sentenceData), 'phrases, using', len(sentenceList), 'random phrases')
    phrasedocs = []
    phrasedocs_without = []
    for phrase in sentenceList:
        tokens = nltk.word_tokenize(phrase[0])
        phrasedocs_without.append((tokens, int(phrase[1])))
        tokenizer = RegexpTokenizer(r'\w+')
        phrase[0] = preProcessdata(phrase[0])
        tokens = tokenizer.tokenize(phrase[0])
        phrasedocs.append((tokens, int(phrase[1])))
    normaltokens = generateWordsnormal(phrasedocs_without)
    preprocessedTokens = generateWords(phrasedocs)
    word_features = bagofWords(normaltokens)
    featuresets_without_preprocessing = [(unigram_features(d, word_features), s) for (d, s) in phrasedocs_without]
    print(" ")
    print("Accuracy without pre-processing unigram features : ")
    generateAccuracy(featuresets_without_preprocessing)
    word_features = bagofWords(preprocessedTokens)
    featuresets = [(unigram_features(d, word_features), s) for (d, s) in phrasedocs]
    print(" ")
    print("Accuracy with pre-processed unigram features : ")
    generateAccuracy(featuresets)
    SL_featuresets = [(SL_features(d, word_features, SL), c) for (d, c) in phrasedocs]
    
    print("Accuracy with SL_featuresets : ")
    generateAccuracy(SL_featuresets)
    bigram_feature = bigram_finder(preprocessedTokens)
    bigram_featuresets = [(bigram_features(d, word_features, bigram_feature), c) for (d, c) in phrasedocs]
    print(" ")
    print("Accuracy with bigram featuresets : ")
    generateAccuracy(bigram_featuresets)
    
    '''
    features_combined = [(combined_document_features(d, word_features, SL_featuresets, bigram_featuresets), c) for
                         (d, c) in phrasedocs]
    print("Accuracy with combined featuresets : ")
    generateAccuracy(features_combined)
    '''

    
    


In [54]:
def generateMatrix(classifier_type, test_set):
    goldlist = []
    predictlist = []
    for (features, label) in test_set:
        goldlist.append(label)
        predictlist.append(classifier_type.classify(features))
    print("Confusion matrix")
    cm = ConfusionMatrix(goldlist, predictlist)
    print(cm)
    print("Confusion matrix in percentage")
    print(cm.pretty_format(sort_by_count=True,show_percents=True,truncate=9))
    print('Model Evaluation')
    eval_measures(goldlist,predictlist)
    print("")


In [55]:
def eval_measures(gold, predicted):
    # get a list of labels
    labels = list(set(gold))
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        recall = TP / (TP + FP)
        precision = TP / (TP + FN)
        recall_list.append(recall)
        precision_list.append(precision)
        F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

In [56]:
def create_test_submission(setFeatures, test_featuresets, fileName):
    print("Training and testing a classifier ")
    test_set = test_featuresets
    training_set = setFeatures
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    fw = open(fileName, "w")
    fw.write("PhraseId" + ',' + "Sentiment" + '\n')
    for test, id in test_featuresets:
        fw.write(str(id) + ',' + str(classifier.classify(test)) + '\n')
    fw.close()

In [38]:
def combined_document_features(document, word_features, bigram_features, SL):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in document_words:
        posword = 0
        neutword = 0
        negword = 0
        for word in document_words:
            if word in SL[0]:
                posword += 1
            if word in SL[1]:
                neutword += 1
            if word in SL[2]:
                negword += 1
            features['positivecount'] = posword
            features['neutralcount'] = neutword
            features['negativecount'] = negword
        for word in word_features:
            features['V_{}'.format(word)] = False
            features['V_NOT{}'.format(word)] = False
        for bigram in bigram_features:
            features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)
        return features

In [65]:
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('usage: classifyKaggle.py <corpus-dir> <limit>')
        sys.exit(0)
    processkaggle(sys.argv[1], sys.argv[2])

Read 156060 phrases, using 10000 random phrases
 
Accuracy without pre-processing unigram features : 
Accuracy : 
0.502
Confusion matrix
  |   0   1   2   3   4 |
--+---------------------+
0 | <13>  7  18   6   . |
1 |  20 <27>116  15   6 |
2 |  12  33<410> 36   8 |
3 |  15   8 137 <45> 10 |
4 |   7   1  34   9  <7>|
--+---------------------+
(row = reference; col = test)

Confusion matrix in percentage
  |      2      3      1      4      0 |
--+------------------------------------+
2 | <41.0%>  3.6%   3.3%   0.8%   1.2% |
3 |  13.7%  <4.5%>  0.8%   1.0%   1.5% |
1 |  11.6%   1.5%  <2.7%>  0.6%   2.0% |
4 |   3.4%   0.9%   0.1%  <0.7%>  0.7% |
0 |   1.8%   0.6%   0.7%      .  <1.3%>|
--+------------------------------------+
(row = reference; col = test)

Model Evaluation
	Precision	Recall		F1
0 	      0.295      0.194      0.234
1 	      0.147      0.355      0.208
2 	      0.822      0.573      0.675
3 	      0.209      0.405      0.276
4 	      0.121      0.226      0.157


 
Accura