In [1]:
# load in dependent libraries
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk.classify
from collections import defaultdict
import re
from nltk import *
import collections, itertools
from nltk.metrics import precision, recall
#from featx import bag_of_words, high_information_words

In [2]:
# load in the training data
train_df = pd.read_csv("data/HCR/train/orig/HCR_formatted_train.csv")
# retrieve the possible labels from the training data
categories = list(set(train_df['sentiment']))
categories

['neutral', 'negative', 'positive']

In [3]:
# mannaully coppied the featx bag_of_words and high_information_words module since featx package is not available for conda
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words



def bag_of_words(words):
    return dict([(word, True) for word in words])

REMINDER! : CONSIDER REMOVING STOPWORDS, STEMMING

In [4]:
def tokenize(tweet):
    tokens = word_tokenize(tweet)
    tokens = [token.lower() for token in tokens]
    # remove items that are not alphabetics
    for token in tokens:
        if token.isalpha() == False:
            tokens.remove(token)
    # remove punctuation and other tokens
    for token in tokens:
        if token in '!;@#$%^&*().,\/?~1234567890':
            tokens.remove(token)
    
    # remove urls
    regex = re.compile(r'^[/]+')
    for token in tokens:
        if regex.match(token):
            tokens.remove(token)
    
    bag = bag_of_words(tokens)
    return bag
    

In [5]:
# create list of training features
train_feats = list ()
for i, row in train_df.iterrows():
    if row[2] in categories:
        bag_cat = row[2]
    tweet_text = row[1]
    # tokenize tweets and put tokens in bag of words
    bag = tokenize(tweet_text)
    train_feats.append((bag, bag_cat))


In [6]:
# obtain the high information words
def high_information(train_feats, categories):
    labelled_words = [(category, []) for category in categories]

    # convert the formatting of our features to that required by high_information_words
    words = defaultdict(list)
    all_words = list()
    for category in categories:
        words[category] = list()

    for feat in train_feats:
        category = feat[1]
        bag = feat[0]
        for w in bag.keys():
            words[category].append(w)
            all_words.append(w)
            #break

    labelled_words = [(category, words[category]) for category in categories]
    
    # calculate high information words
    # note: to adjust min occurence score add min_score = n
    high_info_words = set(high_information_words(labelled_words,min_score = 2))
    #print(high_info_words)
    #high_info_words contains a list of high-information words. You may want to use only these for classification.
    # You can restrict the words in a bag of words to be in a given 2nd list (e.g. in function read_files)
    # e.g. bag_of_words_in_set(words, high_info_words)
    
    print("  Number of words in the data: %i" % len(all_words))
    print("  Number of distinct words in the data: %i" % len(set(all_words)))
    print("  Number of distinct 'high-information' words in the data: %i" % len(high_info_words))

    return high_info_words


In [7]:
high_info_words = high_information(train_feats, categories)

  Number of words in the data: 13428
  Number of distinct words in the data: 3110
  Number of distinct 'high-information' words in the data: 1392


In [8]:
# trains a classifier
def train(train_feats):
    classifier = nltk.classify.NaiveBayesClassifier.train(train_feats)
    return classifier
    # the following code uses the classifier with add-1 smoothing (Laplace)
    # You may choose to use that instead
    #from nltk.probability import LaplaceProbDist
    #classifier = nltk.classify.NaiveBayesClassifier.train(train_feats, estimator=LaplaceProbDist)


In [9]:
train_feats[0]

({'rt': True,
  'angelsmomaw': True,
  'hcr': True,
  'is': True,
  'unwanted': True,
  'because': True,
  'it': True,
  'will': True,
  'bankrupt': True,
  'the': True,
  'usa': True,
  'and': True,
  'give': True,
  'below': True,
  'inferior': True,
  'healthcare': True,
  'for': True,
  'all': True,
  'gop': True,
  'tcot': True,
  'tweetcongress': True},
 'negative')

In [10]:
classifier = train(train_feats)

In [11]:
classifier

<nltk.classify.naivebayes.NaiveBayesClassifier at 0x1cef5d1d160>

In [13]:
# load in the development data
dev_df = pd.read_csv("data/HCR/dev/orig/HCR_formatted_dev.csv")
# retrieve the possible labels from the training data
categories = list(set(dev_df['sentiment']))
categories

['neutral', 'negative', 'positive']

In [15]:
# create list of development features
dev_feats = list ()
for i, row in dev_df.iterrows():
    if row[2] in categories:
        bag_cat = row[2]
    tweet_text = row[1]
    # tokenize tweets and put tokens in bag of words
    bag = tokenize(tweet_text)
    dev_feats.append((bag, bag_cat))


In [22]:
def evaluation(classifier, test_feats, categories):
    print ("\n##### Evaluation...")
    print("  Accuracy: %f" % nltk.classify.accuracy(classifier, test_feats))
    precisions, recalls = precision_recall(classifier, test_feats)
    print(precisions, recalls)
    f_measures = calculate_f(precisions, recalls)  

    print(" |-----------|-----------|-----------|-----------------|")
    print(" |%-11s|%-11s|%-11s|%-11s|" % ("category","precision","recall","F-measure"))
    print(" |-----------|-----------|-----------|-----------------|")
    for category in categories:
        if precisions[category] is None:
            print(" |%-11s|%-11s|%-11s|%-11s|" % (category, "NA", "NA", "NA"))
        else:
            print(" |%-11s|%-11f|%-11f|%-11s|" % (category, precisions[category], recalls[category], f_measures[category]))
    print(" |-----------|-----------|-----------|------------------|")

In [23]:
from classification import precision_recall

In [24]:
def calculate_f(precisions, recalls):
    f_measures = {}
    #TODO calculate the f measure for each category using as input the precisions and recalls
    for precision, recall in zip(precisions.items(), recalls.items()):
        f_measures[precision[0]] = ((2*precision[1])*recall[1])/ (precision[1] + recall[1])
    return f_measures

In [25]:
def precision_recall(classifier, testfeats):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    
    precisions = {}
    recalls = {}
    
    for label in classifier.labels():
        precisions[label] = precision(refsets[label], testsets[label])
        recalls[label] = recall(refsets[label], testsets[label])
    
    return precisions, recalls

In [26]:
evaluation(classifier, dev_feats, categories)


##### Evaluation...
  Accuracy: 1.000000
{'negative': 1.0, 'neutral': None, 'positive': None} {'negative': 1.0, 'neutral': None, 'positive': None}


TypeError: unsupported operand type(s) for *: 'int' and 'NoneType'