In [1]:
# open python and nltk packages needed for processing
# while the semeval tweet task b data has tags for "positive", "negative", 
#  "objective", "neutral", "objective-OR-neutral", we will combine the last 3 into "neutral"
import os
import sys
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.collocations import *


# function to read tweet training file, train and test a classifier 
def processtweets():
  # convert the limit argument from a string to an int
  # initialize NLTK built-in tweet tokenizer
  twtokenizer = TweetTokenizer()
  
  f = open('C:/Users/jiebi/Desktop/Grad/Summer 2021/IST 664/Final/corpus/downloaded-tweeti-b-dist.tsv', 'r')
  # loop over lines in the file and use the first limit of them
  #    assuming that the tweets are sufficiently randomized
  tweetdata = []
  for line in f:
      line = str(line).replace('@', '')
      line = str(line).replace('#', '')
      line = line.strip()

      # each line has 4 items separated by tabs
      # ignore the tweet and user ids, and keep the sentiment and tweet text
      tweetdata.append(line.split('\t')[2:4])
  # create list of tweet documents as (list of words, label)
  # where the labels are condensed to just 3:  'pos', 'neg', 'neu'
  tweetdocs = []
  # add all the tweets except the ones whose text is Not Available
  for tweet in tweetdata:
    if (tweet[1] != 'Not Available'):
      # run the tweet tokenizer on the text string - returns unicode tokens, so convert to utf8
      tokens = twtokenizer.tokenize(tweet[1])

      if tweet[0] == '"positive"':
        label = 'pos'
      else:
        if tweet[0] == '"negative"':
          label = 'neg'
        else:
          if (tweet[0] == '"neutral"') or (tweet[0] == '"objective"') or (tweet[0] == '"objective-OR-neutral"'):
            label = 'neu'
          else:
            label = ''
      tweetdocs.append((tokens, label))

  return tweetdocs


In [2]:
all_tweets_doc = processtweets()

In [3]:
def eval_measures(gold, predicted, labels):
    
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []

    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        # for small numbers, guard against dividing by zero in computing measures
        if (TP == 0) or (FP == 0) or (FN == 0):
          recall_list.append (0)
          precision_list.append (0)
          F1_list.append(0)
        else:
          recall = TP / (TP + FP)
          precision = TP / (TP + FN)
          recall_list.append(recall)
          precision_list.append(precision)
          F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    return (precision_list, recall_list, F1_list)


In [4]:
def cross_validation_PRF(num_folds, featuresets, labels):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    # for the number of labels - start the totals lists with zeroes
    num_labels = len(labels)
    total_precision_list = [0] * num_labels
    total_recall_list = [0] * num_labels
    total_F1_list = [0] * num_labels

    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        goldlist = []
        predictedlist = []
        for (features, label) in test_this_round:
            goldlist.append(label)
            predictedlist.append(classifier.classify(features))
        
        (precision_list, recall_list, F1_list) \
                  = eval_measures(goldlist, predictedlist, labels)

        # for each label add to the sums in the total lists
        for i in range(num_labels):
            # for each label, add the 3 measures to the 3 lists of totals
            total_precision_list[i] += precision_list[i]
            total_recall_list[i] += recall_list[i]
            total_F1_list[i] += F1_list[i]

    precision_list = [tot/num_folds for tot in total_precision_list]
    recall_list = [tot/num_folds for tot in total_recall_list]
    F1_list = [tot/num_folds for tot in total_F1_list]
    # the evaluation measures in a table with one row per label
    print('\nAverage Precision\tRecall\t\tF1 \tPer Label')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))
    
    # print macro average over all labels - treats each label equally
    print('\nMacro Average Precision\tRecall\t\tF1 \tOver All Labels')

    p_dummy = 0
    r_dummy = 0
    f_dummy = 0
    
    for x in precision_list:
        p_dummy = p_dummy + x
    
    for x in recall_list:
        r_dummy = r_dummy + x
    
    for x in F1_list:
        f_dummy = f_dummy + x
    
    print('\t', "{:10.3f}".format(p_dummy/num_labels), \
          "{:10.3f}".format(r_dummy/num_labels), \
          "{:10.3f}".format(f_dummy/num_labels))

    # for micro averaging, weight the scores for each label by the number of items
    #    this is better for labels with imbalance
    # first intialize a dictionary for label counts and then count them
    label_counts = {}
    for lab in labels:
      label_counts[lab] = 0 
    # count the labels
    for (doc, lab) in featuresets:
      label_counts[lab] += 1
    # make weights compared to the number of documents in featuresets
    num_docs = len(featuresets)
    label_weights = [(label_counts[lab] / num_docs) for lab in labels]
    print('\nLabel Counts', label_counts)
    #print('Label weights', label_weights)
    # print macro average over all labels
    print('Micro Average Precision\tRecall\t\tF1 \tOver All Labels')
    
    precision = 0
    recall = 0
    F1 = 0
    for a,b in zip(precision_list, label_weights):
        precision = precision + a*b
        
    for a,b in zip(recall_list, label_weights):
        recall = recall + a*b
        
    for a,b in zip(F1_list, label_weights):
        F1 = F1 + a*b
    

    print( '\t', "{:10.3f}".format(precision), \
      "{:10.3f}".format(recall), "{:10.3f}".format(F1))

In [5]:
def eval_measures(gold, predicted, labels):
    
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []

    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        # for small numbers, guard against dividing by zero in computing measures
        if (TP == 0) or (FP == 0) or (FN == 0):
          recall_list.append (0)
          precision_list.append (0)
          F1_list.append(0)
        else:
          recall = TP / (TP + FP)
          precision = TP / (TP + FN)
          recall_list.append(recall)
          precision_list.append(precision)
          F1_list.append( 2 * (recall * precision) / (recall + precision))
    # the evaluation measures in a table with one row per label
    return (precision_list, recall_list, F1_list)

In [6]:
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [7]:
all_words = [word for (sentance,category) in all_tweets_doc for word in sentance]
top_words = nltk.FreqDist(all_words)
most_common_words = top_words.most_common(2000)
word_features = [word for (word,count) in most_common_words]

In [8]:
featuresets_uni = [(document_features(d, word_features), c) for (d, c) in all_tweets_doc]

In [9]:
#Defining the POS features function
def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [10]:
POSfeaturesets = [(POS_features(d, word_features), c) for (d, c) in all_tweets_doc]

In [11]:
label_list = [c for (d,c) in all_tweets_doc]
labels = list(set(label_list))    # gets only unique labels
num_folds = 5

In [12]:
print("Original featureset")
cross_validation_PRF(num_folds, featuresets_uni, labels)

Original featureset
Each fold size: 1641

Average Precision	Recall		F1 	Per Label
pos 	      0.636      0.680      0.654
neg 	      0.523      0.396      0.447
neu 	      0.675      0.706      0.686

Macro Average Precision	Recall		F1 	Over All Labels
	      0.611      0.594      0.596

Label Counts {'pos': 3059, 'neg': 1207, 'neu': 3942}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.638      0.651      0.639


In [13]:
print("POS featureset")
cross_validation_PRF(num_folds, POSfeaturesets, labels)

POS featureset
Each fold size: 1641

Average Precision	Recall		F1 	Per Label
pos 	      0.615      0.679      0.643
neg 	      0.541      0.379      0.442
neu 	      0.669      0.707      0.683

Macro Average Precision	Recall		F1 	Over All Labels
	      0.608      0.589      0.589

Label Counts {'pos': 3059, 'neg': 1207, 'neu': 3942}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.630      0.649      0.633


In [14]:
new_tweets_doc = []
for x,y in all_tweets_doc:
    new_tweets_doc.append([x,y])

for i,x in new_tweets_doc:
    for tokens in range(len(i)):
        i[tokens] = i[tokens].lower()

In [15]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend([line.strip() for line in open('C:/Users/jiebi/Desktop/Grad/Summer 2021/IST 664/Final/stopwords_twitter.txt')])

newstopwords = [word for word in stopwords]

In [16]:
new_all_words_list = [word for (sent,cat) in new_tweets_doc for word in sent if word not in newstopwords]
new_all_words = nltk.FreqDist(new_all_words_list)
new_word_items = new_all_words.most_common(2000)
new_word_features = [word for (word,count) in new_word_items]

In [17]:
new_POSfeaturesets = [(document_features(d, new_word_features), c) for (d, c) in new_tweets_doc]

In [18]:
print("NEW POS featureset")
cross_validation_PRF(num_folds, new_POSfeaturesets, labels)

NEW POS featureset
Each fold size: 1641

Average Precision	Recall		F1 	Per Label
pos 	      0.644      0.668      0.652
neg 	      0.448      0.350      0.390
neu 	      0.654      0.689      0.666

Macro Average Precision	Recall		F1 	Over All Labels
	      0.582      0.569      0.569

Label Counts {'pos': 3059, 'neg': 1207, 'neu': 3942}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.620      0.631      0.620
