In [1]:
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

import pandas as pd
from sklearn.utils import resample



In [2]:
# Input dataset; it is read further down as line-delimited JSON.
DATASET_PATH = "../datasets/CHUNK_RC_2018-02-28"
# File the downsampled classifier is written to when it is trained with saving enabled.
SAVE_CLASSIFIER = "../datasets/nltk_sentiment_NBClassifier"

In [3]:
# Read the whole line-delimited JSON file in one pass. The earlier form created
# a chunked reader (with a float chunksize) only to drain it immediately with
# .read(), so chunking bought nothing.
raw_df = pd.read_json(DATASET_PATH, lines=True)

# Drop rows whose body is exactly '[deleted]' or '[removed]'. Deriving `df`
# from `raw_df` (instead of overwriting it) keeps this cell safe to re-run.
df = raw_df[~raw_df.body.isin(['[deleted]', '[removed]'])]

In [4]:
# Build one (tokens, label) pair per row of df. The tokens are the
# word-tokenized body followed by the score rendered as a string; the label is
# the row's controversiality value.
all_comments = []
no_contro = []
contro = []
for row in df.itertuples():
    tokens = tokenize.word_tokenize(row.body)
    tokens.append(str(row.score))
    labelled = (tokens, row.controversiality)
    all_comments.append(labelled)
    # Bucket the pair by the truthiness of its label.
    if labelled[1]:
        contro.append(labelled)
    else:
        no_contro.append(labelled)

print(len(no_contro), len(contro))

9229 210


In [5]:
# split_train_test shuffles the list it receives in place. Hand it a shallow
# copy so `all_comments` keeps its original order and re-running this cell
# reproduces the same split instead of reshuffling an already-shuffled list.
training, testing = split_train_test(list(all_comments))

In [6]:
# Analyzer used for the run on the full training split.
sentim_analyzer = SentimentAnalyzer()

# Gather the word list from the negation-marked training documents.
negated_training = list(map(mark_negation, training))
all_words_neg = sentim_analyzer.all_words(negated_training)

In [7]:
# Frequency cutoff passed to unigram_word_feats as min_freq.
MIN_UNIGRAM_FREQ = 4

# Select the unigram vocabulary and register the unigram extractor with it.
unigram_feats = sentim_analyzer.unigram_word_feats(
    all_words_neg,
    min_freq=MIN_UNIGRAM_FREQ,
)
sentim_analyzer.add_feat_extractor(
    extract_unigram_feats,
    unigrams=unigram_feats,
)

In [8]:
def vader_feats(document, analyzer):
    """Return the analyzer's polarity scores for a document as named features.

    document: list of tokens whose final element is the score token; that
        element is left out of the text handed to the analyzer.
    analyzer: object providing ``polarity_scores(text)`` that returns a mapping.

    Returns a dict with one entry per polarity score, each key suffixed
    with ``_VAD``.
    """
    # Re-join every token except the trailing score into a single string.
    text = ' '.join(document[:-1])
    features = {}
    for name, value in analyzer.polarity_scores(text).items():
        features[name + '_VAD'] = value
    return features

# Register vader_feats on the analyzer, bound to its own VADER instance.
vader_analyzer = SentimentIntensityAnalyzer()
sentim_analyzer.add_feat_extractor(vader_feats, analyzer=vader_analyzer)

In [9]:
def score_feat(document):
    """Expose the trailing element of the document as the ``_score`` feature.

    document: list of tokens whose final element is the score token.

    Returns a single-entry dict mapping ``'_score'`` to that final element.
    """
    score_token = document[-1]
    return dict(_score=score_token)

# Register score_feat as an additional feature extractor on the analyzer;
# it takes no extra keyword arguments.
sentim_analyzer.add_feat_extractor(score_feat)

In [10]:
# Run the analyzer's registered extractors over both splits, in the same
# order as before: training first, then testing.
training_set, test_set = [
    sentim_analyzer.apply_features(split) for split in (training, testing)
]

In [11]:
# Train a Naive Bayes classifier through the analyzer, then print the
# evaluation metrics for the held-out feature set.
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer=trainer, training_set=training_set)
evaluate = sentim_analyzer.evaluate(test_set=test_set, verbose=True)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.9088983050847458
F-measure [0]: 0.9522222222222222
F-measure [1]: 0.022727272727272728
Precision [0]: 0.9788692175899486
Precision [1]: 0.014598540145985401
Recall [0]: 0.9269875608436993
Recall [1]: 0.05128205128205128


In [12]:
# Downsampling to fix the class imbalance: draw, without replacement, as many
# non-controversial comments as there are controversial ones.
# (`resample` is imported with the other dependencies in the first cell.)
DOWNSAMPLE_SEED = 123456
no_contro_down = resample(
    no_contro,
    replace=False,
    n_samples=len(contro),
    random_state=DOWNSAMPLE_SEED,
)

# NOTE(review): the split happens after downsampling, so the held-out set is
# balanced as well; its metrics are not directly comparable with the run on
# the full data.
train_down, test_down = split_train_test(no_contro_down + contro)

# Same feature pipeline as the full-data run, on a separate analyzer:
# unigrams from the negation-marked training documents, VADER polarity
# scores, and the trailing score token.
sa_down = SentimentAnalyzer()
allneg_down = sa_down.all_words([mark_negation(doc) for doc in train_down])
unifeats_down = sa_down.unigram_word_feats(allneg_down, min_freq=4)
sa_down.add_feat_extractor(extract_unigram_feats, unigrams=unifeats_down)
sa_down.add_feat_extractor(vader_feats, analyzer=SentimentIntensityAnalyzer())
sa_down.add_feat_extractor(score_feat)

trainset_down = sa_down.apply_features(train_down)
testset_down = sa_down.apply_features(test_down)

# Train with the shared `trainer`, save the classifier to SAVE_CLASSIFIER,
# then print the evaluation metrics.
classifier_down = sa_down.train(trainer, trainset_down, save_classifier=SAVE_CLASSIFIER)
evaluate_down = sa_down.evaluate(testset_down, verbose=True)

Training classifier
Saving ../datasets/nltk_sentiment_NBClassifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.6904761904761905
F-measure [0]: 0.74
F-measure [1]: 0.6176470588235294
Precision [0]: 0.6981132075471698
Precision [1]: 0.6774193548387096
Recall [0]: 0.7872340425531915
Recall [1]: 0.5675675675675675
