In [3]:
# NLTK Pre-trained sentiment analyzer
# Built in pretrained analyzer - VADER (Valence Aware Dictionary and sEntiment Reasoner)
# VADER best used for social media and short sentences
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# polarity analyzes the pos/neg words in a text
sia.polarity_scores("NLTK is the fkn shit!")

{'neg': 0.493, 'neu': 0.507, 'pos': 0.0, 'compound': -0.5983}

In [4]:
# twitter samples 
import nltk

# get list of raw tweets with strings
tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings()]
print(len(tweets))
#tweets[:10]

# popularity scores for the tweets
from random import shuffle

def is_positive(tweet: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise"""
    return sia.polarity_scores(tweet)["compound"] > 0

print("\n")
shuffle(tweets)
for tweet in tweets[:10]:
    print(">", is_positive(tweet), tweet)


30000


> True @CFindlaySnp15 if labour propose a queens speech it's up to the SNP to support it or not
> True @GzaDaRambler Gotcha :)
> True @Mburu__ Inter 3 UCL, Arsenal... Small team, Right! :)
> True RT @Tommy_Colc: Financial Times come out in support of Tories claiming Miliband is "preoccupied w/ inequality". The man who wrote it http:/…
> True Turned the debate off after Ed Miliband because I don't give a fuck what Nick Clegg has to say #bbcqt
> False RT @thomasmessenger: For all Tories claiming that Labour overspent and thus caused a global financial crisis, ahem... http//t.co/DkLwCwzhDA
> False @Kitchmo the SNP always get annoyed when questioned: they live in a parallel universe. Anyway don't tell me you don't want t to vote Blair
> False @KrystalHosting Was just about to push a client your way for some hosting. Maybe I had better wait till next week :(
> False Ed Miliband's ignorant refusal to talk about post-election deals is playing a very dangerous game, putting himself bef

In [5]:
# positive/negativie movie reviews - already been classified using VADER
positive_review_ids = nltk.corpus.movie_reviews.fileids(categories=["pos"])
negative_review_ids = nltk.corpus.movie_reviews.fileids(categories=["neg"])
all_review_ids = positive_review_ids + negative_review_ids
print(positive_review_ids[0])

# set up VADER to rate individual sentences rather than full reviews
# VADER needs raw strings for its rating => you can't use words()
# make a list of file IDs that corpus uses to reference individual reviews

pos/cv000_29590.txt


In [6]:
# redefine is positive to work on an entire review. Obtain specific review
# using its file ID and then split it into sentences before rating:
from statistics import mean

def is_positive(review_id: str) -> bool:
    """
    True if the average of all sentence compound scores is positive
    """
    
    # first get the raw text for the entire movie
    text = nltk.corpus.movie_reviews.raw(review_id)
    scores = [
        (sia.polarity_scores(sentence)["neg"] and sia.polarity_scores(sentence)["compound"])
        # loop through sentences from tokenization
        for sentence in nltk.sent_tokenize(text)
    ]
    return mean(scores) > 0

# shuffle the review ids and find if they are positive
shuffle(all_review_ids)
correct = 0
for review_id in all_review_ids:
    if is_positive(review_id):
        if review_id in positive_review_ids:
            correct += 1
    else:
        if review_id in negative_review_ids:
            correct += 1
            
print(F"{correct / len(all_review_ids):.2%} correct")
    

56.95% correct


In [7]:
# Customizing NLTK's Sentiment Analysis
# TRICK - figure out which properties of your dataset are 
# useful in classifying each piece of data into desired categories

# ML - these properties are known as features, which you select.

# Selecting useful features - by using predefined categ
# in movie_reviews corpus you can create positive/negative words.

# Determine which ones occur most frequently across each set.
# Begin excluding unwanted words and building initial category groups
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

# corpus names/words
corpus_words = nltk.corpus.names.words()

# skip unwanted words that don't have letters and are not in the unwated list
def skip_unwanted(pos_tuple):
    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag.startswith("NN"):
        return False
    return True

# get the POS (Part of Speech) tags from the moview reviews
pos_reviews = nltk.corpus.movie_reviews.words(categories=["pos"])
neg_reviews = nltk.corpus.movie_reviews.words(categories=["neg"])
pos_tags = nltk.pos_tag(pos_reviews)
neg_tags = nltk.pos_tag(neg_reviews)

# filter out the positive tuples using the skip unwanted func
pos_words = [word for word, tag in filter(skip_unwanted, pos_tags)]
neg_words = [word for word, tag in filter(skip_unwanted, neg_tags)]
print(pos_words[123])
print(neg_words[123])

completely
pretty


In [8]:
# create frequency distributions for custom feature
# begin by finding the the common set of words to remove from distribution
positive_fd = nltk.FreqDist(pos_words)
negative_fd = nltk.FreqDist(neg_words)
print("positive fd len before = " + str(len(positive_fd)))

# find the intersection between pos and neg words
common_set = set(positive_fd).intersection(negative_fd)
print(len(common_set))
print(list(common_set)[0])

# delete the common words in the lists
for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

# unique pos/neg words in each freq dist, the amount of words in each set
# can be tweaked in order to determine effect on sentiment analysis
top_100_positive = {word for word, count in positive_fd.most_common(100)}
top_100_negative = {word for word, count in negative_fd.most_common(100)}

positive fd len before = 15449
9511
immaculate


In [9]:
# example of a feature to extract from data, words that aren't neg/pos
# leverage collocations that carry positive meaning bigram finders (Eg "thumbs up!")
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

positive_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["pos"])
    if w.isalpha() and w not in unwanted
])

negative_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["neg"])
    if w.isalpha() and w not in unwanted
])

print(positive_bigram_finder)

<nltk.collocations.BigramCollocationFinder object at 0x7f46c2196e50>


In [10]:
# training and using a classifier - define a func to extract features from data
# focus on features with positivity including VADER scores
def extract_features(text):
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()
    
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_100_positive:
                wordcount += 1
        compound_scores.append(sia.polarity_scores(sentence)["compound"])
        positive_scores.append(sia.polarity_scores(sentence)["pos"])
        
    # Adding 1 to the final compound score to always have pos numbers
    # since some classifiers you'll use later don't work with neg numbers
    features["mean_compound"] = mean(compound_scores) + 1
    features["mean_positive"] = mean(positive_scores)
    features["wordcount"] = wordcount
    
    return features

In [22]:
# extract_features returns a dictionary - 3 features for each text
"""
1. The average compound score
2. The average postive score
3. Amount of words in the text that are part of top 100 positive reviews
"""

# to train/evaluate a classifier, you'll need to build a list 
# of features for each text you'll analyze
features = [
    # create tuple of the dictionary and review type
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "pos")
    for review in nltk.corpus.movie_reviews.fileids(categories=["pos"])
]
features.extend([
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=["neg"])
])

print("Features type 0:")
print(features[10])

Features type 0:
({'mean_compound': 1.0785804347826087, 'mean_positive': 0.09447826086956522, 'wordcount': 2}, 'pos')


In [39]:
# training classifier means splitting into train/eval
# use 1/4 of the set for training
# train_count = len(features) // 4
from sklearn import model_selection
train_set, test_set = model_selection.train_test_split(features, test_size = 0.25)
print("Train set size = %d" % len(train_set))
print("Test set size = %d" % len(test_set))

shuffle(train_set)
# classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(10)
nltk.classify.accuracy(classifier, test_set)

# try classifying a piece of the test set to see if it works
test_tuple = test_set[0]
test_set = test_tuple[0]
print("\n")
print(test_tuple)
classifier.classify(test_tuple[0])

Train set size = 1500
Test set size = 500
Most Informative Features
               wordcount = 3                 pos : neg    =     13.8 : 1.0
               wordcount = 5                 pos : neg    =     11.0 : 1.0
               wordcount = 2                 pos : neg    =      3.8 : 1.0
               wordcount = 4                 pos : neg    =      1.8 : 1.0
               wordcount = 0                 neg : pos    =      1.7 : 1.0
               wordcount = 1                 pos : neg    =      1.6 : 1.0
           mean_positive = 0.07608333333333334    neg : pos    =      1.0 : 1.0
           mean_positive = 0.08418181818181818    neg : pos    =      1.0 : 1.0
           mean_positive = 0.08611764705882353    neg : pos    =      1.0 : 1.0
           mean_positive = 0.10004761904761905    neg : pos    =      1.0 : 1.0


({'mean_compound': 0.7512529411764706, 'mean_positive': 0.06364705882352942, 'wordcount': 0}, 'neg')


'neg'

In [42]:
# create a new test features set using a random field in the movie reviews
# print "Naive Bayes Accuracy " + str(nltk.classify.accuracy(classifier, test_set)*100)
testFieldId = nltk.corpus.movie_reviews.fileids(categories=["pos"])[599]
testRaw = nltk.corpus.movie_reviews.raw(testFieldId)
testRawFeatures = extract_features(testRaw)
# classifier.classify(testRawTuple)
print(testRawFeatures)
classifier.classify(testRawFeatures)

{'mean_compound': 1.0431027027027027, 'mean_positive': 0.12127027027027026, 'wordcount': 1}


'pos'