In [1]:
# NLTK Pre-trained sentiment analyzer
# Built in pretrained analyzer - VADER (Valence Aware Dictionary and sEntiment Reasoner)
# VADER best used for social media and short sentences
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# polarity analyzes the pos/neg words in a text
sia.polarity_scores("NLTK is the fkn shit!")

{'neg': 0.493, 'neu': 0.507, 'pos': 0.0, 'compound': -0.5983}

In [9]:
# twitter samples 
import nltk

# get list of raw tweets with strings
tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings()]
print(len(tweets))
#tweets[:10]

# popularity scores for the tweets
from random import shuffle

def is_positive(tweet: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise"""
    return sia.polarity_scores(tweet)["compound"] > 0

print("\n")
shuffle(tweets)
for tweet in tweets[:10]:
    print(">", is_positive(tweet), tweet)


30000


> True RT @Markfergusonuk: David Cameron says he's hungrier than he was five years ago. So are all of the people reliant on food banks...
> False RT @thomasmessenger: For all Tories claiming that Labour overspent and thus caused a global financial crisis, ahem... http//t.co/DkLwCwzhDA
> True RT @AlanRoden: Scenario: Lab largest party, minority Govt. SNP opposes policy, Lab won't budge. Cons vote against, but Lab has more MPs. Wh…
> False SNP leader faces audience questions: Nicola Sturgeon is grilled about the SNP's role at Westminster by a live ... http//t.co/WbnstcNnLd
> False RT @NPickavance: FT backs Tories! Who'd have guessed that FT leader writer Jonathan Ford was photod posing nxt to Boris in Uni club? http:/…
> True @derekrootboy @ae_parry So the SNP are going to vote with the Tories against Labour? Interesting.
> True Hi BAM ! @BarsAndMelody 
Can you follow my bestfriend @969Horan696 ? 
She loves you a lot :) 
See you in Warsaw &lt;3 
Love you &lt;3 x4
> True RT @gavth

In [12]:
# positive/negativie movie reviews - already been classified using VADER
positive_review_ids = nltk.corpus.movie_reviews.fileids(categories=["pos"])
negative_review_ids = nltk.corpus.movie_reviews.fileids(categories=["neg"])
all_review_ids = positive_review_ids + negative_review_ids

# set up VADER to rate individual sentences rather than full reviews
# VADER needs raw strings for its rating => you can't use words()
# make a list of file IDs that corpus uses to reference individual reviews

In [21]:
# redefine is positive to work on an entire review. Obtain specific review
# using its file ID and then split it into sentences before rating:
from statistics import mean

def is_positive(review_id: str) -> bool:
    """
    True if the average of all sentence compound scores is positive
    """
    
    # first get the raw text for the entire movie
    text = nltk.corpus.movie_reviews.raw(review_id)
    scores = [
        (sia.polarity_scores(sentence)["neg"] and sia.polarity_scores(sentence)["compound"])
        # loop through sentences from tokenization
        for sentence in nltk.sent_tokenize(text)
    ]
    return mean(scores) > 0

# shuffle the review ids and find if they are positive
shuffle(all_review_ids)
correct = 0
for review_id in all_review_ids:
    if is_positive(review_id):
        if review_id in positive_review_ids:
            correct += 1
    else:
        if review_id in negative_review_ids:
            correct += 1
            
print(F"{correct / len(all_review_ids):.2%} correct")
    

56.95% correct


In [38]:
# Customizing NLTK's Sentiment Analysis
# TRICK - figure out which properties of your dataset are 
# useful in classifying each piece of data into desired categories

# ML - these properties are known as features, which you select.

# Selecting useful features - by using predefined categ
# in movie_reviews corpus you can create positive/negative words.

# Determine which ones occur most frequently across each set.
# Begin excluding unwanted words and building initial category groups
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

# corpus names/words
corpus_words = nltk.corpus.names.words()

# skip unwanted words that don't have letters and are not in the unwated list
def skip_unwanted(pos_tuple):
    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag.startswith("NN"):
        return False
    return True

# get the POS (Part of Speech) tags from the moview reviews
pos_reviews = nltk.corpus.movie_reviews.words(categories=["pos"])
neg_reviews = nltk.corpus.movie_reviews.words(categories=["neg"])
pos_tags = nltk.pos_tag(pos_reviews)
neg_tags = nltk.pos_tag(neg_reviews)

# filter out the positive tuples using the skip unwanted func
pos_words = [word for word, tag in filter(skip_unwanted, pos_tags)]
neg_words = [word for word, tag in filter(skip_unwanted, neg_tags)]
print(pos_words[123])
print(neg_words[123])

completely
pretty


In [52]:
# create frequency distributions for custom feature
# begin by finding the the common set of words to remove from distribution
positive_fd = nltk.FreqDist(pos_words)
negative_fd = nltk.FreqDist(neg_words)
print("positive fd len before = " + str(len(positive_fd)))

# find the intersection between pos and neg words
common_set = set(positive_fd).intersection(negative_fd)
print(len(common_set))
print(list(common_set)[0])

# delete the common words in the lists
for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

# unique pos/neg words in each freq dist, the amount of words in each set
# can be tweaked in order to determine effect on sentiment analysis
top_100_positive = {word for word, count in positive_fd.most_common(100)}
top_100_negative = {word for word, count in negative_fd.most_common(100)}

positive fd len before = 15449
9511
cheaply


In [None]:
# example of a feature to extract from data, words that aren't neg/pos
# bigram finders (Eg "thumbs up!")
