In [None]:
import urllib.request
# Project Gutenberg plain-text file that holds the sonnets.
sonnetsURL = "http://www.gutenberg.org/files/1041/1041.txt"
# Open the response in a `with` block so the connection is closed as soon as
# the body has been read, instead of waiting for garbage collection.
with urllib.request.urlopen(sonnetsURL) as response:
    sonnetString = response.read().decode()

In [None]:
# Number of characters in the downloaded text (sanity check on the download).
len(sonnetString)

In [None]:
# Keep only the text between the heading of the first sonnet ("  I" followed
# by a CRLF line break) and the Project Gutenberg closing notice.
start = sonnetString.find("  I\r\n")
end = sonnetString.find("End of Project Gutenberg")
# str.find returns -1 when the marker is missing; using -1 as a slice bound
# would silently keep or drop the wrong text, so stop with a clear error.
if start == -1:
    raise ValueError("Could not find the heading of the first sonnet in the downloaded text")
if end == -1:
    raise ValueError("Could not find the Project Gutenberg end marker in the downloaded text")
filteredSonnetString = sonnetString[start:end].rstrip()
print(filteredSonnetString)

In [None]:
import re
# A heading is two spaces, a run of capital letters (presumably the Roman
# numeral of the sonnet) and two CRLF line breaks; split the text on it.
sonnetsList = re.compile("  [A-Z]+\r\n\r\n").split(filteredSonnetString)
# Show the entry at index 1 as a quick check of the split.
print(sonnetsList[1])

In [None]:
# Spot-check another entry of the split list.
print(sonnetsList[18])

In [None]:
import os
# Directory (relative to the working directory) that receives one file per sonnet.
sonnetsPath = 'sonnets'
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(sonnetsPath, exist_ok=True)

In [None]:
# zfill(3) left-pads the string with zeros to a width of three characters,
# which is how the sonnet file names are built below.
print(str(1).zfill(3))
print(str(150).zfill(3))

In [None]:
# Write every non-empty entry of sonnetsList to its own zero-padded file
# (e.g. index 1 -> "001.txt") inside sonnetsPath.
for index, sonnet in enumerate(sonnetsList):
    sonnetText = sonnet.strip()
    if sonnetText:
        filename = str(index).zfill(3)+".txt"
        pathname = os.path.join(sonnetsPath, filename)
        # `with` closes the file even if the write fails; the explicit
        # encoding keeps the files identical across platforms instead of
        # depending on the locale default.
        with open(pathname, "w", encoding="utf-8") as f:
            f.write(sonnetText)
        

In [None]:
from nltk.corpus import PlaintextCorpusReader
# Repeated here so the notebook can be resumed from this cell.
sonnetsPath = 'sonnets'

# Build a corpus from the files in sonnetsPath. The dot is escaped so that
# only names with a literal ".txt" extension are selected (the unescaped
# '.*txt' would also accept any name that merely ends in "txt").
sonnetsCorpus = PlaintextCorpusReader(sonnetsPath, r'.*\.txt')


In [None]:
# Number of files the corpus reader picked up.
len(sonnetsCorpus.fileids())

In [None]:
def corpus_summary(corpus):
    """Print a short size summary of ``corpus``.

    Reports the number of files, the number of tokens, the number of words
    (tokens whose first character is alphabetic) and the number of distinct
    word types (case-sensitive).

    Args:
        corpus: an object exposing ``fileids()`` and ``words()``, such as an
            NLTK corpus reader.

    Returns:
        None; the summary is written to standard output.
    """
    print("This corpus has")
    # Use the corpus passed in, not a notebook-level global, so the function
    # reports on whatever corpus the caller supplies.
    print("  ", '{:,}'.format(len(corpus.fileids())), "files")
    tokens = corpus.words()
    print("  ", '{:,}'.format(len(tokens)), "tokens")
    # Slicing (word[:1]) instead of indexing keeps an empty token from
    # raising IndexError; it is simply not counted as a word.
    words = [word for word in tokens if word[:1].isalpha()]
    print("  ", '{:,}'.format(len(words)), "words")
    print("  ", '{:,}'.format(len(set(words))), "unique word types")

# Summarize the sonnets corpus built above.
corpus_summary(sonnetsCorpus)

In [None]:
# All tokens of the corpus, across every file.
corpusTokens = sonnetsCorpus.words()
# Materialize the tokens as a plain list and print it.
print(list(corpusTokens))

In [None]:
# Tiny hand-made sentiment lexicon: polarity label -> {word: weight}.
# Positive words carry positive weights, negative words negative weights.
short_sentiments = dict(
    pos=dict(love=1, like=0.5),
    neg=dict(hate=-1, dislike=-0.5),
)

In [None]:
import nltk
# Score each sonnet with the small lexicon: every occurrence of a lexicon
# word adds that word's weight to the file's score.
documents = {}
for fileid in sonnetsCorpus.fileids():
    # Lowercase first so the lowercase lexicon entries match capitalized words.
    text = sonnetsCorpus.raw(fileid).lower()
    tokens = nltk.word_tokenize(text)
    score = sum(
        tokens.count(word) * value
        for words_dict in short_sentiments.values()
        for word, value in words_dict.items()
    )
    documents[fileid] = score
    

In [None]:
# Load the per-file scores (fileid -> score) into a FreqDist.
valuesFreqs = nltk.FreqDist(documents)
# max() yields the fileid with the largest value, i.e. the highest score here.
mostFreqField = valuesFreqs.max()
# Tabulate 10 entries (presumably the highest-scoring ones — confirm against
# FreqDist.tabulate).
valuesFreqs.tabulate(10)

In [None]:
# Show the raw text of the highest-scoring sonnet found above.
print(sonnetsCorpus.raw(mostFreqField))

In [None]:
from nltk.corpus import sentiwordnet as swn
# Print every SentiWordNet entry returned for "good", together with the
# definition of its underlying synset.
for senti_synset in swn.senti_synsets('good'):
    print(senti_synset, senti_synset.synset.definition())

In [None]:
# Same lookup for "wicked", passing 'a' as the part of speech (the code used
# for adjectives elsewhere in this notebook).
for senti_synset in swn.senti_synsets('wicked', 'a'):
    print(senti_synset, senti_synset.synset.definition())

In [None]:

# Small worked example: tokenize a sentence and tag each token with its
# part of speech; the (token, tag) pairs are reused by the cells below.
text = "This is a good sentence."
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)
tagged

In [None]:
def treebank_to_wordnet_pos(treebank, skipWordNetPos=[]):
    """Translate a Treebank part-of-speech tag into a WordNet one.

    The tag is checked, in order, for the substrings "NN", "JJ", "VB" and
    "RB", which map to "n", "a", "v" and "r" respectively. The first mapping
    whose WordNet code is not listed in ``skipWordNetPos`` wins.

    Args:
        treebank: the Treebank tag, e.g. "NNS" or "JJ".
        skipWordNetPos: WordNet codes that must not be returned (the default
            list is only read, never modified).

    Returns:
        "n", "a", "v" or "r", or None when nothing applies.
    """
    marker_to_wordnet = (("NN", "n"), ("JJ", "a"), ("VB", "v"), ("RB", "r"))
    for marker, wordnet_code in marker_to_wordnet:
        if marker in treebank and wordnet_code not in skipWordNetPos:
            return wordnet_code
    return None

In [None]:
# Show the WordNet part of speech of each tagged token, leaving out verbs;
# tokens without a mapping are not printed.
for word, treebank in tagged:
    wordnet_pos = treebank_to_wordnet_pos(treebank, ["v"])
    if wordnet_pos:
        print(word, wordnet_pos)

In [None]:
# For each tagged token with a WordNet part of speech (verbs excluded), list
# the SentiWordNet entries found for that token and part of speech.
for word, treebank in tagged:
    wordnet_pos = treebank_to_wordnet_pos(treebank, ["v"])
    if wordnet_pos: # only print matches
        print(word)
        for senti_synset in swn.senti_synsets(word, wordnet_pos):
            print("  ", senti_synset)

In [None]:
def get_sentiment_score_from_tagged(token, treebank, skipWordNetPos=[]):
    """Score one tagged token with SentiWordNet.

    Args:
        token: the word to look up.
        treebank: the Treebank part-of-speech tag of the token.
        skipWordNetPos: WordNet part-of-speech codes to ignore.

    Returns:
        The positive score minus the negative score of the first
        SentiWordNet entry found for the token, or None when the tag has no
        usable WordNet part of speech or no entry exists.
    """
    pos = treebank_to_wordnet_pos(treebank, skipWordNetPos)
    if not pos:
        return None
    candidates = list(swn.senti_synsets(token, pos))
    if not candidates:
        return None
    # Only the first entry returned by the lookup is considered.
    first = candidates[0]
    return first.pos_score() - first.neg_score()

In [None]:
# Print each example token that has a non-zero sentiment score (verbs are
# skipped; tokens scoring 0 or None are not shown).
for word, treebank in tagged:
    score = get_sentiment_score_from_tagged(word, treebank, ["v"])
    if score:
        print(word, score)

In [None]:
def get_sentiment_data_from_tokens(tokens, skipWordNetPos=[]):
    """Compute the overall sentiment of a token sequence.

    The tokens are part-of-speech tagged, each one is scored with
    ``get_sentiment_score_from_tagged`` and the non-zero scores are summed.

    Args:
        tokens: the tokens to analyse.
        skipWordNetPos: WordNet part-of-speech codes to ignore.

    Returns:
        A tuple ``(total, positives, negatives)``: the summed score, the set
        of lowercased words that scored above zero and the set of lowercased
        words that scored below zero.
    """
    positive_words, negative_words = set(), set()
    total = 0
    for word, treebank in nltk.pos_tag(tokens):
        score = get_sentiment_score_from_tagged(word, treebank, skipWordNetPos)
        # None (no entry) and 0 (neutral) contribute nothing.
        if not score:
            continue
        total += score
        bucket = positive_words if score > 0 else negative_words
        bucket.add(word.lower())
    return total, positive_words, negative_words

In [None]:
# Try the function on the tokens of the example sentence.
get_sentiment_data_from_tokens(tokens)

In [None]:
def get_sentiments_data_from_corpus(corpus, skipWordNetPos=[]):
    """Compute sentiment data for every file of a corpus.

    Args:
        corpus: an object exposing ``fileids()`` and ``words(fileid)``.
        skipWordNetPos: WordNet part-of-speech codes to ignore.

    Returns:
        A tuple ``(documents, positives, negatives)``: a dict mapping each
        fileid to its score, plus the sets of positive and negative words
        collected over all files.
    """
    scores_by_file = {}
    positive_words, negative_words = set(), set()
    for fileid in corpus.fileids():
        score, positives, negatives = get_sentiment_data_from_tokens(
            corpus.words(fileid), skipWordNetPos
        )
        scores_by_file[fileid] = score
        positive_words.update(positives)
        negative_words.update(negatives)
    return scores_by_file, positive_words, negative_words

In [None]:
# Score every sonnet and collect the positive and negative words found.
sonnetsSentimentValues, sonnetsPositives, sonnetsNegatives = get_sentiments_data_from_corpus(sonnetsCorpus)

In [None]:
# Wrap the fileid -> score mapping in a FreqDist for tabulating and plotting.
sonnetsSentimentFreqs = nltk.FreqDist(sonnetsSentimentValues)

In [None]:
# Tabulate 10 entries of the score table.
sonnetsSentimentFreqs.tabulate(10)

In [None]:
%matplotlib inline
# Plot the per-sonnet scores.
sonnetsSentimentFreqs.plot()

In [None]:
import numpy
# Average sentiment score over all sonnets.
numpy.mean(list(sonnetsSentimentFreqs.values()))

In [None]:
import re
def get_html_for_sentiment_data(text, positives, negatives):
    """Wrap sentiment words found in ``text`` in colored HTML spans.

    Positive words are colored green and negative words red. Matching is on
    whole words and ignores case, because the word sets built in this
    notebook are lowercased while the text keeps its original
    capitalization. The matched text is emitted with its original casing.

    All words are handled in a single pass, so the markup inserted for one
    word is never rescanned (two successive substitutions would corrupt the
    ``style="color:red"`` attribute whenever "red", "style", "color" or
    "span" is itself a sentiment word). A word present in both collections
    is colored green.

    Args:
        text: the text to decorate; it is not HTML-escaped by this function.
        positives: iterable of positive words.
        negatives: iterable of negative words.

    Returns:
        ``text`` with each matched word wrapped in
        ``<span style="color:...">``; unchanged when both collections are
        empty.
    """
    positive_words = {word.lower() for word in positives if word}
    negative_words = {word.lower() for word in negatives if word}
    vocabulary = positive_words | negative_words
    if not vocabulary:
        return text

    # Longest alternatives first so a longer word is preferred over one that
    # is a prefix of it; the secondary alphabetical key makes the pattern
    # deterministic. re.escape keeps regex metacharacters in a word literal.
    alternatives = sorted(vocabulary, key=lambda word: (-len(word), word))
    pattern = re.compile(
        r'\b(' + '|'.join(re.escape(word) for word in alternatives) + r')\b',
        re.IGNORECASE,
    )

    def colorize(match):
        found = match.group(1)
        color = "green" if found.lower() in positive_words else "red"
        return '<span style="color:' + color + '">' + found + '</span>'

    return pattern.sub(colorize, text)

In [None]:
# Pick the highest-scoring sonnet and build color-coded HTML for its text.
fileid = sonnetsSentimentFreqs.max() # most positive
text = sonnetsCorpus.raw(fileid)
html = get_html_for_sentiment_data(text, sonnetsPositives, sonnetsNegatives)

In [None]:
from IPython.display import HTML
# Render the file name as a heading and the colored text in a <pre> block so
# the line breaks of the sonnet are preserved.
HTML("<h2>" + fileid + "</h2><pre>" + html + "</pre>")

In [None]:
# Same display for the lowest-scoring sonnet.
fileid = sonnetsSentimentFreqs.most_common()[-1][0] # most negative (fileid of the last element in the most common list)
text = sonnetsCorpus.raw(fileid)
html = get_html_for_sentiment_data(text, sonnetsPositives, sonnetsNegatives)
HTML("<h2>" + fileid + "</h2><pre>" + html + "</pre>")

Exercise (if time): In the cell below, write some code that will loop through all the sonnets and print them all in color-coded HTML, as above.