# The words that fueled the American Revolution: Sentiment Analysis on Thomas Paine's _Common Sense_




In [1]:
# import packages
import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests
import re
from urllib.parse import urlparse
import urllib.robotparser
from bs4 import BeautifulSoup

# This code checks the robots.txt file
def canFetch(url):
    """Report whether the site's robots.txt lets a generic crawler fetch *url*.

    Args:
        url: Absolute URL of the page that is about to be requested.

    Returns:
        True or False as answered by the robots.txt rules for user agent "*",
        or None when robots.txt could not be retrieved or evaluated.
    """
    parsed_uri = urlparse(url)
    # Build the robots.txt location from scheme and host only, with exactly
    # one slash before the file name (a doubled slash requests a different
    # path, which some servers answer with an error page).
    robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
        canFetchBool = rp.can_fetch("*", url)
    except Exception:
        # Best effort: any fetch/parse failure is reported as "unknown".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        canFetchBool = None

    return canFetchBool

# Data Wrangling and Cleaning

In [2]:
# Loading in the text file
# A context manager closes the file handle as soon as the lines are read.
with open("common_sense.txt", encoding='utf8') as text_file:
    common_sense = [line.strip() for line in text_file]  # one stripped string per line

In [3]:
# Formatting text
word_by_word = " ".join(common_sense).split(" ")
# Filtering out short words (only words longer than four characters are kept)
new_list = [word for word in word_by_word if len(word) > 4]
no_short_words = " ".join(new_list)
# Cutting out excess text not in actual pamphlet
# NOTE: these are character offsets into the filtered string above, so they
# are only valid for this particular text file and this filtering step.
start = 881
end = -15841
cs = no_short_words[start:end]
# Removing punctuation, commas, semicolons, dashes, colons, parenthesis, brackets, extra quotations
# One translate pass deletes every listed character; a separate "--" removal
# is unnecessary because every single "-" is already deleted.
cs = cs.translate(str.maketrans("", "", ",;.()-:?![]\"'"))
# lowercasing all words for simplicity
cs = cs.lower() #the whole text with no punctuation or marks


# Counting Words, Creating a Dictionary, Finding Common Words, and Searching for Particular Words

In [4]:
# Tokenise the cleaned text on single spaces
cs_list_words = cs.split(" ")
# Number of distinct tokens (duplicates collapse inside the set)
distinct_split_words = len(set(cs_list_words))
# Number of tokens overall
total_words = len(cs_list_words)
# Display both counts: (total, distinct)
total_words, distinct_split_words

(8724, 3130)

In [5]:
# Build a word -> occurrence-count mapping from the token list
cs_dictionary = {}
for word in cs_list_words:
    if len(word) == 0:
        continue  # empty tokens are not counted
    # Start from 0 the first time a word is seen, then add this occurrence
    cs_dictionary[word] = cs_dictionary.get(word, 0) + 1
# cs_dictionary now maps every non-empty word to the number of times it occurs

In [6]:
# Most common words
keys = list(cs_dictionary)  # every distinct word, in dictionary order
values = list(cs_dictionary.values())  # the matching counts
values_sorted = sorted(values)
values_sorted.reverse()  # largest counts first
max_values = values_sorted[:10]  # the ten largest counts (may contain repeats)
# Print, in dictionary order, every word whose count is one of those ten
# values; words tied on a count are all printed, so more than ten can appear.
for i in keys:
    count = cs_dictionary[i]
    if count in max_values:
        print(i)  # The most common words over 4 letters... that doesn't tell us much!

being
first
power
which
england
their
america
government
other
without
would
continent


# Searching for how many times particular words are used

In [8]:
# Occurrences of "power" in the cleaned text (KeyError if the word never appears)
cs_dictionary["power"]

44

In [9]:
# Occurrences of "britain" in the cleaned text (KeyError if the word never appears)
cs_dictionary["britain"]

42

In [10]:
# Occurrences of "monarchy" in the cleaned text (KeyError if the word never appears)
cs_dictionary["monarchy"]

14

In [11]:
# Occurrences of "freedom" in the cleaned text (KeyError if the word never appears)
cs_dictionary["freedom"]

9

In [12]:
# Occurrences of "america" in the cleaned text (KeyError if the word never appears)
cs_dictionary["america"]

46

In [13]:
# Occurrences of "suffer" in the cleaned text (KeyError if the word never appears)
cs_dictionary["suffer"]

12

In [14]:
# Occurrences of "england" in the cleaned text (KeyError if the word never appears)
cs_dictionary["england"]

53

# Sentiment Analysis

In [15]:
# Importing nltk for Sentiment Analysis
import nltk
# Download the 'vader_lexicon' resource before the analyzer is constructed
# (presumably the analyzer loads it on creation - confirm against NLTK docs).
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single analyzer instance reused by the per-word and whole-text scoring below
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/kjadbaba/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [16]:
# Score every distinct word on its own and tag each score dict with its word
sentiment_list = [
    {**sid.polarity_scores(entry), 'word': entry}  # analyzer scores plus a 'word' field
    for entry in keys
]
# sentiment_list = one dictionary per word: its sentiment scores and the word itself

In [17]:
import pandas as pd
cs_ws = pd.DataFrame(sentiment_list)  # one row per distinct word
# Keep only the rows whose compound score is -.6 or lower
is_strongly_negative = cs_ws["compound"] <= -.6
worst_words = cs_ws[is_strongly_negative]
# Most negative compound first
worst_words_sort = worst_words.sort_values(by="compound", ascending=True)
worst_words_sort

Unnamed: 0,compound,neg,neu,pos,word
1285,-0.7003,1.0,0.0,0.0,slavery
2177,-0.6808,1.0,0.0,0.0,murderer
200,-0.6597,1.0,0.0,0.0,evil
747,-0.6597,1.0,0.0,0.0,devil
1704,-0.6486,1.0,0.0,0.0,murderers
3052,-0.6486,1.0,0.0,0.0,murdering
32,-0.6369,1.0,0.0,0.0,abuse
1185,-0.6369,1.0,0.0,0.0,betray
2123,-0.6369,1.0,0.0,0.0,hellish
2738,-0.6369,1.0,0.0,0.0,hatred


In [18]:
# Keep only the rows whose compound score is .6 or higher
is_strongly_positive = cs_ws["compound"] >= .6
best_words = cs_ws[is_strongly_positive]
# Most positive compound first
best_words_sort = best_words.sort_values(by="compound", ascending=False)
best_words_sort

Unnamed: 0,compound,neg,neu,pos,word
220,0.6369,0.0,0.0,1.0,paradise
250,0.6369,0.0,0.0,1.0,greatest
322,0.6369,0.0,0.0,1.0,perfectly
426,0.6369,0.0,0.0,1.0,freedom
454,0.6369,0.0,0.0,1.0,glorious
1673,0.6369,0.0,0.0,1.0,best
1679,0.6369,0.0,0.0,1.0,love
95,0.6249,0.0,0.0,1.0,great
760,0.6124,0.0,0.0,1.0,splendor


In [19]:
# Score the entire cleaned text in one call (treated as one giant sentence)
total = sid.polarity_scores(cs)
# Pull out the negative, positive and neutral scores
total_neg, total_pos, total_neu = total['neg'], total['pos'], total['neu']
# Display the three side by side to compare negative vs positive
(total_neg, total_pos, total_neu)

(0.117, 0.156, 0.728)

In [20]:
# Frequency-weighted sentiment: sum over words of (times used) * (compound score).
# Both lists are read from the SAME alphabetically sorted table, so each count
# is guaranteed to be paired with the compound of the same word (no reliance
# on two separate sorts happening to agree).
by_word = cs_ws.sort_values('word')
compounds = list(by_word["compound"]) # compound score of each word, alphabetical order
dict_values = [cs_dictionary[w] for w in by_word["word"]] # usage count of each word, same order
products = [a * b for a, b in zip(dict_values, compounds)] # count * compound per word
sum_products = sum(products) # summing together all the multiplied values
round(sum_products)

83

   Overall, a simple sentiment analysis on Thomas Paine's Common Sense can provide a unique understanding of numerous aspects of the pamphlet without reading a single word of it. To begin, I downloaded the text file and cleaned it. Because it is an ebook, it had extra text about the website, use restrictions, and other things that are found in the first few pages of a book (before the text actually begins). To get rid of the excess text, I manually sliced the string so that it would start with the first word of the text and end with the last word in the actual book. Next, I cut out all words four characters or shorter, lowercased every word, and filtered out any types of punctuation or marks, leaving one long paragraph of every word (or number) separated by a space. After that, I found the number of words, the number of distinct words, and created a dictionary of every word (key) and how many times that word was used (value). After cleaning the text and creating a dictionary, I analyzed some interesting questions about Common Sense and its overall sentiment.
   
   First, I found that the most common words over four characters don't reveal much about the text. Words like "which", "their", "other", and "would", for example, are among the most common words, yet are mostly neutral and don't have any significant meaning. On the other hand, some of the other words, like "government", "america", "power", and "england", show that the text has something to do with government and power in America and England. Second, I wanted to look at the most negative and positive words used by Thomas Paine, and, through his choice of words, try to gain insight into the rhetoric of the text. The negative words with the most negative compounds, like "slavery", "murderer", and "evil", suggest that Thomas Paine believed the British power and control over America was like slavery, and that he thought of the British people as evil murderers. In contrast, the positive words, like "paradise", "greatest", and "freedom", are likely used to describe America, the American people, and the potential and future of America if the country became free from England's control. Through this simple sentiment analysis of the individual words in Common Sense, we can see how Thomas Paine was trying to persuade the Americans to stand up against the power of the British, and specifically what words he used to accomplish that goal. Still, this analysis is limited as the words are out of context, and it does not give a full conclusion on the overall sentiment of the text. To answer that question, I used two different methods. In the first method, I ran the sentiment analyzer on the whole text as one large sentence, and found that it is slightly more positive than negative, but more neutral than anything. The fact that most of the words are neutral makes sense in that the analyzer takes each word at a time, so most words in any text will most probably be neutral.
To validate this first method, I sought a way to express the sentiment as a value; the more positive the value, the more positive the overall sentiment of the text, and vice versa. To do this, I took the sum of each compound multiplied by the number of times the word was used. For example, the sentence "bad bad bad good good" will be represented by a negative value because the compound of bad will be multiplied by three and good by two. At the same time, a word that is more on the ends of the spectrum, like "slavery", will have a larger impact on the value than a word like "bad". As another example, the sentence "slavery slavery good good good" could be positive or negative, depending on the compounds of each word. As a whole, using this second method, I found that the value rounded to the nearest whole number is 83, so it can be concluded, by the consistency of both methods, that the overall sentiment is more positive than negative. Furthermore, a lot of information on Thomas Paine's Common Sense can be gathered and understood through all of the unique analysis completed. While there are many limitations to the analysis, including the accuracy of the sentiment analyzer (for example, "not good" will be seen as positive), we can see that Thomas Paine's overall sentiment was positive, which could imply that he focused on praising America and uniting and galvanizing the people on the greatness of the country, the people, and its need for independence from England (my interpretation from the analysis). In conclusion, this sentiment analysis on Thomas Paine's Common Sense demonstrates many interesting aspects and statistics about the text which can be understood without reading it.