In [60]:
##Text Analysis - Cosine Similarity/Other Text Analysis

# ----------------------------------------------------------------------------
#                                Purpose
# ----------------------------------------------------------------------------
# The purpose of this script is do a text analysis of the 
# the 85 Essays in the Federalist Papers. By doing so we can figure out who wrote
# the unknown essays and how related these essays are to one another. 
#
# ## Import Libraries
import pandas as pd
import nltk
import os
import requests


In [61]:
# ----------------------------------------------------------------------------
#                       Combining all files together 
# ----------------------------------------------------------------------------
#%% Loop through our data folder (note: in Spyder you'll have to open a project
# in our working directory to do this) and read in all of our files
# Initialize an empty dataframe to hold all of our text

import os

text_df = pd.DataFrame(columns = ["lines", "essay"])
files = os.listdir("C:\\Users\\sabri\\Desktop\\Federalist\\Data\\")

# Loop through all text files and read them into singular dataframe
for text_file in files:
    
    print(f"\n============================")
    print(f"About to load {text_file} in")
    print(f"============================")
        
    temp_df = pd.read_csv('C:\\Users\\sabri\\Desktop\\Federalist\\Data\\' + text_file, # Read the text file in our data folder
                          delimiter = '\n', # Looks like every line ends with \n
                          header = None, # Read from the first line
                          names = ["lines"], # Column header
                          error_bad_lines = False) # Skip bad lines (there are like 6 across all essays)
    
    # Add the file name in so we have the essay number for reference
    temp_df['essay'] = text_file
    
    # Append the temporary dataframe with all corresponding text into our master dataframe
    text_df = text_df.append(temp_df)




About to load essay01.txt in

About to load essay02.txt in

About to load essay03.txt in

About to load essay04.txt in

About to load essay05.txt in

About to load essay06.txt in

About to load essay07.txt in

About to load essay08.txt in

About to load essay09.txt in

About to load essay10.txt in

About to load essay11.txt in

About to load essay12.txt in

About to load essay13.txt in

About to load essay14.txt in

About to load essay15.txt in

About to load essay16.txt in

About to load essay17.txt in

About to load essay18.txt in

About to load essay19.txt in

About to load essay20.txt in

About to load essay21.txt in

About to load essay22.txt in

About to load essay23.txt in

About to load essay24.txt in

About to load essay25.txt in

About to load essay26.txt in

About to load essay27.txt in

About to load essay28.txt in

About to load essay29.txt in

About to load essay30.txt in

About to load essay31.txt in

About to load essay32.txt in

About to load essay33.txt in

About to 

In [62]:
#%% Data cleaning
# The essays come in in the format 'Essay22.txt', and we'd prefer if it just said 'Essay 22'
# Let's start by saving our dataframe as a new object
cleaned_df = text_df.copy()

# Remove .txt first
cleaned_df['essay'] = cleaned_df['essay'].str.replace('.txt', '')

# Put a space in between 'essay' and the number of the essay
cleaned_df['essay'] = cleaned_df['essay'].str.replace('essay', 'Essay ')

# We may also have None (NA) types in the dataframe. Let's drop these
cleaned_df['lines'].dropna(how = 'any')

# Reset the index column for reference
cleaned_df.reset_index(drop = True, inplace = True)

# Because I use a Mac, some rows are .DS_Store, so let's filter those out
cleaned_df_filtered = cleaned_df[cleaned_df['essay'] != '.DS_Store']

cleaned_df_filtered.head(10)

Unnamed: 0,lines,essay
0,AFTER an unequivocal experience of the ineffic...,Essay 01
1,"federal government, you are called upon to del...",Essay 01
2,Constitution for the United States of America....,Essay 01
3,own importance; comprehending in its consequen...,Essay 01
4,"existence of the UNION, the safety and welfare...",Essay 01
5,"is composed, the fate of an empire in many res...",Essay 01
6,in the world. It has been frequently remarked ...,Essay 01
7,"reserved to the people of this country, by the...",Essay 01
8,"to decide the important question, whether soci...",Essay 01
9,capable or not of establishing good government...,Essay 01


In [63]:
#%% Text Cleaning
# It's important to ensure that the text we analyze is clean. That is, no
# punctuation, everything lowercase, removal of stop words, etc.

cleaned_df_filtered['lines'] = cleaned_df_filtered['lines'].str.replace('[^A-z]', ' ').str.replace(' +', ' ').str.strip()
                                            
# TODO: remove stop words

from nltk.corpus import stopwords
stop = stopwords.words('english')

cleaned_df_filtered['lines'] = cleaned_df_filtered['lines'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# TODO: make everything lower case                
cleaned_df_filtered['lines'] = cleaned_df_filtered['lines'].apply(lambda x: " ".join(x.lower() for x in x.split()))


##Removing puncutation 
cleaned_df_filtered['lines'] = cleaned_df_filtered['lines'].str.replace('[^\w\s]','')


fed_papers = cleaned_df_filtered.copy()

fed_papers.head()


Unnamed: 0,lines,essay
0,after unequivocal experience inefficacy subsis...,Essay 01
1,federal government called upon deliberate new,Essay 01
2,constitution united states america the subject...,Essay 01
3,importance comprehending consequences nothing ...,Essay 01
4,existence union safety welfare parts,Essay 01


In [64]:
# ----------------------------------------------------------------------------
#                           Cosine Similarity
# ----------------------------------------------------------------------------
# The purpose of this section is do analyze the texts using cosine similarity.
# Cosine similarity is calculated by measuring the angle of the cosine between two vectors. 
# The smaller the angle, the higher the cosine similarity.
# Magnitute is not important in cosine similarity, only orientation. This is useful
# in text analysis because even if two documents have varying lengths, they could 
# still be related in terms of content. 
# In text analysis, this is what makes it more advantageous than other distance measures. 


In [65]:
#Grouping all of the lines by essay number

fed_papers = fed_papers.groupby("essay")
fed_papers= fed_papers["lines"].agg(lambda column: " ".join(column))

#Resetting the index 
fed_papers = fed_papers.reset_index(name="lines")

In [66]:
fed_papers["lines"][0]

'after unequivocal experience inefficacy subsisting federal government called upon deliberate new constitution united states america the subject speaks importance comprehending consequences nothing less existence union safety welfare parts composed fate empire many respects interesting world it frequently remarked seems reserved people country conduct example decide important question whether societies men really capable establishing good government reflection choice whether forever destined depend political constitutions accident force if truth remark crisis arrived may propriety regarded era decision made wrong election part shall act may view deserve considered general misfortune mankind this idea add inducements philanthropy patriotism heighten solicitude considerate good men must feel event happy choice directed judicious estimate true interests unperplexed unbiased considerations connected public good but thing ardently wished seriously expected the plan offered deliberations aff

In [67]:
fed_papers.head()

Unnamed: 0,essay,lines
0,Essay 01,after unequivocal experience inefficacy subsis...
1,Essay 02,when people america reflect called upon decide...
2,Essay 03,it is new observation people country like amer...
3,Essay 04,my last paper assigned several reasons safety ...
4,Essay 05,queen anne letter st july scotch parliament ma...


In [68]:
#Converts lines into a vectorized TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

#Transforms the data
fed_transform = TfidfVectorizer().fit_transform(fed_papers['lines'])

fed_transform

<85x8474 sparse matrix of type '<class 'numpy.float64'>'
	with 53895 stored elements in Compressed Sparse Row format>

In [69]:
#Slicing the matrix to get a submatrix in each row
#Let's take a look at the first Essay

fed_transform[0:1]

<1x8474 sparse matrix of type '<class 'numpy.float64'>'
	with 560 stored elements in Compressed Sparse Row format>

In [70]:
#Importing a linear kernal 
from sklearn.metrics.pairwise import linear_kernel

In [71]:
#Getting similarites through the linear kernal 
fed_cosine_similarities = linear_kernel(fed_transform[0:1],fed_transform).flatten()

In [72]:
fed_cosine_similarities

array([1.        , 0.24248331, 0.19453765, 0.18155295, 0.16440841,
       0.16952053, 0.20336079, 0.19757717, 0.21749066, 0.23962499,
       0.16014667, 0.17737461, 0.20392943, 0.18665427, 0.24441376,
       0.20683298, 0.20471895, 0.13234105, 0.13727825, 0.13873905,
       0.19290639, 0.2783429 , 0.23383181, 0.1591988 , 0.20336057,
       0.23546518, 0.18387218, 0.2058263 , 0.18554566, 0.23166577,
       0.23421106, 0.1341495 , 0.18373895, 0.1944032 , 0.19724486,
       0.24548596, 0.24105335, 0.2079633 , 0.21594254, 0.22841004,
       0.25932594, 0.16854812, 0.26163057, 0.197418  , 0.22636515,
       0.23890617, 0.1328758 , 0.17859182, 0.22283346, 0.13718231,
       0.23451095, 0.18325723, 0.16849247, 0.14707934, 0.18096885,
       0.15332521, 0.18900522, 0.22167091, 0.22794369, 0.22479624,
       0.19004653, 0.21576562, 0.21867444, 0.19455947, 0.20984302,
       0.18104205, 0.15143357, 0.1623089 , 0.15957013, 0.21567431,
       0.21527502, 0.18537786, 0.22028761, 0.14617154, 0.17451

This will show you the similarities between the first Essay and the other Essays. 
The range is from 0-1. 1 meaning they are the most similar, 0 meaning they are the least similar. 
As you can see the "1" in the first row and column is because it is comparing the first Essay to itself.

In [73]:
#Sorting the cosine similarities
#Let's look at the five most related Essays to the first Essay

related_essays = fed_cosine_similarities.argsort()[-7:-1]
related_essays

array([35, 40, 42, 21, 84, 83], dtype=int64)

In [14]:
#Cosine Similarity of the related essays
fed_cosine_similarities[related_essays]

array([1.        , 0.16333684, 0.15210208, 0.15195164, 0.1439729 ,
       0.14349407])

Essay 1 - Hamilton - For the Independent Journal - General Introduction

Excluding itself, the five most similar Essays to Essay 1 were the following: 

1. Essay 84 - Hamilton - From McLEAN's Edition - Certain General and Miscellaneous Objections to the Constitution Considered and Answered


2. Essay 85 - Hamilton - 	From McLEAN's Edition - Concluding Remarks   


3. Essay 22 - Hamilton - From the New York Packet  - The Same Subject Continued: Other Defects of the Present Confederation - Friday, December 14, 1787


4. Essay 43 - Madison -  	For the Independent Journal - 	The Same Subject Continued: The Powers Conferred by the Constitution Further Considered


5.	Essay 40 - Madison - From the New York Packet - The Powers of the Convention to Form a Mixed Government Examined and Sustained - Friday, January 18, 1788

In [15]:
#Let's take a look at one of the unknown Essays - Essay 52
fed_transform[51:52]

<1x22309 sparse matrix of type '<class 'numpy.float64'>'
	with 521 stored elements in Compressed Sparse Row format>

In [16]:
#Getting similarites through the linear kernal 
fed_cosine_similarities = linear_kernel(fed_transform[51:52],fed_transform).flatten()

In [17]:
fed_cosine_similarities

array([0.10098887, 0.08775587, 0.09312682, 0.07839288, 0.07084643,
       0.08459868, 0.09992395, 0.09663708, 0.09810563, 0.10657355,
       0.09400158, 0.09817362, 0.10045103, 0.10751882, 0.10415581,
       0.08504067, 0.08709252, 0.06735856, 0.07468257, 0.07719775,
       0.11025043, 0.12391947, 0.09887189, 0.09132512, 0.10950148,
       0.14016934, 0.07107975, 0.09496323, 0.10189658, 0.10032443,
       0.09431211, 0.08201221, 0.09436381, 0.10093062, 0.08404602,
       0.13376713, 0.12259353, 0.11500993, 0.18546175, 0.1491983 ,
       0.16740061, 0.11769523, 0.1498859 , 0.13555068, 0.14294402,
       0.15469904, 0.10048441, 0.12271427, 0.12560447, 0.08909412,
       0.12122357, 1.        , 0.19645533, 0.1389407 , 0.20415304,
       0.13279746, 0.15268227, 0.1431449 , 0.16905727, 0.12942176,
       0.15171226, 0.16428287, 0.1971695 , 0.09691149, 0.10820505,
       0.0969073 , 0.09813144, 0.10560274, 0.13353598, 0.09287387,
       0.12866832, 0.09822358, 0.1143181 , 0.06400234, 0.10183

In [18]:
#Sorting the cosine similarities
#Let's look at the five most related Essays to the 52nd Essay

related_essays = fed_cosine_similarities.argsort()[:-7:-1]
related_essays

array([51, 54, 62, 52, 38, 58], dtype=int64)

In [19]:
#Cosine Similarity of the related essays
fed_cosine_similarities[related_essays]

array([1.        , 0.20415304, 0.1971695 , 0.19645533, 0.18546175,
       0.16905727])

In [20]:
Essay 52 - Unknown - From the New York Packet - The House of Representatives - Friday, February 8, 1788


Excluding itself, the five most similar Essays to Essay 49 were the following: 

1. Essay 55 - Unknown - From the New York Packet - The Total Number of the House of Representatives - Friday, February 15, 1788


2. Essay 63 - Unknown - For the Independent Journal - The Senate 


3. Essay 53 - Unknown - From the New York Packet - The Same Subject Continued: The House of Representatives - Tuesday, February 12, 1788



4. Essay 39 - Madison -  For the Independent Journal - Conformity of the Plan to Republican Principles


5.Essay 59 - Hamilton - From the New York Packet Concerning the Power of Congress to Regulate the Election of Members - Friday, February 22, 1788


SyntaxError: invalid syntax (<ipython-input-20-c36dfd7a1db1>, line 1)

In [None]:
#To Do 
#Loop through the similarities, figure which Essays were related to each and put it into a dataframe. 
#Create a methodology where we sort which Essays were most similar to which author. 
#Example - In Essay 52, it was attributed once to Madison and once Hamilton in the top, the next author it is most similar to, 
#we attribute Essay 52 to that person. 
#Another methodology is to do a k-means clustering to group the Essays together. 