In [165]:
import numpy as np
import nltk
import glob
import os
import ntpath
import re
import pandas as pd
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.vq import whiten
%matplotlib inline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [166]:
# Load data
papers = r"/Users/jonlucadecaro/Documents/Other/Federalist-Papers-NLP/papers"
hamilton = sorted(glob.glob(os.path.join(papers, "hamilton/*")))
madison = sorted(glob.glob(os.path.join(papers, "madison/*")))
disputed = sorted(glob.glob(os.path.join(papers, "disputed/*")))

hamilton_papers = []
for fn in hamilton:
    with open(fn) as f:
        hamilton_papers.append(f.read().replace('\n', ' ').replace('\r',''))
hamilton_papers_all = ' '.join(hamilton_papers)

madison_papers = []
for fn in madison:
    with open(fn) as f:
        madison_papers.append(f.read().replace('\n', ' ').replace('\r',''))
madison_papers_all = ' '.join(madison_papers)

disputed_papers = []
disputed_papers_file_names = []
for fn in disputed:
    with open(fn) as f:
        disputed_papers.append(f.read().replace('\n', ' ').replace('\r',''))
        disputed_papers_file_names.append(ntpath.basename(fn))
disputed_papers_all = ' '.join(disputed_papers)

known_papers_all = hamilton_papers_all + " " + madison_papers_all
known_papers = hamilton_papers + madison_papers

In [167]:
def LexicalFeatures(papers, all_papers):
    """
    Compute feature vectors for word and punctuation features
    """
    num_papers = len(papers)
    fvs_lexical = np.zeros((len(papers), 2), np.float64)
    fvs_punct = np.zeros((len(papers), 3), np.float64)
    for e, single_paper_text in enumerate(papers):
        # note: the nltk.word_tokenize includes punctuation
        tokens = nltk.word_tokenize(single_paper_text.lower())
        words = word_tokenizer.tokenize(single_paper_text.lower())
        sentences = sentence_tokenizer.tokenize(single_paper_text)
        vocab = set(words)
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s)) for s in sentences])

        # average number of words per sentence
        fvs_lexical[e, 0] = words_per_sentence.mean()
        # Lexical diversity
        fvs_lexical[e, 1] = len(vocab) / float(len(words))

        # Commas per sentence
        fvs_punct[e, 0] = tokens.count(';') / float(len(sentences))
        fvs_punct[e, 1] = tokens.count('"') / float(len(sentences))
        fvs_punct[e, 2] = tokens.count(',') / float(len(sentences))
        

    # apply whitening to decorrelate the features
    fvs_lexical = whiten(fvs_lexical)
    fvs_punct = whiten(fvs_punct)

    return fvs_lexical, fvs_punct

In [251]:
def SyntacticFeatures(papers, all_papers):
    """
    Extract feature vector for part of speech frequencies
    """
    def token_to_pos(paper):
        tokens = nltk.word_tokenize(paper)
        return [p[1] for p in nltk.pos_tag(tokens)]

    paper_pos = [token_to_pos(paper) for paper in papers]
    pos_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS']
    fvs_syntax = np.array([[paper.count(pos) for pos in pos_list] for paper in paper_pos]).astype(np.float64)

    # normalise by dividing each row by number of tokens in the paper
    fvs_syntax /= np.c_[np.array([len(paper) for paper in paper_pos])]

    return fvs_syntax


In [252]:
def PredictAuthors(fvs):
    km = KMeans(n_clusters=2, init='k-means++', n_init=100, max_iter=300, verbose=0)
    km.fit(fvs)
    return km


In [253]:

known_set = list(LexicalFeatures(known_papers, known_papers_all))
known_set.append(SyntacticFeatures(known_papers, known_papers_all))

classifications = [PredictAuthors(fvs) for fvs in known_set]




In [254]:
# Get FVS/normalized data to predict from disputed papers
disputed_set = list(LexicalFeatures(disputed_papers, disputed_papers_all))
disputed_set.append(SyntacticFeatures(disputed_papers, disputed_papers_all))



In [255]:
results = list()
results.append([classifications[0].predict(disputed_set[0]),"Lexical Features"]) # Predict results of Lexical Features
results.append([classifications[1].predict(disputed_set[1]),"Lexical Features - Punctuation"]) # Predict results of Lexical Features, Punctuation
results.append([classifications[2].predict(disputed_set[2]),"Syntactic Features"]) # Predict results of their syntactic feature

In [256]:
all_results = []
for i in range(len(classifications)):
    hamilton = classifications[i].labels_[0] # Extract first
    individual_classifier_results = []
    for j in range(len(results[i][0])):
        if results[i][0][j] == hamilton: # We know for a fact Hamilton wrote the first paper, so use that as an index
            individual_classifier_results.append("Hamilton")
        else:
            individual_classifier_results.append("Madison")
    print(individual_classifier_results)
    all_results.append(individual_classifier_results)

['Hamilton', 'Hamilton', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Hamilton', 'Madison']
['Hamilton', 'Hamilton', 'Hamilton', 'Hamilton', 'Madison', 'Madison', 'Madison', 'Hamilton', 'Hamilton', 'Hamilton', 'Madison', 'Madison']
['Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Hamilton', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison']


In [257]:
# Print csv results
print(",",','.join(disputed_papers_file_names))
for i in range(len(results)):
    print(results[i][1],",",','.join(all_results[i]))
        

, 49.txt,50.txt,51.txt,52.txt,53.txt,54.txt,55.txt,56.txt,57.txt,58.txt,62.txt,63.txt
Lexical Features , Hamilton,Hamilton,Madison,Madison,Madison,Madison,Madison,Madison,Madison,Madison,Hamilton,Madison
Lexical Features - Punctuation , Hamilton,Hamilton,Hamilton,Hamilton,Madison,Madison,Madison,Hamilton,Hamilton,Hamilton,Madison,Madison
Syntactic Features , Madison,Madison,Madison,Madison,Madison,Hamilton,Madison,Madison,Madison,Madison,Madison,Madison


In [None]:
### OLD WORK ### - saved just in case, but plays no part in above work
# original work with attribution from main dataset, not used in rest of notebook
files = open("The-Federalist-Papers.txt")

lines = files.read()
files.close()

elem = re.split(r'FEDERALIST\.? No\.?', lines)
dataset = pd.DataFrame(columns=['author', 'paper'])

for i in range(len(elem)):
    paper = elem[i]  # get paper
    paper = paper.replace("\n", " ")  # strip newlines
    paper = paper.replace("\r", " ")  # strip windows return
    paper_body = paper[paper.find("To the People of the State of New York: ") + 40:]  # get body

    tokens = word_tokenize(paper_body)  # tokenize
    tokens = [w.lower() for w in tokens]  # convert to lower case
    table = str.maketrans('', '', string.punctuation)  # remove punctuation from each word
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    if "MADISON" in paper and "JAY" in paper:
        dataset = dataset.append({"author": "Madison and Jay", "paper": paper_body}, ignore_index=True)
    elif "MADISON" in paper and "HAMILTON" in paper:
        dataset = dataset.append({"author": "Madison and Hamilton", "paper": paper_body}, ignore_index=True)
    elif "HAMILTON" in paper and "JAY" in paper:
        dataset = dataset.append({"author": "Hamilton and Jay", "paper": paper_body}, ignore_index=True)
    elif "MADISON" in paper:
        dataset = dataset.append({"author": "Madison", "paper": paper_body}, ignore_index=True)
    elif "JAY" in paper:
        dataset = dataset.append({"author": "Jay", "paper": paper_body}, ignore_index=True)
    elif "HAMILTON" in paper:
        dataset = dataset.append({"author": "Hamilton", "paper": paper_body}, ignore_index=True)
