In [1]:
# read in some helpful libraries
# (Counter, stats, string and the sentiment analyzer below are not referenced
# in the cells visible here -- presumably used elsewhere or left over; verify
# before removing)
import nltk # the natural language toolkit, open-source NLP
import pandas as pd # dataframes
import numpy as np 
import matplotlib.pyplot as plt
plt.style.use('ggplot')  # use the 'ggplot' style for all matplotlib figures
from collections import Counter
from scipy import stats
import string

# initialize Sentiment Analyzer (VADER); creates one shared analyzer instance
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [2]:
# Load the labelled training sentences into a dataframe. The file is decoded
# as latin-1; each row carries an id, the sentence text and the author's
# initials.
texts = pd.read_csv(filepath_or_buffer="raw_data/train_full.csv", encoding="latin-1")

# preview the first five rows
texts.head(5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Load the unlabelled test sentences (id and text only) into a dataframe,
# decoded as latin-1.
test = pd.read_csv(filepath_or_buffer="raw_data/test_full.csv", encoding="latin-1")

# preview the first five rows
test.head(5)

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [4]:
# Build two per-author lookups from the training data:
#   sentence_dict[author] -> every sentence by that author, joined with single
#                            spaces into one long lower-cased string
#   token_dict[author]    -> that string split into individual tokens (words)

# split the data by author
byAuthor = texts.groupby("author")

# Lower-casing makes "The" and "the" count as the same word rather than two
# different words.
sentence_dict = {
    author: rows['text'].str.cat(sep=' ').lower()
    for author, rows in byAuthor
}

# tokenize each author's combined text
token_dict = {
    author: nltk.tokenize.word_tokenize(combined)
    for author, combined in sentence_dict.items()
}

In [6]:
# Per-author word frequency distributions: wordFreqByAuthor[author] is an
# nltk FreqDist counting every token that author used in the training data.
wordFreqByAuthor = nltk.probability.ConditionalFreqDist()

# count the tokens of each author and store the distribution under their name
for author in token_dict:
    wordFreqByAuthor[author] = nltk.FreqDist(token_dict[author])

In [8]:
# Assign each test sentence a probability under every author's word-frequency
# model, indicating how likely it is that each author wrote the sentence.
#
# Results (same names and layout as before, so later cells work unchanged):
#   ProbabilitiesByAuthor -- one row per author (row labels 0, 1, 2, ... in
#                            wordFreqByAuthor order) and one column per test
#                            sentence (every column is labelled 0)
#   winnerBySentence      -- for each test sentence, the row number of the
#                            highest-scoring author

# very small amount added to every word frequency so none of them are 0
SMOOTHING = 0.000001

# fix the author order once; row r of every result below refers to authors[r]
authors = list(wordFreqByAuthor.keys())

# logScores[r, c] = sum over the words of sentence c of log(smoothed frequency)
# under author r. Working with sums of logs instead of a product of tiny
# probabilities keeps long sentences from underflowing to exactly 0.0 for
# every author, which would make the "winner" default to the first author.
logScores = np.zeros((len(authors), len(test['text'])))

# for each sentence in the test df...
for column, testSentence in enumerate(test['text']):

    # lowercase & tokenize sentence
    preProcessedTestSentence = nltk.tokenize.word_tokenize(testSentence.lower())

    # for each author...
    for row, author in enumerate(authors):
        authorFreq = wordFreqByAuthor[author]

        # how frequently the author used each word of the sentence, smoothed
        smoothedWordFreqs = [authorFreq.freq(word) + SMOOTHING
                             for word in preProcessedTestSentence]

        # joint (log) probability that this author wrote the sentence;
        # a sentence with no tokens scores log(1) = 0 for every author
        logScores[row, column] = np.sum(np.log(smoothedWordFreqs))

# and our winner is... (decided in log space; ties go to the first author)
winnerBySentence = [int(row) for row in logScores.argmax(axis=0)]

# Joint probabilities on the original scale, built in one go rather than by
# growing a dataframe cell by cell. Very small values may still display as 0.0
# here, but the winners above are unaffected by that.
ProbabilitiesByAuthor = pd.DataFrame(np.exp(logScores),
                                     columns=[0] * logScores.shape[1])

In [10]:
# Label the rows with the authors' initials and flip the table so that each
# row is a test sentence and each column an author.
# The labels are taken from wordFreqByAuthor, whose key order is the order in
# which the rows were produced, instead of being typed in by hand.
# NOTE: run this cell only once per scoring run -- it overwrites
# ProbabilitiesByAuthor with its transposed form.
authorLabels = dict(enumerate(wordFreqByAuthor.keys()))
ProbabilitiesByAuthor = ProbabilitiesByAuthor.rename(index=authorLabels).transpose()
ProbabilitiesByAuthor

Unnamed: 0,EAP,HPL,MWS
0,4.663878e-66,3.689544e-67,2.962311e-62
0,2.561907e-195,8.552745e-203,5.857305e-205
0,3.879998e-113,1.820016e-110,4.388748e-118
0,1.821917e-132,7.366460e-132,8.216627e-141
0,2.627913e-35,2.576906e-37,6.739408e-37
0,2.568467e-115,1.788775e-117,2.469855e-120
0,4.349804e-39,1.469934e-40,5.345836e-41
0,3.477608e-93,6.007603e-92,3.401735e-88
0,4.145512e-84,1.322378e-89,2.775326e-96
0,2.636376e-27,1.499354e-29,9.058813e-29


In [11]:
# Translate each winning row number in winnerBySentence into the author's
# initials. The row numbers index the authors in wordFreqByAuthor order, so
# look the label up there rather than hard-coding which number means whom
# (previously any unexpected number was silently labelled 'MWS').
authorOrder = list(wordFreqByAuthor.keys())
authorship = [authorOrder[winner] for winner in winnerBySentence]

authorship

['MWS',
 'EAP',
 'HPL',
 'HPL',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'HPL',
 'HPL',
 'HPL',
 'EAP',
 'MWS',
 'MWS',
 'EAP',
 'MWS',
 'HPL',
 'EAP',
 'EAP',
 'MWS',
 'HPL',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'MWS',
 'MWS',
 'EAP',
 'HPL',
 'EAP',
 'HPL',
 'HPL',
 'EAP',
 'MWS',
 'HPL',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'HPL',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'MWS',
 'MWS',
 'MWS',
 'HPL',
 'HPL',
 'MWS',
 'EAP',
 'HPL',
 'EAP',
 'EAP',
 'EAP',
 'HPL',
 'MWS',
 'EAP',
 'EAP',
 'HPL',
 'MWS',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'MWS',
 'HPL',
 'EAP',
 'MWS',
 'EAP',
 'HPL',
 'EAP',
 'EAP',
 'MWS',
 'HPL',
 'MWS',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'EAP',
 'MWS',
 'HPL',
 'HPL',
 'MWS',
 'EAP',
 'EAP',
 'MWS',
 'MWS',
 'MWS',
 'EAP',
 'MWS',
 'MWS',
 'MWS',
 'EAP',
 'HPL',
 'MWS',
 'EAP',
 'MWS',
 'HPL',
 'HPL',
 'HPL',
 'MWS',
 'EAP',
 'HPL',
 'EAP',
 'EAP',
 'EAP',
 'MWS',
 'MWS',
 'EAP',
 'HPL',


In [13]:
# Save the per-sentence author probabilities (one row per test sentence, one
# column per author) as a CSV file in the working directory.
ProbabilitiesByAuthor.to_csv(path_or_buf='probabilities_by_author_and_sentence.csv')

In [15]:
# Attach the predicted author of each sentence as a new 'author' column on the
# test dataframe, then save the labelled sentences as a CSV file.
test = test.assign(author=authorship)

test.to_csv(path_or_buf='test_with_author.csv')