In [1]:
# read in some helpful libraries
import nltk # the natural language toolkit, open-source NLP
import pandas as pd
import numpy as np 

In [2]:
# Load the labelled training sentences (columns: id, text, author).
# The file is read as latin-1 rather than pandas' default utf-8.
texts = pd.read_csv(
    filepath_or_buffer="Data_Raw/train_full.csv",
    encoding="latin-1",
)

# Preview the first five rows.
texts.head(5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Load the unlabelled test sentences (columns: id, text).
# The file is read as latin-1 rather than pandas' default utf-8.
test = pd.read_csv(
    filepath_or_buffer="Data_Raw/test_full.csv",
    encoding="latin-1",
)

# Preview the first five rows.
test.head(5)

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [4]:
# Group the training rows by the author who wrote them.
byAuthor = texts.groupby("author")

# author -> everything that author wrote, joined with single spaces into one
# long string and lower-cased so that "The" and "the" count as the same word.
sentence_dict = {
    author: rows['text'].str.cat(sep = ' ').lower()
    for author, rows in byAuthor
}

# author -> that same text split into individual tokens (words, punctuation).
token_dict = {
    author: nltk.tokenize.word_tokenize(corpus)
    for author, corpus in sentence_dict.items()
}

In [5]:
# Per-author word frequency table: wordFreqByAuthor[author] is the frequency
# distribution of the tokens that author used in the training data.
wordFreqByAuthor = nltk.probability.ConditionalFreqDist()

# Count each author's tokens and store the resulting distribution under the
# author's name.
wordFreqByAuthor.update(
    {author: nltk.FreqDist(tokens) for author, tokens in token_dict.items()}
)

In [6]:
# Score every test sentence against every author with a unigram model: the
# likelihood that an author wrote a sentence is the product, over the tokens
# in the sentence, of that author's (smoothed) relative frequency for the token.
#
# Produces:
#   ProbabilitiesByAuthor -- dataframe with one row per author (row labels
#                            0, 1, 2, ... in the order of wordFreqByAuthor's
#                            keys) and one column per test sentence; every
#                            column is labelled 0, as before.
#   winnerBySentence      -- for each test sentence, the row position of the
#                            most likely author.

# Added to every word frequency so that a word an author never used does not
# force the whole product to zero.
SMOOTHING = 0.000001

# Freeze the author order once: row k of ProbabilitiesByAuthor and the value k
# in winnerBySentence both refer to authors[k].
authors = list(wordFreqByAuthor.keys())

# Collect plain arrays inside the loop and build the dataframe once at the
# end. Growing a dataframe one row at a time is quadratic, and
# DataFrame.append (used here previously) was removed in pandas 2.0.
jointBySentence = []
winnerBySentence = []

# for each sentence in the test df...
for testSentence in test['text']:

    # lowercase & tokenize the sentence, mirroring the training preprocessing
    preProcessedTestSentence = nltk.tokenize.word_tokenize(testSentence.lower())

    # smoothed[k, t] = smoothed frequency of token t under authors[k]
    smoothed = np.array(
        [[wordFreqByAuthor[author].freq(token) + SMOOTHING
          for token in preProcessedTestSentence]
         for author in authors],
        dtype = float,
    )

    # joint probability of the whole sentence under each author
    jointBySentence.append(smoothed.prod(axis = 1))

    # and our winner is... chosen in log space: for long sentences the raw
    # product can underflow to 0.0 for every author, which would make the
    # comparison meaningless, whereas the sum of logs stays finite
    # (every smoothed frequency is >= SMOOTHING > 0).
    winnerBySentence.append(int(np.log(smoothed).sum(axis = 1).argmax()))

# One row per author, one column per sentence. The reshape keeps the array
# two-dimensional even when there are no test sentences at all.
ProbabilitiesByAuthor = pd.DataFrame(
    np.array(jointBySentence).reshape(-1, len(authors)).T,
    columns = [0] * len(jointBySentence),
)

In [7]:
# Rows 0, 1, 2 hold one author each; give them their names, then flip the
# frame so that every row is a test sentence and every column an author.
# NOTE(review): assumes the authors were scored in the order EAP, HPL, MWS -- confirm.
authorLabels = {0: 'EAP', 1: 'HPL', 2: 'MWS'}
ProbabilitiesByAuthor = ProbabilitiesByAuthor.rename(index = authorLabels).T
ProbabilitiesByAuthor

Unnamed: 0,EAP,HPL,MWS
0,4.663878e-66,3.689544e-67,2.962311e-62
0,2.561907e-195,8.552745e-203,5.857305e-205
0,3.879998e-113,1.820016e-110,4.388748e-118
0,1.821917e-132,7.366460e-132,8.216627e-141
0,2.627913e-35,2.576906e-37,6.739408e-37
0,2.568467e-115,1.788775e-117,2.469855e-120
0,4.349804e-39,1.469934e-40,5.345836e-41
0,3.477608e-93,6.007603e-92,3.401735e-88
0,4.145512e-84,1.322378e-89,2.775326e-96
0,2.636376e-27,1.499354e-29,9.058813e-29


In [8]:
# Replace the existing row labels with a fresh 0..n-1 index, discarding the
# old labels instead of keeping them as an extra column.
ProbabilitiesByAuthor = ProbabilitiesByAuthor.reset_index(drop = True)
ProbabilitiesByAuthor

Unnamed: 0,EAP,HPL,MWS
0,4.663878e-66,3.689544e-67,2.962311e-62
1,2.561907e-195,8.552745e-203,5.857305e-205
2,3.879998e-113,1.820016e-110,4.388748e-118
3,1.821917e-132,7.366460e-132,8.216627e-141
4,2.627913e-35,2.576906e-37,6.739408e-37
5,2.568467e-115,1.788775e-117,2.469855e-120
6,4.349804e-39,1.469934e-40,5.345836e-41
7,3.477608e-93,6.007603e-92,3.401735e-88
8,4.145512e-84,1.322378e-89,2.775326e-96
9,2.636376e-27,1.499354e-29,9.058813e-29


In [9]:
# Attach each test sentence's id (pandas matches the two frames on their row
# index) and move the id to the first column.
ProbabilitiesByAuthor['id'] = test['id']
cols = ['id', 'EAP', 'HPL', 'MWS']
ProbabilitiesByAuthor = ProbabilitiesByAuthor.loc[:, cols]
ProbabilitiesByAuthor

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,4.663878e-66,3.689544e-67,2.962311e-62
1,id24541,2.561907e-195,8.552745e-203,5.857305e-205
2,id00134,3.879998e-113,1.820016e-110,4.388748e-118
3,id27757,1.821917e-132,7.366460e-132,8.216627e-141
4,id04081,2.627913e-35,2.576906e-37,6.739408e-37
5,id27337,2.568467e-115,1.788775e-117,2.469855e-120
6,id24265,4.349804e-39,1.469934e-40,5.345836e-41
7,id25917,3.477608e-93,6.007603e-92,3.401735e-88
8,id04951,4.145512e-84,1.322378e-89,2.775326e-96
9,id14549,2.636376e-27,1.499354e-29,9.058813e-29


In [10]:
# Total probability mass per test sentence: the three author columns added
# together across each row (despite the name, this is a per-row sum).
colsum = ProbabilitiesByAuthor[['EAP', 'HPL', 'MWS']].sum(axis = 1)
colsum

0        2.962815e-62
1       2.561907e-195
2       1.823896e-110
3       9.188377e-132
4        2.721076e-35
5       2.586379e-115
6        4.550256e-39
7        3.402371e-88
8        4.145526e-84
9        2.741958e-27
10      3.245322e-157
11      1.230083e-149
12      2.925327e-113
13       2.548473e-84
14       1.941551e-43
15       3.706083e-27
16       1.126633e-59
17      5.220365e-110
18       6.820671e-79
19       7.190671e-77
20       4.614457e-46
21      4.204624e-100
22       1.031940e-33
23       6.367071e-35
24      3.253810e-110
25       9.814805e-36
26       4.133833e-77
27       7.218466e-54
28      2.126694e-125
29      3.348197e-182
            ...      
8362     9.590787e-59
8363     5.864750e-59
8364     1.552295e-31
8365     3.313144e-60
8366    1.375593e-133
8367    1.568981e-122
8368     1.804837e-85
8369     2.167602e-88
8370    3.112201e-113
8371     3.122134e-31
8372     1.920823e-30
8373    2.161961e-120
8374     1.090039e-52
8375     2.228306e-58
8376    3.

In [11]:
# Work on an independent (deep) copy so the unscaled probabilities stay
# available in ProbabilitiesByAuthor.
ProbabilitiesScaled = ProbabilitiesByAuthor.copy(deep = True)
ProbabilitiesScaled

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,4.663878e-66,3.689544e-67,2.962311e-62
1,id24541,2.561907e-195,8.552745e-203,5.857305e-205
2,id00134,3.879998e-113,1.820016e-110,4.388748e-118
3,id27757,1.821917e-132,7.366460e-132,8.216627e-141
4,id04081,2.627913e-35,2.576906e-37,6.739408e-37
5,id27337,2.568467e-115,1.788775e-117,2.469855e-120
6,id24265,4.349804e-39,1.469934e-40,5.345836e-41
7,id25917,3.477608e-93,6.007603e-92,3.401735e-88
8,id04951,4.145512e-84,1.322378e-89,2.775326e-96
9,id14549,2.636376e-27,1.499354e-29,9.058813e-29


In [12]:
# Divide each author's probability by the per-sentence total so that the
# three author columns of every row sum to 1.
for author in ('EAP', 'HPL', 'MWS'):
    ProbabilitiesScaled[author] = ProbabilitiesByAuthor[author].div(colsum)
ProbabilitiesScaled

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,1.574138e-04,1.245284e-05,9.998301e-01
1,id24541,1.000000e+00,3.338430e-08,2.286307e-10
2,id00134,2.127314e-03,9.978727e-01,2.406249e-08
3,id27757,1.982850e-01,8.017150e-01,8.942413e-10
4,id04081,9.657624e-01,9.470173e-03,2.476744e-02
5,id27337,9.930743e-01,6.916137e-03,9.549470e-06
6,id24265,9.559471e-01,3.230442e-02,1.174843e-02
7,id25917,1.022113e-05,1.765711e-04,9.998132e-01
8,id04951,9.999968e-01,3.189891e-06,6.694750e-13
9,id14549,9.614941e-01,5.468190e-03,3.303776e-02


In [13]:
# Translate each winning row position into the author's initials:
# 0 -> EAP, 1 -> HPL, and anything else -> MWS.
authorByPosition = {0: 'EAP', 1: 'HPL'}
authorship = [authorByPosition.get(a, 'MWS') for a in winnerBySentence]

In [14]:
# Save the normalised per-author probabilities, without the row index column.
ProbabilitiesScaled.to_csv(
    path_or_buf = 'Data_Output/probabilities.csv',
    index = False,
)

In [15]:
# Record the predicted author next to each test sentence...
test['author'] = authorship

# ...and save the labelled test set, without the row index column.
test.to_csv(
    path_or_buf = 'Data_Output/test_with_author.csv',
    index = False,
)