In [1]:
# read in some helpful libraries
import nltk # the natural language toolkit, open-source NLP
import pandas as pd
import numpy as np 

In [2]:
# Load the training split into a dataframe. The file is read as latin-1.
texts = pd.read_csv(
    "Data_Raw/train_90pct.csv",
    encoding="latin-1",
)

# preview the first few rows of the training data
texts.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Load the evaluation split into a dataframe. Note that it comes from the
# "train_10pct" file, so (as the previewed rows show) it still carries an
# `author` column to check predictions against.
test = pd.read_csv(
    "Data_Raw/train_10pct.csv",
    encoding="latin-1",
)

# preview the first few rows of the evaluation data
test.head()

Unnamed: 0,id,text,author
0,id22106,"When I had finished, I felt satisfied that all...",EAP
1,id08387,It is also written that they descended one nig...,HPL
2,id23794,The ship and all in it are imbued with the spi...,EAP
3,id16925,"We had no means of calculating time, nor could...",EAP
4,id17295,"""I want whiskey good old fashioned rye"" exclai...",HPL


In [4]:
# Group the training sentences by their author label.
byAuthor = texts.groupby("author")

# author -> every sentence by that author, joined into one lower-cased string
sentence_dict = {}

# author -> word tokens of that joined string
token_dict = {}

for author, author_rows in byAuthor:
    # Join all of the author's sentences with single spaces, then lower-case
    # the result so that "The" and "the" are counted as the same word.
    combined_text = author_rows['text'].str.cat(sep=' ').lower()
    sentence_dict[author] = combined_text

    # Split the combined text into individual tokens (words).
    token_dict[author] = nltk.tokenize.word_tokenize(combined_text)

In [5]:
# Per-author word frequencies: each author key maps to an nltk.FreqDist
# built from that author's training tokens.
wordFreqByAuthor = nltk.probability.ConditionalFreqDist()

for author, author_tokens in token_dict.items():
    # count every token this author used and store the distribution
    wordFreqByAuthor[author] = nltk.FreqDist(author_tokens)

In [6]:
# Assign each test sentence a probability per author, indicating how likely
# it is that the author wrote the sentence (product of the author's smoothed
# word frequencies over the words of the sentence).
#
# Names produced for the following cells:
#   ProbabilitiesByAuthor : DataFrame with one row per author (row positions
#                           follow the key order of wordFreqByAuthor) and one
#                           column per test sentence (labelled by sentence
#                           position) holding the joint probability.
#   winnerBySentence      : list with, for every test sentence, the row
#                           position of the most likely author.

# very small amount added to every word frequency so none of them are 0
SMOOTHING = 0.000001

# fix the author order once; row positions below refer to this list
authors = list(wordFreqByAuthor.keys())

# one length-len(authors) array of joint probabilities per test sentence
jointProbabilityColumns = []

winnerBySentence = []

# Iterate over the sentence values directly, so the loop does not rely on
# `test` having a default 0..n-1 index.
for testSentence in test['text']:

    # lowercase & tokenize sentence
    preProcessedTestSentence = nltk.tokenize.word_tokenize(testSentence.lower())

    # rows = authors, columns = words of this sentence; each entry is how
    # frequently the author used the word, plus the smoothing constant
    smoothedWordFreqs = np.array(
        [[wordFreqByAuthor[author].freq(word) + SMOOTHING
          for word in preProcessedTestSentence]
         for author in authors],
        dtype=float,
    ).reshape(len(authors), len(preProcessedTestSentence))

    # joint probability that each author wrote the sentence
    jointProbabilityColumns.append(smoothedWordFreqs.prod(axis=1))

    # Pick the winner from the summed log-frequencies rather than from the
    # raw product: for long sentences the product can underflow to 0.0 for
    # every author, which would turn the comparison into a tie that always
    # goes to the first author. Ties still resolve to the first author.
    logJointProbability = np.log(smoothedWordFreqs).sum(axis=1)
    winnerBySentence.append(int(np.argmax(logJointProbability)))

# Build the result table once instead of growing dataframes inside the loop
# (DataFrame.append no longer exists in pandas 2.x and was quadratic anyway).
if jointProbabilityColumns:
    ProbabilitiesByAuthor = pd.DataFrame(np.column_stack(jointProbabilityColumns))
else:
    ProbabilitiesByAuthor = pd.DataFrame()

In [7]:
# Replace the row labels 0/1/2 with author codes, then flip the table so
# that every row is a test sentence and every column is an author.
# NOTE(review): the mapping presumes rows 0, 1, 2 are EAP, HPL, MWS in that
# order - confirm against the key order of wordFreqByAuthor.
authorLabels = {0: 'EAP', 1: 'HPL', 2: 'MWS'}
ProbabilitiesByAuthor = ProbabilitiesByAuthor.rename(index=authorLabels).T
ProbabilitiesByAuthor

Unnamed: 0,EAP,HPL,MWS
0,1.556514e-33,9.020108e-34,4.881226e-33
0,3.478730e-83,3.735469e-77,6.062002e-82
0,1.175920e-36,4.806128e-39,4.951351e-37
0,5.622411e-51,1.190190e-51,3.872460e-51
0,3.262915e-51,4.432366e-48,2.616982e-52
0,2.169763e-146,3.264619e-148,3.115660e-155
0,3.374671e-80,5.216576e-83,2.115305e-75
0,3.640671e-115,1.623041e-106,2.812996e-111
0,6.384096e-131,1.493511e-132,1.300575e-115
0,5.427950e-124,7.179013e-117,9.238810e-132


In [8]:
# Throw away the existing row labels and renumber the rows 0..n-1.
ProbabilitiesByAuthor = ProbabilitiesByAuthor.reset_index(drop=True)
ProbabilitiesByAuthor

Unnamed: 0,EAP,HPL,MWS
0,1.556514e-33,9.020108e-34,4.881226e-33
1,3.478730e-83,3.735469e-77,6.062002e-82
2,1.175920e-36,4.806128e-39,4.951351e-37
3,5.622411e-51,1.190190e-51,3.872460e-51
4,3.262915e-51,4.432366e-48,2.616982e-52
5,2.169763e-146,3.264619e-148,3.115660e-155
6,3.374671e-80,5.216576e-83,2.115305e-75
7,3.640671e-115,1.623041e-106,2.812996e-111
8,6.384096e-131,1.493511e-132,1.300575e-115
9,5.427950e-124,7.179013e-117,9.238810e-132


In [9]:
# Attach the test sentence ids (matched to rows by index label) and put the
# `id` column in front of the three author columns.
cols = ['id', 'EAP', 'HPL', 'MWS']
ProbabilitiesByAuthor = ProbabilitiesByAuthor.assign(id=pd.Series(test['id']))[cols]
ProbabilitiesByAuthor

Unnamed: 0,id,EAP,HPL,MWS
0,id22106,1.556514e-33,9.020108e-34,4.881226e-33
1,id08387,3.478730e-83,3.735469e-77,6.062002e-82
2,id23794,1.175920e-36,4.806128e-39,4.951351e-37
3,id16925,5.622411e-51,1.190190e-51,3.872460e-51
4,id17295,3.262915e-51,4.432366e-48,2.616982e-52
5,id10055,2.169763e-146,3.264619e-148,3.115660e-155
6,id18719,3.374671e-80,5.216576e-83,2.115305e-75
7,id12428,3.640671e-115,1.623041e-106,2.812996e-111
8,id12742,6.384096e-131,1.493511e-132,1.300575e-115
9,id27229,5.427950e-124,7.179013e-117,9.238810e-132


In [10]:
# Per-sentence total of the three author probabilities (a sum across the
# columns of each row, despite the name); used later as the normaliser.
colsum = ProbabilitiesByAuthor[['EAP', 'HPL', 'MWS']].sum(axis=1)
colsum

0        7.339751e-33
1        3.735534e-77
2        1.675862e-36
3        1.068506e-50
4        4.435891e-48
5       2.202410e-146
6        2.115339e-75
7       1.623069e-106
8       1.300575e-115
9       7.179014e-117
10       9.966606e-36
11       2.751027e-61
12       4.314549e-44
13       1.440690e-93
14       1.718779e-36
15      5.319711e-146
16      3.825181e-225
17       1.350418e-49
18      1.420324e-143
19       1.616896e-33
20       4.693146e-87
21       1.926489e-68
22      1.379761e-139
23       3.071838e-30
24       3.696314e-45
25       7.326954e-82
26       3.191701e-26
27      3.889572e-100
28       7.643288e-62
29      8.978034e-140
            ...      
1970     7.296260e-67
1971     1.409046e-37
1972    1.447428e-116
1973     1.366731e-53
1974     5.828645e-98
1975     2.023971e-75
1976     1.398862e-80
1977    5.831934e-168
1978     1.351445e-32
1979    1.267612e-113
1980     8.245125e-80
1981     1.569022e-54
1982    4.306975e-169
1983     5.202840e-57
1984     3

In [11]:
# Start the scaled table as an independent copy, so the unscaled
# probabilities in ProbabilitiesByAuthor are left untouched.
ProbabilitiesScaled = ProbabilitiesByAuthor.copy(deep=True)
ProbabilitiesScaled

Unnamed: 0,id,EAP,HPL,MWS
0,id22106,1.556514e-33,9.020108e-34,4.881226e-33
1,id08387,3.478730e-83,3.735469e-77,6.062002e-82
2,id23794,1.175920e-36,4.806128e-39,4.951351e-37
3,id16925,5.622411e-51,1.190190e-51,3.872460e-51
4,id17295,3.262915e-51,4.432366e-48,2.616982e-52
5,id10055,2.169763e-146,3.264619e-148,3.115660e-155
6,id18719,3.374671e-80,5.216576e-83,2.115305e-75
7,id12428,3.640671e-115,1.623041e-106,2.812996e-111
8,id12742,6.384096e-131,1.493511e-132,1.300575e-115
9,id27229,5.427950e-124,7.179013e-117,9.238810e-132


In [12]:
# Divide each author's probability by the per-sentence total so that the
# three author columns of every row sum to 1.
# NOTE(review): a row whose total is 0 would yield NaN here - confirm whether
# that can happen for very long sentences.
for authorColumn in ['EAP', 'HPL', 'MWS']:
    ProbabilitiesScaled[authorColumn] = ProbabilitiesByAuthor[authorColumn] / colsum
ProbabilitiesScaled

Unnamed: 0,id,EAP,HPL,MWS
0,id22106,2.120663e-01,1.228939e-01,6.650397e-01
1,id08387,9.312539e-07,9.999828e-01,1.622794e-05
2,id23794,7.016811e-01,2.867855e-03,2.954510e-01
3,id16925,5.261936e-01,1.113882e-01,3.624181e-01
4,id17295,7.355716e-04,9.992054e-01,5.899563e-05
5,id10055,9.851771e-01,1.482294e-02,1.414660e-09
6,id18719,1.595334e-05,2.466071e-08,9.999840e-01
7,id12428,2.243079e-09,9.999827e-01,1.733134e-05
8,id12742,4.908673e-16,1.148347e-17,1.000000e+00
9,id27229,7.560857e-08,9.999999e-01,1.286919e-15


In [13]:
# Translate every winning row position into an author code:
# 0 -> 'EAP', 1 -> 'HPL', anything else -> 'MWS'.
authorship = [
    'EAP' if a == 0 else ('HPL' if a == 1 else 'MWS')
    for a in winnerBySentence
]

In [14]:
# Save the scaled per-sentence author probabilities; row labels are not written.
ProbabilitiesScaled.to_csv(
    path_or_buf='Data_Output/check_probabilities.csv',
    index=False,
)

In [15]:
# Store the predicted author next to each test sentence (adds a column to
# `test` in place), then save the table without row labels.
test['author_assigned'] = authorship

test.to_csv(
    path_or_buf='Data_Output/check_with_author.csv',
    index=False,
)