In [7]:
import import_ipynb #allows access to import other notebooks
from PreprocessLib import cleanText, removeSingleLists, process_file, process_file_doc

In [2]:
#processes files for Doc2Vec
clemFile = open("./finalTexts/clement.txt", "r", encoding="utf8")
lukeFile = open("./finalTexts/luke.txt", "r", encoding="utf8")
pastFile = open("./finalTexts/pastoral.txt", "r", encoding="utf8")
paulFile = open("./finalTexts/paul.txt", "r", encoding="utf8")
testFile = open("./finalTexts/hebrews.txt", "r", encoding="utf8")
clem = process_file_doc(clemFile)
luke = process_file_doc(lukeFile)
past = process_file_doc(pastFile)
paul = process_file_doc(paulFile)
test = process_file_doc(testFile)

In [3]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#places the texts in a list with tags for each author
docs = []
docs.append(TaggedDocument(words=clem, tags=["Clement"]))
docs.append(TaggedDocument(words=luke, tags=["Luke"]))
docs.append(TaggedDocument(words=past, tags=["Pastoral"]))
docs.append(TaggedDocument(words=paul, tags=["Paul"]))

In [5]:
#builds model and computes cosine similarity 20 times and returns average results 
def runIteration():
    avCl = 0
    avLu = 0
    avPa = 0
    avPu = 0
    for i in range(20):
        import gensim
        from scipy import spatial
        model = gensim.models.doc2vec.Doc2Vec(alpha=0.015, min_count=1, vector_size = 10, epochs=10, dm=0)
        model.build_vocab(docs)
        model.train(docs, total_examples=model.corpus_count, epochs=100)
        docVecs = {}
        docVecs['Test'] = model.infer_vector(test.split())
        res = model.docvecs.most_similar(positive=[docVecs['Test']])
        avCl += res[0][1]
        avLu += res[1][1]
        avPa += res[2][1]
        avPu += res[3][1]
    print(avCl/20)
    print(avLu/20)
    print(avPa/20)
    print(avPu/20)

In [6]:
runIteration()

0.9535793513059616
0.7890192091464996
0.7799270927906037
0.285723502933979


In [30]:
import gensim
from scipy import spatial
#builds model with given hyperparameters and infers vector iterations times
#retuns average result of each author
def runAnalysis(alph, DM, epoch, minCount, step, vectorSize, iterations):
    model = gensim.models.doc2vec.Doc2Vec(min_count=minCount, alpha=alph,
                epochs=epoch, dm=DM, vector_size=vectorSize)
    model.build_vocab(docs)
    model.train(docs, total_examples=model.corpus_count, epochs=1000)
    avCl = 0
    avLu = 0
    avPa = 0
    avPu = 0

    for i in range(iterations):
        docVecs = {}
        docVecs['Test'] = model.infer_vector(test.split(), alpha=alph, epochs=epoch, steps=step)
        res = model.docvecs.most_similar(positive=[docVecs['Test']])
        avCl += res[0][1]
        avLu += res[1][1]
        avPa += res[2][1]
        avPu += res[3][1]
    print(avCl/iterations)
    print(avLu/iterations)
    print(avPa/iterations)
    print(avPu/iterations)
   # model.save("doc2vec.model")

In [31]:
runAnalysis(0.015, 0, 10, 1, 10, 10, 20) #saved model for concat

0.7377914637327194
0.7246671885251998
0.7090859025716781
0.44735039919614794


In [32]:
runAnalysis(0.015, 0, 10, 1, 20, 10, 20)

0.7346398890018463
0.7234736919403076
0.7132363736629486
0.4496680721640587


In [33]:
runAnalysis(0.015, 0, 10, 1, 20, 100, 20)

0.7535357385873794
0.7363335102796554
0.6885487884283066
0.37157949060201645


In [19]:
import numpy as np
from sklearn.neural_network import MLPClassifier
#builds model and infers vector
#returns average of LogReg, linearSVC, and MLPClassifier results per author
def runMLAnalysis():
    avCl1 = 0
    avLu1 = 0
    avPa1 = 0
    avPu1 = 0
    avCl2 = 0
    avLu2 = 0
    avPa2 = 0
    avPu2 = 0
    avCl3 = 0
    avLu3 = 0
    avPa3 = 0
    avPu3 = 0
    iterations = 40

    for i in range(iterations):
        model = gensim.models.doc2vec.Doc2Vec()
        model.build_vocab(docs)
        model.train(docs, total_examples=model.corpus_count, epochs=300)
        trainX = np.array([model.docvecs[0], model.docvecs[1], model.docvecs[2], model.docvecs[3]])
        trainY = ["Clement", "Luke", "Pastoral", "Paul"]
        docVecs = {}
        docVecs['Test'] = model.infer_vector(test.split())
        testX = np.array(docVecs["Test"])
        clf = MLPClassifier(max_iter=8000, hidden_layer_sizes=(4,)).fit(trainX, trainY)
        x = clf.predict_proba(testX.reshape(1, -1))
        print("MLP")
        print(clf.classes_)
        print(x)
        avCl1 += x[0][0]
        avLu1 += x[0][1]
        avPa1 += x[0][2]
        avPu1 += x[0][3]
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression()
        clf.fit(trainX, trainY)
        print("Logistic Regression")
        print(clf.classes_)
        x = clf.predict_proba(testX.reshape(1, -1))
        print(x)
        avCl2 += x[0][0]
        avLu2 += x[0][1]
        avPa2 += x[0][2]
        avPu2 += x[0][3]

        from sklearn.svm import LinearSVC
        clf = LinearSVC()
        clf.fit(trainX, trainY)
        x = clf.predict(testX.reshape(1,-1))
        print("LinearSVC")
        print(x[0])
        if (x[0] == "Clement"):
            avCl3 += 1
        elif (x[0] == "Luke"):
            avLu3 += 1
        elif (x[0] == "Pastoral"):
            avPa3 += 1
        elif (x[0] == "Paul"):
            avPu3 += 1
        print("----------------------------")

    print("Averages MLP: ", avCl1/iterations, avLu1/iterations, avPa1/iterations, avPu1/iterations)
    print("Averages LogReg: ", avCl2/iterations, avLu2/iterations, avPa2/iterations, avPu2/iterations)
    print("SVC Accuracy: ", avCl3/iterations, avLu3/iterations, avPa3/iterations, avPu3/iterations)

In [20]:
#results for using entire document as vector
runMLAnalysis()

MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[1.5561669e-03 9.9825567e-01 3.4474002e-08 1.8810559e-04]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.03072022 0.17671939 0.08244155 0.71011884]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[2.5006371e-02 3.5128101e-05 2.3202098e-05 9.7493523e-01]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.02988797 0.1595656  0.05043158 0.76011485]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[8.2892576e-10 5.7428000e-09 1.0000000e+00 2.0011221e-08]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.03210174 0.16307837 0.08143258 0.72338731]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[8.4093163e-06 1.2473128e-09 9.9999154e-01 4.3681403e-08]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.03174078 0.17378524 0.08415551 0.71031847]]
LinearSVC
Paul
----------------------------


MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[2.2260990e-04 9.4564664e-01 8.3473790e-03 4.5783311e-02]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.02176374 0.26578705 0.08229147 0.63015773]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[1.7496401e-03 8.4465668e-03 2.1857767e-10 9.8980379e-01]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.04450193 0.17262441 0.08102394 0.70184972]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.0676578  0.41832736 0.14983617 0.36417863]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.02382359 0.13825599 0.07962846 0.75829197]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[2.3713554e-03 6.5257278e-05 9.9753463e-01 2.8798835e-05]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.03584606 0.19085759 0.07915643 0.69413992]]
LinearSVC
Paul
----------------------------
MLP
['Clemen

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#takes in a text "txt", a label "label", D2V docs, and a number of sentences "sentenceNum"
#returns the text in split-up format of sentenceNum sentences.
def sliceCorpustoPara(txt, label, docs, sentenceNum):
    tmpList = ""
    finalList = []
    j = 0
    for i in range(len(txt)):
        tmpList += txt[i] + " "
        if (j == sentenceNum):
            finalList.append(tmpList)
            docs.append(TaggedDocument(words=tmpList, tags=[label]))
            tmpList = ""
            j = 0
        j += 1
    if len(tmpList):
        docs.append(TaggedDocument(words=tmpList, tags=[label]))
        finalList.append(tmpList)
    return finalList

In [None]:
#returns the file as a long string
def stringify(a_file):
    string_without_line_breaks = ""
    for line in a_file:
        stripped_line = line.rstrip() + " "
        string_without_line_breaks += stripped_line
    a_file.close()
    sentences = string_without_line_breaks
    return sentences

clemFile = open("./finalTexts/clement.txt", "r", encoding="utf8")
lukeFile = open("./finalTexts/luke.txt", "r", encoding="utf8")
pastFile = open("./finalTexts/pastoral.txt", "r", encoding="utf8")
paulFile = open("./finalTexts/paul.txt", "r", encoding="utf8")
testFile = open("./finalTexts/hebrews.txt", "r", encoding="utf8")

clem = stringify(clemFile)
luke = stringify(lukeFile)
past = stringify(pastFile)
paul = stringify(paulFile)
test = stringify(testFile)
from nltk import tokenize
#sentence tokenizes the text
clemSent = tokenize.sent_tokenize(clem)
lukeSent = tokenize.sent_tokenize(luke)
pastSent = tokenize.sent_tokenize(past)
paulSent = tokenize.sent_tokenize(paul)
testSent = tokenize.sent_tokenize(test)
#prepares text for paragraph level D2V
docs=[]
sliceCorpustoPara(clemSent, 'Clement', docs, 5)
sliceCorpustoPara(lukeSent, 'Luke', docs, 5)
sliceCorpustoPara(pastSent, 'Paul', docs, 5)
sliceCorpustoPara(paulSent, 'Pastoral', docs, 5)
tmp = []
testPara = sliceCorpustoPara(testSent, "Test", tmp, 5)

In [21]:
#results for using paragraph vectors
runMLAnalysis()

MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[3.0592497e-11 1.6456465e-06 4.2400482e-01 5.7599360e-01]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.02506822 0.13220866 0.07404374 0.76867937]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[3.2546414e-08 1.0134559e-05 9.7070777e-01 2.9282080e-02]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.02228136 0.19826898 0.11138567 0.668064  ]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[9.041655e-03 3.841118e-06 3.179148e-03 9.877753e-01]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.03248658 0.15421763 0.08685301 0.72644277]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[1.5126250e-04 9.9849892e-01 1.3660445e-05 1.3360759e-03]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.02966456 0.1814123  0.10804194 0.6808812 ]]
LinearSVC
Paul
----------------------------
MLP


MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[9.6309304e-01 4.7787607e-06 2.2697785e-13 3.6902126e-02]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.02472175 0.08990886 0.05327177 0.83209761]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.691042   0.10914396 0.09572291 0.10409116]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.02406948 0.17510449 0.0484117  0.75241433]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[5.1386887e-04 1.5106522e-01 1.6007487e-01 6.8834609e-01]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.0206741  0.08875201 0.0380557  0.85251819]]
LinearSVC
Paul
----------------------------
MLP
['Clement' 'Luke' 'Pastoral' 'Paul']
[[2.4005644e-09 8.7765658e-01 7.3575023e-08 1.2234338e-01]]
Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.05363413 0.19840056 0.11086305 0.63710227]]
LinearSVC
Paul
----------------------------
MLP
['Clemen