In [1]:
import import_ipynb #allows access to import other notebooks
from PreprocessLib import cleanText
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec

importing Jupyter notebook from PreprocessLib.ipynb


In [2]:
#loads models
# The five Word2Vec models are read in one pass; the file names are relative
# to the notebook's working directory and are unpacked in listing order.
longModel, markModel, mathModel, ignatModel, testModel = [
    Word2Vec.load(modelPath)
    for modelPath in (
        "longModel.model",
        "markModel.model",
        "mathModel.model",
        "ignatModel.model",
        "testModel.model",
    )
]
# The Doc2Vec model is stored separately and loaded last.
doc2VecModel = Doc2Vec.load("doc2vec.model")

In [3]:
#takes in a W2V model and returns the associated vocabulary of the model
def intoDict (model):
    """Return a {word: vector} dictionary for every word known to `model`.

    Parameters
    ----------
    model : object exposing a `wv` attribute (e.g. a gensim Word2Vec model)
        `model.wv` must support `wv[word]` lookups and expose its vocabulary
        either as `key_to_index` (gensim >= 4.0) or as `vocab` (gensim 3.x).

    Returns
    -------
    dict
        Maps each vocabulary word to `model.wv[word]`.
    """
    wv = model.wv
    # gensim 4.0 replaced `wv.vocab` with `wv.key_to_index`; on 4.x touching
    # `vocab` raises, so prefer the new attribute and only fall back to the
    # old one when the new one does not exist.
    words = getattr(wv, "key_to_index", None)
    if words is None:
        words = wv.vocab
    # Iterating either mapping yields the vocabulary words themselves.
    return {word: wv[word] for word in words}

In [4]:
#builds a feature list from dictionary "dct" for the words in "wordList", with an optional trailing label
def createVec(dct, wordList, label="NA"):
    """Collect `dct[word]` for each word of `wordList`, in order.

    Parameters
    ----------
    dct : mapping from word to that word's vector.
    wordList : iterable of words; every word must be a key of `dct`
        (a missing word raises KeyError).
    label : value appended after the vectors; the default "NA" means
        "no label" and nothing is appended.

    Returns
    -------
    list
        One entry per word, followed by `label` when it is not "NA".
    """
    features = [dct[token] for token in wordList]
    # "NA" is the sentinel for an unlabelled vector.
    if label == "NA":
        return features
    features.append(label)
    return features

In [5]:
#gets all words shared by every vocabulary
def getAllSimWords(reference=None, others=None):
    """Return the words of `reference` that also occur in every mapping of `others`.

    Parameters
    ----------
    reference : mapping or iterable of words, optional
        Vocabulary that is scanned; the result keeps its iteration order.
        Defaults to the module-level `testDict`.
    others : iterable of containers, optional
        Each must support `word in container`. Defaults to the module-level
        `(longDict, markDict, mathDict, ignatDict)`.

    Returns
    -------
    list
        Words present in `reference` and in all of `others`.

    Notes
    -----
    Called without arguments it behaves as before and reads the globals at
    call time, so those dictionaries must already exist in the notebook.
    """
    if reference is None:
        reference = testDict
    if others is None:
        others = (longDict, markDict, mathDict, ignatDict)
    # Materialise so a one-shot iterator can be re-checked for every word.
    others = tuple(others)
    return [word for word in reference if all(word in vocab for vocab in others)]

In [6]:
#creating feature vectors of the models
# One {word: vector} lookup per Word2Vec model, unpacked in listing order.
longDict, markDict, mathDict, ignatDict, testDict = [
    intoDict(w2v)
    for w2v in (longModel, markModel, mathModel, ignatModel, testModel)
]
# commonWordList is computed by getAllSimWords(); the original note here says
# the words were selected from FrequencyAnalysis.ipynb results -- TODO confirm
# which description is current.
commonWordList = getAllSimWords()
# Labelled vectors for the four training texts ...
longVec, markVec, mathVec, ignatVec = [
    createVec(lookup, commonWordList, author)
    for lookup, author in (
        (longDict, 'Longus'),
        (markDict, 'Mark'),
        (mathDict, 'Mathew'),
        (ignatDict, 'Ignatius'),
    )
]
# ... and an unlabelled one for the text to classify.
testVec = createVec(testDict, commonWordList)

In [7]:
def process_file_doc(a_file):
    """Read an open text file, flatten it to one line, and clean it.

    Every line has its trailing whitespace (including the newline) stripped
    and a single space appended; the pieces are concatenated and handed to
    `cleanText`.

    Parameters
    ----------
    a_file : iterable of str with a `close()` method (e.g. an open text file)
        The handle is always closed before returning, even when reading
        fails, so the caller must not use it afterwards.

    Returns
    -------
    Whatever `cleanText(text, False)` returns.
    """
    try:
        # Build the text in a single join rather than repeated `+=`.
        string_without_line_breaks = "".join(line.rstrip() + " " for line in a_file)
    finally:
        # Release the handle even if iteration raised (e.g. a decoding error).
        a_file.close()

    # The meaning of the False flag is defined in PreprocessLib and is not visible here.
    cleanedFile = cleanText(string_without_line_breaks, False) #cleanText is found in PreprocessLib.ipynb
    return cleanedFile

In [8]:
import numpy as np
# Read and clean the text to classify; process_file_doc closes the handle.
testFile = open("./other-sources/Ignatius-3books.txt", "r", encoding="utf8")
test = process_file_doc(testFile)

# Append the stored document vectors with tags 0..3 to longVec, markVec,
# mathVec and ignatVec respectively (presumably the order the Doc2Vec model
# was trained in -- TODO confirm).
# NOTE: these appends mutate the lists, so re-running this cell appends again.
for _authorVec, _docTag in zip((longVec, markVec, mathVec, ignatVec), range(4)):
    _authorVec.append(doc2VecModel[_docTag])

# Infer a document vector for the unseen text and append it to testVec.
docVecs = {'Test': doc2VecModel.infer_vector(test.split())}
testXDoc = np.array(docVecs["Test"])
testVec.append(testXDoc)

In [9]:
import numpy as np
# Keep only the first len(commonWordList) entries of each list, i.e. the word
# vectors. NOTE(review): this slice leaves out the label and the Doc2Vec
# vector appended afterwards, so the Doc2Vec features never reach the
# classifiers -- confirm that is intended.
trainX = np.array([
    authorVec[:len(commonWordList)]
    for authorVec in (longVec, markVec, mathVec, ignatVec)
])
# Labels in the same order as the rows of trainX.
trainY = ['Longus', 'Mark', "Mathew", "Ignatius"]
testX = np.array(testVec[:len(commonWordList)])

In [10]:
#prints the average result of the MLP classifier and LogisticRegression classifier over `iterations` fits
def runIteration(iterations):
    """Fit both classifiers `iterations` times and print the mean probability.

    Reads the module-level `trainX`, `trainY`, `testX` and `commonWordList`.
    On every pass an MLPClassifier and a LogisticRegression are fitted on the
    flattened training matrix and `predict_proba(...)[0][0]` for the test
    sample is accumulated.

    Parameters
    ----------
    iterations : int
        Number of fit/predict rounds; must be at least 1.

    Raises
    ------
    ValueError
        If `iterations` is smaller than 1 (the averages would be undefined).

    Notes
    -----
    Column 0 of `predict_proba` belongs to the first entry of `clf.classes_`.
    scikit-learn keeps `classes_` sorted, so with the `trainY` labels used in
    this notebook that should be 'Ignatius' -- TODO confirm.
    Nothing is returned; the two averages are printed.
    """
    # Imported once per call instead of once per loop pass.
    from sklearn.linear_model import LogisticRegression
    from sklearn.neural_network import MLPClassifier

    if iterations < 1:
        raise ValueError("iterations must be a positive integer")

    # Loop-invariant: flatten (samples, words, dims) to (samples, words*dims)
    # and the test sample to a single row of the same width.
    nsamples, nx, ny = trainX.shape
    trainXReshape = trainX.reshape((nsamples, nx * ny))
    testXReshape = testX.reshape(1, testX.shape[1] * len(commonWordList))

    av = 0
    av2 = 0
    for _ in range(iterations):
        clf = MLPClassifier(max_iter=8000, hidden_layer_sizes=(4,)).fit(trainXReshape, trainY)
        av += clf.predict_proba(testXReshape)[0][0]

        clf = LogisticRegression()
        clf.fit(trainXReshape, trainY)
        av2 += clf.predict_proba(testXReshape)[0][0]
    print("Average MLP: ", av/iterations)
    print("Average Log: ", av2/iterations)

In [15]:
#vector size 5 -- NOTE(review): presumably the vector size the loaded models were trained with; confirm
#averages the classifier results over 20 fits (see runIteration)
#NOTE(review): this cell's execution count (15) is higher than the next cell's (11),
#so the two recorded outputs come from different kernel states
runIteration(20)

Average MLP:  0.8718874454498291
Average Log:  0.8663758617054744


In [11]:
#vector size 100 -- NOTE(review): presumably the vector size the loaded models were trained with; confirm
#averages the classifier results over 20 fits (see runIteration)
#NOTE(review): execution count 11 follows a cell numbered 15, so this output was
#not produced by a top-to-bottom run of the notebook
runIteration(20)

Average MLP:  0.9130725175142288
Average Log:  0.2956156591941414
