In [1]:
import import_ipynb #allows access to import other notebooks
from PreprocessLib import process_file_doc

importing Jupyter notebook from PreprocessLib.ipynb


In [2]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec
# Load the previously trained embeddings from the working directory:
# five Word2Vec models (the four labelled corpora plus the test text)
# and a single Doc2Vec model.
clemModel, lukeModel, pastModel, paulModel, testModel = [
    Word2Vec.load(modelPath)
    for modelPath in (
        "clemModel.model",
        "lukeModel.model",
        "pastModel.model",
        "paulModel.model",
        "testModel.model",
    )
]
doc2VecModel = Doc2Vec.load("doc2vec.model")

In [3]:
def intoDict(model):
    """Return a dict mapping every vocabulary word of a W2V model to its vector.

    Parameters
    ----------
    model : object with a ``wv`` attribute
        ``model.wv`` must support ``model.wv[word]`` lookups and expose its
        vocabulary either as ``key_to_index`` or as ``vocab``.

    Returns
    -------
    dict
        ``{word: model.wv[word]}`` for every word in the vocabulary.
    """
    keyedVectors = model.wv
    # gensim 4.0 replaced KeyedVectors.vocab with key_to_index (accessing
    # .vocab now raises AttributeError); fall back to .vocab so the function
    # keeps working under gensim 3.x as well.
    vocabulary = getattr(keyedVectors, "key_to_index", None)
    if vocabulary is None:
        vocabulary = keyedVectors.vocab
    return {word: keyedVectors[word] for word in vocabulary}

In [4]:
def createVec(dct, wordList, label="NA"):
    """Gather the entries of ``dct`` for each word in ``wordList`` into a list.

    Parameters
    ----------
    dct : mapping
        Lookup from word to its vector; every word in ``wordList`` must be a
        key, otherwise the lookup raises ``KeyError``.
    wordList : iterable
        Words whose vectors are collected, in order.
    label : optional
        Appended as the final list element unless it is the sentinel "NA".

    Returns
    -------
    list
        One entry per word, followed by ``label`` when one was supplied.
    """
    collected = [dct[token] for token in wordList]
    return collected + [label] if label != "NA" else collected

In [5]:
# Word -> vector lookups, one per loaded Word2Vec model
clemDict, lukeDict, pastDict, paulDict, testDict = map(
    intoDict, (clemModel, lukeModel, pastModel, paulModel, testModel)
)

# Words whose vectors make up the features; createVec looks each one up
# directly, so every word has to be present in every model's vocabulary.
commonWordList = ['θεοῦ', 'ἡμῶν', 'τὰς', 'ἵνα', 'αὐτοῦ', 'σου', 'κύριος', 'τοῦτο']

# Labelled vector lists for the four known corpora ...
clemVec = createVec(clemDict, commonWordList, label='Clement')
lukeVec = createVec(lukeDict, commonWordList, label='Luke')
pastVec = createVec(pastDict, commonWordList, label='Pastoral')
paulVec = createVec(paulDict, commonWordList, label='Paul')
# ... and an unlabelled one for the text under test
testVec = createVec(testDict, commonWordList)

In [6]:
import numpy as np
# Read and preprocess the test text. The context manager closes the file as
# soon as process_file_doc has consumed it (it was previously left open).
with open("./finalTexts/Hebrews.txt", "r", encoding="utf8") as testFile:
    test = process_file_doc(testFile)

#prepares D2V for concatenation
# Append each corpus' trained document vector to its feature list.
# NOTE(review): assumes Doc2Vec tags 0-3 were assigned in the order
# Clement, Luke, Pastoral, Paul -- confirm against the training notebook.
# NOTE: these appends mutate lists created in an earlier cell, so re-running
# this cell appends the document vectors a second time.
clemVec.append(doc2VecModel[0])
lukeVec.append(doc2VecModel[1])
pastVec.append(doc2VecModel[2])
paulVec.append(doc2VecModel[3])

# The test text was not seen in training, so its document vector is inferred
# from its whitespace-separated tokens.
docVecs = {'Test': doc2VecModel.infer_vector(test.split())}
testXDoc = np.array(docVecs["Test"])
testVec.append(testXDoc)

In [7]:
# Build the training set (one sample per labelled corpus) and the test sample.
# NOTE(review): each *Vec list holds the 8 word vectors of commonWordList
# followed by the label and/or the Doc2Vec vector, so the [:6] slice keeps
# only the first six word vectors -- the last two words, the label and the
# document vector never reach the classifier. Confirm this is intended.
trainX = np.array(
    [corpusVec[:6] for corpusVec in (clemVec, lukeVec, pastVec, paulVec)]
)
trainY = [
    "Clement",
    "Luke",
    "Pastoral",
    "Paul",
]
testX = np.array(testVec[:6])

In [8]:
# Flatten each 2-D training sample into a single row, giving the
# (samples, features) layout that the scikit-learn estimators are fitted on.
nsamples, nx, ny = trainX.shape
trainXReshape = trainX.reshape(nsamples, nx * ny)

In [9]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with one hidden layer of 4 units; random_state fixes
# the seed so repeated runs give the same fit.
clf = MLPClassifier(
    hidden_layer_sizes=(4,),
    max_iter=4000,
    random_state=1,
).fit(trainXReshape, trainY)

In [10]:
#MLP Results
# Flatten the test sample into one row. -1 lets NumPy infer the feature count
# instead of hard-coding the slice length (previously testX.shape[1]*6), which
# also matches the reshape used for the logistic-regression prediction.
testXReshape = testX.reshape(1, -1)
# Print the class order so the probability columns below can be read off
print(clf.classes_)
clf.predict_proba(testXReshape)

['Clement' 'Luke' 'Pastoral' 'Paul']


array([[0.10212029, 0.0048162 , 0.89151037, 0.00155308]], dtype=float32)

In [12]:
#Logistic Regression results
from sklearn.linear_model import LogisticRegression
# Fit a default logistic-regression baseline on the same flattened features.
# NOTE: this rebinds `clf`, replacing the MLP classifier from the earlier
# cell; re-running the MLP results cell afterwards would use this model.
clf = LogisticRegression().fit(trainXReshape, trainY)
print("Logistic Regression")
print(clf.classes_)
# Class probabilities for the single flattened test sample
x = clf.predict_proba(testX.reshape(1, -1))
print(x)

Logistic Regression
['Clement' 'Luke' 'Pastoral' 'Paul']
[[0.35007648 0.11171991 0.3611005  0.17710311]]
