In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import eli5
import warnings
import numpy as np
warnings.filterwarnings("ignore")



In [2]:
BASE_DIR = "~/Documents/GitHub/bible_passage_prediction/dataset/"
df = pd.read_csv(BASE_DIR + "kjvdata.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,book,chapter,verse,text
0,0,1001001,Genesis,1,1,In the beginning God created the heaven and th...
1,1,1001002,Genesis,1,2,"And the earth was without form, and void; and ..."
2,2,1001003,Genesis,1,3,"And God said, Let there be light: and there wa..."
3,3,1001004,Genesis,1,4,"And God saw the light, that it was good: and G..."
4,4,1001005,Genesis,1,5,"And God called the light Day, and the darkness..."


In [4]:
df.columns
df['book'].unique()

array(['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
       'Joshua', 'Judges', 'Ruth', '1 Samuel (1 Kings)',
       '2 Samuel (2 Kings)', '1 Kings (3 Kings)', '2 Kings (4 Kings)',
       '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah', 'Esther',
       'Job', 'Psalms', 'Proverbs', 'Ecclesiastes',
       'Song of Solomon (Canticles)', 'Isaiah', 'Jeremiah',
       'Lamentations', 'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos',
       'Obadiah', 'Jonah', 'Micah', 'Nahum', 'Habakkuk', 'Zephaniah',
       'Haggai', 'Zechariah', 'Malachi', 'Matthew', 'Mark', 'Luke',
       'John', 'Acts', 'Romans', '1 Corinthians', '2 Corinthians',
       'Galatians', 'Ephesians', 'Philippians', 'Colossians',
       '1 Thessalonians', '2 Thessalonians', '1 Timothy', '2 Timothy',
       'Titus', 'Philemon', 'Hebrews', 'James', '1 Peter', '2 Peter',
       '1 John', '2 John', '3 John', 'Jude', 'Revelation'], dtype=object)

### Creating Mapping table of Authors 

In [5]:
author_list = {"Genesis": "Moses",
"Exodus": "Moses",
"Leviticus": "Moses",
"Numbers": "Moses",
"Deuteronomy": "Moses",
"Joshua": "Joshua",
"Judges": "Samuel, Nathan, Gad",
"Ruth": "Samuel, Nathan, Gad",
"1 Samuel (1 Kings)": "Samuel, Nathan, Gad",
"2 Samuel (2 Kings)": "Samuel, Nathan, Gad",
"1 Kings (3 Kings)": "Jeremiah",
"2 Kings (4 Kings)": "Jeremiah",
"1 Chronicles": "Ezra",
"2 Chronicles": "Ezra",
"Ezra": "Ezra",
"Nehemiah": "Nehemiah, Ezra",
"Esther": "Mordecai",
"Job": "Job,Moses",
"Psalms": "David,Asaph, Ezra, the sons of Korah, Heman, Ethan, Moses",
"Proverbs": "Solomon ,Agur(30) and Lemuel(31)",
"Ecclesiastes": "Solomon",
"Song of Solomon (Canticles)": "Solomon",
"Isaiah": "Isaiah",
"Jeremiah": "Jeremiah",
"Lamentations": "Jeremiah",
"Ezekiel": "Ezekiel",
"Daniel": "Daniel",
"Hosea": "Hosea",
"Joel": "Joel",
"Amos": "Amos",
"Obadiah": "Obadiah",
"Jonah": "Jonah",
"Micah": "Micah",
"Nahum": "Nahum",
"Habakkuk": "Habakkuk",
"Zephaniah": "Zephaniah",
"Haggai": "Haggai",
"Zechariah": " Zechariah",
"Malachi": "Malachi",
"Matthew": "Matthew",
"Mark": "John Mark",
"Luke": "Luke",
"John": "John, the Apostle",
"Acts": "Luke",
"Romans": "Paul",
"1 Corinthians": "Paul",
"2 Corinthians": "Paul",
"Galatians": "Paul",
"Ephesians": "Paul",
"Philippians": "Paul",
"Colossians": "Paul",
"1 Thessalonians": "Paul",
"2 Thessalonians": "Paul",
"1 Timothy": "Paul",
"2 Timothy": "Paul",
"Titus": "Paul",
"Philemon": "Paul",
"Hebrews": "Paul, Luke, Barnabas, Apollos",
"James": "James the brother of Jesus and Jude (not the Apostle, brother of John).",
"1 Peter": "Peter",
"2 Peter": "Peter",
"1 John": "John, the Apostle",
"2 John": "John, the Apostle",
"3 John": "John, the Apostle",
"Jude": "Jude, the brother of Jesus",
"Revelation": "John, the Apostle"}


In [6]:
df['author'] = df['book'].map(author_list)
df.head()

Unnamed: 0.1,Unnamed: 0,id,book,chapter,verse,text,author
0,0,1001001,Genesis,1,1,In the beginning God created the heaven and th...,Moses
1,1,1001002,Genesis,1,2,"And the earth was without form, and void; and ...",Moses
2,2,1001003,Genesis,1,3,"And God said, Let there be light: and there wa...",Moses
3,3,1001004,Genesis,1,4,"And God saw the light, that it was good: and G...",Moses
4,4,1001005,Genesis,1,5,"And God called the light Day, and the darkness...",Moses


### Separating Old testament and New Testament data

In [7]:
df2 = df
df2.loc[0:23144,'label'] = 0
df2.loc[23145:,'label'] = 1

### Using text as features to predict author name

In [8]:
Xfeatures = df['text']
ylabel = df['author']

In [9]:
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [10]:
bible_author_vectorizer = open("bible_author_vectorizer.pkl","wb")
joblib.dump(cv,bible_author_vectorizer)
bible_author_vectorizer.close()

In [11]:
X_train,X_test,y_train,y_test  = train_test_split(X,ylabel,test_size=0.33,random_state=42)

In [12]:
clf = MultinomialNB()
clf.fit(X_train,y_train)
print("Accuracy of Training",clf.score(X_train,y_train))
print("Accuracy of model",clf.score(X_test,y_test))

Accuracy of Training 0.6184557800278324
Accuracy of model 0.5213367108339828


In [13]:
bible_author_NV_model = open("bible_author_prediction_NV_model_new.pkl","wb")
joblib.dump(clf,bible_author_NV_model)
bible_author_NV_model.close()

In [14]:
logit = LogisticRegression()
logit.fit(X_train,y_train)
print("Model Accuracy:", accuracy_score(y_test,logit.predict(X_test)))

Model Accuracy: 0.5870031176929073


In [15]:
print("Accuracy of Training",logit.score(X_train,y_train))
print("Accuracy of model",logit.score(X_test,y_test))

Accuracy of Training 0.8532079274437353
Accuracy of model 0.5870031176929073


In [16]:
bible_author_Logit_model = open("bible_author_prediction_Logit_model_new.pkl","wb")
joblib.dump(logit,bible_author_Logit_model)
bible_author_Logit_model.close()

## Analyzing features and weights 

In [22]:
eli5.show_weights(logit)

AttributeError: 'HTML' object has no attribute 'format_as_html'

In [18]:
class_names = ['Moses', 'Joshua', 'Samuel, Nathan, Gad', 'Jeremiah', 'Ezra',
       'Nehemiah, Ezra', 'Mordecai', 'Job,Moses',
       'David,Asaph, Ezra, the sons of Korah, Heman, Ethan, Moses',
       'Solomon ,Agur(30) and Lemuel(31)', 'Solomon', 'Isaiah', 'Ezekiel',
       'Daniel', 'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah',
       'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', ' Zechariah',
       'Malachi', 'Matthew', 'John Mark', 'Luke', 'John, the Apostle',
       'Paul', 'Paul, Luke, Barnabas, Apollos',
       'James the brother of Jesus and Jude (not the Apostle, brother of John).',
       'Peter', 'Jude, the brother of Jesus']

In [34]:
#This shows the keywords that most positively correlate to each Author 
obj = eli5.show_weights(logit,target_names=class_names, feature_names = cv.get_feature_names())
eli5.show_weights(logit, target_names=class_names, feature_names = cv.get_feature_names())

TypeError: explain_linear_classifier_weights() got an unexpected keyword argument 'filename'

In [20]:
X = cv.fit_transform(Xfeatures)

In [21]:
cv.get_feature_names()

['aaron',
 'aaronites',
 'abaddon',
 'abagtha',
 'abana',
 'abarim',
 'abase',
 'abased',
 'abasing',
 'abated',
 'abba',
 'abda',
 'abdeel',
 'abdi',
 'abdiel',
 'abdon',
 'abednego',
 'abel',
 'abelbethmaachah',
 'abelmaim',
 'abelmeholah',
 'abelmizraim',
 'abelshittim',
 'abez',
 'abhor',
 'abhorred',
 'abhorrest',
 'abhorreth',
 'abhorring',
 'abi',
 'abia',
 'abiah',
 'abialbon',
 'abiasaph',
 'abiathar',
 'abib',
 'abida',
 'abidah',
 'abidan',
 'abide',
 'abideth',
 'abiding',
 'abiel',
 'abiezer',
 'abiezrite',
 'abiezrites',
 'abigail',
 'abihail',
 'abihu',
 'abihud',
 'abijah',
 'abijam',
 'abilene',
 'ability',
 'abimael',
 'abimelech',
 'abinadab',
 'abinoam',
 'abiram',
 'abishag',
 'abishai',
 'abishalom',
 'abishua',
 'abishur',
 'abital',
 'abitub',
 'abiud',
 'abjects',
 'able',
 'abner',
 'aboard',
 'abode',
 'abodest',
 'abolish',
 'abolished',
 'abominable',
 'abominably',
 'abomination',
 'abominations',
 'abound',
 'abounded',
 'aboundeth',
 'abounding',
 'about

In [36]:
#?obj
obj.data
with open('results.html', 'w') as f:
    f.write(obj.data)
f.close()