# General Index Data Framework
General index data input using a test dataset of ~750 manuscripts
across 2 topics: Antediluvian and Hennig86. 

---
Created 6/3/22 by Ian Hay   
Updated 7/23/22

### Imports
Dependencies

---
[Python 3.8+](https://www.python.org/downloads/release/python-380/)    
[Pandas](https://pandas.pydata.org/)   
[NumPy](https://numpy.org/)    
[scikit-learn](https://scikit-learn.org/stable/index.html)  
[nltk](https://www.nltk.org/)   
[spacy](https://pypi.org/project/spacy/)    
[matplotlib](https://matplotlib.org/)   
[plotly](https://plotly.com/python/)   

In [2]:
"""
Run this line in your python environment before running the code below:

conda install -c anaconda numpy scipy pandas nltk spacy matplotlib plotly scikit-learn

"""
import pandas as pd
import numpy as np
import nltk
import ssl
import spacy
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib.pyplot import figure
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
"""
Run this line in your python environment before running the code below: 

python -m spacy download en_core_web_sm

"""
# Replace the default HTTPS context factory with the unverified one
# (presumably so the NLTK download below works on machines where
# certificate verification fails — TODO confirm).
# NOTE(review): this disables certificate verification for everything in
# this process that relies on ssl's default HTTPS context.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # ssl has no _create_unverified_context here; leave the default in place.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download the NLTK stopword corpus (read later by removeStopWords).
nltk.download('stopwords')
# Load the spaCy English pipeline; `nlp` is the global used by partOfSpeechTagging.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /Users/ian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Functions

In [4]:
# Data Processing -------------------------------------------------------------------------------------------

def getDocColumn(df, column, newColumnName):
    """
    Builds a document-style string column from a column of string lists.

    For every row, the items of df[column] are joined with ". " and a
    trailing "." is appended, e.g. ["a", "b c"] -> "a. b c." (an empty
    list becomes "."). The result is stored in df[newColumnName] in
    place; nothing is returned.

    df: pandas DataFrame to modify.
    column: name of the column holding a list of strings per row.
    newColumnName: name of the column to create (or overwrite).
    """
    # Assign positionally from a plain list instead of going through an
    # index-keyed dict, so rows that share an index label keep their own text.
    df[newColumnName] = [". ".join(items) + "." for items in df[column]]

def getBagOfWordsDF(df, docColumn):
    """
    Builds a Bag-of-Words DataFrame from a column of documents.

    Fits SKLearn's CountVectorizer on df[docColumn], where each entry
    is one document stored as a single string. The returned DataFrame
    shares df's index (one row per document) and has one column per
    vocabulary term, holding that term's count in the document.
    """
    vectorizer = CountVectorizer()
    termCounts = vectorizer.fit_transform(df[docColumn])
    vocabulary = vectorizer.get_feature_names_out()
    return pd.DataFrame(termCounts.toarray(), index=df.index, columns=vocabulary)

def partOfSpeechTagging(texts, POS=("NOUN", "ADJ", "ADV", "VERB")):
    """
    Keeps only the lemmas of tokens whose part of speech is in POS.

    Every text is run through the module-level spaCy pipeline `nlp`.
    Tokens whose coarse part-of-speech tag (token.pos_) is not in POS
    are dropped, and the lemmas of the remaining tokens are joined with
    single spaces.
    https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#16buildingldamalletmodel

    texts: iterable of strings (e.g. a list or a pandas Series).
    POS: spaCy coarse POS tags to keep. The default keeps nouns,
         adjectives, adverbs and verbs. It is a tuple so the default
         cannot be mutated between calls.
    Returns a list with one string per input text, in input order; a
    text without any matching token yields "".
    """
    textTagged = []
    # nlp.pipe processes the texts as a stream in batches rather than
    # invoking the full pipeline separately for every string.
    for doc in nlp.pipe(texts):
        lemmas = [token.lemma_ for token in doc if token.pos_ in POS]
        textTagged.append(" ".join(lemmas))
    return textTagged

def removeStopWords(texts):
    """
    Removes English stop words from every string of a pandas Series.

    texts: pandas Series whose entries are strings of words separated
           by whitespace.
    Returns a new Series in which each entry has had its stop words
    dropped and the remaining words re-joined with single spaces.
    A word is dropped only when it exactly equals an entry of NLTK's
    English stop-word list or one of the extra tokens added below.
    https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
    """
    from nltk.corpus import stopwords
    # Build the lookup once as a set: membership tests are O(1) and the
    # list is no longer copied for every single word.
    # "-pron-" and "pron" appear in many ngrams without apparent meaning.
    stopWordSet = set(stopwords.words("english")) | {"-pron-", "pron"}
    return texts.apply(
        lambda text: " ".join(word for word in text.split() if word not in stopWordSet)
    )

def buildAdjacencyMatrixByCoOccurence(dfBOW):
    """
    Builds a term-by-term co-occurrence matrix from a Bag-of-Words
    DataFrame.

    Entry (i, j) is the dot product of columns i and j of dfBOW, i.e.
    the sum over all rows of count_i * count_j. The diagonal is set
    to 0. Returns a square NumPy array with one row and one column per
    column of dfBOW.
    """
    counts = np.array(dfBOW.values)
    adjacency = counts.T @ counts
    # Zero the diagonal in place so a term is not linked to itself.
    np.fill_diagonal(adjacency, 0)
    return adjacency

In [5]:
# Utility --------------------------------------------------------------------------------------------------

def loadTextFileIntoDataframe(filepath, columns, splittingChar="\t"):
    """
    Reads a delimited text file into a pandas dataframe.

    Every line of the file becomes one row, split on splittingChar
    (tab by default). Fields are kept as strings exactly as split: no
    stripping is done, so the last field read from a line keeps its
    line terminator. Lines with fewer fields than columns are padded on
    the right with the string "0".

    filepath: path of the text file to read.
    columns: list of column names for the resulting dataframe.
    splittingChar: field separator.
    Returns a pandas dataframe with the given columns and a default
    0..n-1 index (empty when the file has no lines).
    Raises ValueError if a line has more fields than columns.
    """
    rows = []
    with open(filepath) as file:
        for lineNumber, line in enumerate(file, start=1):
            fields = line.split(splittingChar)
            if len(fields) > len(columns):
                raise ValueError(
                    f"line {lineNumber} of {filepath} has {len(fields)} fields, "
                    f"expected at most {len(columns)}"
                )
            # Pad short lines so every row has one value per column.
            fields.extend(["0"] * (len(columns) - len(fields)))
            rows.append(fields)
    # Build the frame once; concatenating a one-row frame per line copies
    # the accumulated data on every iteration (quadratic in file length).
    return pd.DataFrame(rows, columns=columns)

def standardizeDataColumn(df, column, newColumnName):
    """
    Standardizes (z-scores) the column of the dataframe df.
    Adds the new column newColumnName to the dataframe inplace.
    Mean is 0, variance is 1 (population standard deviation, ddof=0).

    df: pandas dataframe to modify.
    column: name of the numeric column to standardize.
    newColumnName: name of the column that receives the z-scores.
    A column whose values are all equal has zero deviation; its
    z-scores are set to 0 instead of dividing by zero.
    Returns nothing.
    """
    data = df[column]
    mean = data.mean()
    std = data.std(ddof=0)
    # Guard the constant-column case: (data - mean) is already 0 there.
    scale = std if std != 0 else 1.0
    # Vectorised and assigned by position, so the result does not depend
    # on the dataframe's index labels (the former data[x] lookup only
    # worked when the index happened to be 0..n-1).
    df[newColumnName] = ((data - mean) / scale).to_numpy()


### Building Dataframe and Processing Data

In [6]:
# hard coded things: column schemas, filler words and sample-data paths

# column names for the n-gram sample files
columnListNGrams = [
    "hash",
    "ngram",
    "ngram_lc",
    "ngram_tokens",
    "ngram_count",
    "term_freq",
    "doc_count",
    "date_added",
]
# column names for the keyword sample files
columnListKeywords = [
    "hash",
    "keywords",
    "keywords_lc",
    "keyword_tokens",
    "keyword_score",
    "doc_count",
    "insert_date",
]

# short filler words, plus the "-pron-" placeholder token
non_words = [
    "a", "at", "an", "am", "and", "that", "like",
    "for", "by", "i", "in", "of", "or", "be",
    "use", "as", "on", "the", "to", "with", "-pron-",
]

# locations of the sample input files
filenameAnteNGrams = "data/doc_ngrams/sample.fgrep.antediluvian.txt"
filepathHennigNGrams = "data/doc_ngrams/sample.fgrep.Hennig86.txt"
filenameAnteKeywords = "data/doc_keywords/sample.fgrep.antediluvian.txt"

In [7]:
# NOTE: the whole cell below is wrapped in a string literal, so none of it
# executes. It is the preprocessing pipeline that writes "test_data.csv" and
# "test_data_processed.csv", the two files the following cells read back.
# Remove the surrounding triple quotes to regenerate those files.
"""
# load test files into dataframe (may take up to 1 minute)

indexCol = columnListNGrams[0]
df_antedivulian = loadTextFileIntoDataframe(filepath=filenameAnteNGrams, columns=columnListNGrams)
df_hennig = loadTextFileIntoDataframe(filepath=filepathHennigNGrams, columns=columnListNGrams)
df_antedivulian["topic"] = "antediluvian"
df_hennig["topic"] = "hennig86"
df = pd.concat([df_antedivulian, df_hennig])

# save the data for later use
df.to_csv("test_data.csv")

# process data columns
df["ngram_lc_tagged"] = partOfSpeechTagging(df["ngram_lc"])
df["ngram_lc_tagged"] = removeStopWords(df["ngram_lc_tagged"])
df = df.groupby(indexCol).agg(list)

yakeScoreCol = columnListNGrams[5]
for n in range(len(df)):
    term_freq_list = df[yakeScoreCol].iloc[n]
    df[yakeScoreCol].iloc[n] = term_freq_list[0]
    df["topic"].iloc[n] = df["topic"].iloc[n][0]
    if (len(df["ngram_lc_tagged"].iloc[n])) == 0: # if there are no noun/verb phrases from spacy preprocessing
        df["ngram_lc_tagged"][n] = df["ngram_lc"].iloc[n] # replace it with the lowercase ngram(s)
df[yakeScoreCol] = df[yakeScoreCol].astype(float)

standardizeDataColumn(df, yakeScoreCol, "standardized_term_freq")
getDocColumn(df, "ngram_lc_tagged", "ngram_lc_tagged_doc")
getDocColumn(df, "ngram_lc", "ngram_lc_doc")
df["topic_num"] = (df["topic"] == "antediluvian").astype(int)

# save the dataframe for later use
df.to_csv("test_data_processed.csv")

df.head()
"""

'\n# load test files into dataframe (may take up to 1 minute)\n\nindexCol = columnListNGrams[0]\ndf_antedivulian = loadTextFileIntoDataframe(filepath=filenameAnteNGrams, columns=columnListNGrams)\ndf_hennig = loadTextFileIntoDataframe(filepath=filepathHennigNGrams, columns=columnListNGrams)\ndf_antedivulian["topic"] = "antediluvian"\ndf_hennig["topic"] = "hennig86"\ndf = pd.concat([df_antedivulian, df_hennig])\n\n# save the data for later use\ndf.to_csv("test_data.csv")\n\n# process data columns\ndf["ngram_lc_tagged"] = partOfSpeechTagging(df["ngram_lc"])\ndf["ngram_lc_tagged"] = removeStopWords(df["ngram_lc_tagged"])\ndf = df.groupby(indexCol).agg(list)\n\nyakeScoreCol = columnListNGrams[5]\nfor n in range(len(df)):\n    term_freq_list = df[yakeScoreCol].iloc[n]\n    df[yakeScoreCol].iloc[n] = term_freq_list[0]\n    df["topic"].iloc[n] = df["topic"].iloc[n][0]\n    if (len(df["ngram_lc_tagged"].iloc[n])) == 0: # if there are no noun/verb phrases from spacy preprocessing\n        df["n

In [36]:
# load the processed data CSV and index it by document hash
df = pd.read_csv("test_data_processed.csv")
df = df.set_index("hash")

# The list-valued columns come back from the CSV as strings such as
# "['a', 'b c']"; rebuild each list by dropping the quote and bracket
# characters and splitting on ", ".
# https://stackoverflow.com/questions/63200863/how-to-convert-the-datatype-of-a-column-from-string-to-list-in-pandas
for listColumn in ("ngram", "ngram_lc", "ngram_lc_tagged"):
    df[listColumn] = df[listColumn].apply(
        lambda cell: "".join(ch for ch in cell if ch not in "'[]").split(", ")
    )
df.head()

Unnamed: 0_level_0,ngram,ngram_lc,ngram_tokens,ngram_count,term_freq,doc_count,date_added,topic,ngram_lc_tagged,standardized_term_freq,ngram_lc_tagged_doc,ngram_lc_doc,topic_num
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3002e8a37ec9d00a67bdf0004b8628c35d72068d,"[antediluvian, antediluvian humanity]","[antediluvian, antediluvian humanity]","['1', '2']","['1', '1']",1e-05,"['1', '1']","['\\N\n', '\\N\n']",antediluvian,"[antediluvian, antediluvian humanity]",-0.654965,antediluvian. antediluvian humanity.,antediluvian. antediluvian humanity.,1
3005b3bf055ddcb3c25e4742a72ee16728934efd,"[antediluvian, antediluvian refrain, follow by...","[antediluvian, antediluvian refrain, follow by...","['1', '2', '4', '5']","['1', '1', '1', '1']",0.000281,"['1', '1', '1', '1']","['\\N\n', '\\N\n', '\\N\n', '\\N\n']",antediluvian,"[antediluvian, antediluvian refrain, follow an...",-0.004052,antediluvian. antediluvian refrain. follow ant...,antediluvian. antediluvian refrain. follow by ...,1
3005ebfe5508340797dbfcce8454f3d3f6f76eb1,"[antediluvian, antediluvian dream, cave of -PR...","[antediluvian, antediluvian dream, cave of -pr...","['1', '2', '4', '5', '5']","['1', '1', '1', '1', '1']",9.1e-05,"['1', '1', '1', '1', '1']","['2021-09-03\n', '2021-09-03\n', '2021-09-03\n...",antediluvian,"[antediluvian, antediluvian dream, cave antedi...",-0.459385,antediluvian. antediluvian dream. cave antedil...,antediluvian. antediluvian dream. cave of -pro...,1
30064ae161de1e9a96992be108c195796f13e72a,"[Hennig86 program, routine in the Hennig86, ro...","[hennig86 program, routine in the hennig86, ro...","['2', '4', '5', '1']","['1', '1', '1', '1']",0.000198,"['1', '1', '1', '1']","['\\N\n', '\\N\n', '\\N\n', '\\N\n']",hennig86,"[hennig86 program, routine hennig86, routine h...",-0.202622,hennig86 program. routine hennig86. routine he...,hennig86 program. routine in the hennig86. rou...,0
30136ab3788ab8e8be6b939901ec669a41ef896a,[antediluvian],[antediluvian],['1'],['1'],5.1e-05,['1'],['\\N\n'],antediluvian,[antediluvian],-0.556359,antediluvian.,antediluvian.,1


In [16]:
# collects every distinct word that appears in any lowercase ngram

from itertools import chain

# one entry per document: a list holding the word list of each of its ngrams
masterTermList = [
    [ngram.split(" ") for ngram in ngramList]
    for ngramList in df["ngram_lc"].values
]
# flatten documents -> ngrams, then ngrams -> words
firstNest = list(chain.from_iterable(masterTermList))
secondNest = list(chain.from_iterable(firstNest))
# dict.fromkeys removes duplicates while keeping first-seen order
masterTerms = list(dict.fromkeys(secondNest))

In [17]:
# total count of each vocabulary word over all documents,
# for the raw ngram documents and for the POS-tagged ones
dfBoW = getBagOfWordsDF(df, "ngram_lc_doc")
dfBoWTag = getBagOfWordsDF(df, "ngram_lc_tagged_doc")
wordCountDict = {term: np.sum(dfBoW[term]) for term in dfBoW.columns}
wordCountDictTag = {term: np.sum(dfBoWTag[term]) for term in dfBoWTag.columns}

In [59]:
# count number of times each ngram appears in the corpus
from collections import Counter

df_raw = pd.read_csv("test_data.csv")
# Counter tallies the column in a single pass using hash lookups; the
# previous membership test rebuilt a list of all keys for every row.
# Keys stay in first-seen order, and the result is kept as a plain dict.
ngramDict = dict(Counter(df_raw["ngram_lc"]))

In [64]:
# split the ngram counts by ngram length (number of space-separated words)
bigramDict = {ngram: count for ngram, count in ngramDict.items() if len(ngram.split(" ")) == 2}
trigramDict = {ngram: count for ngram, count in ngramDict.items() if len(ngram.split(" ")) == 3}
fourgramDict = {ngram: count for ngram, count in ngramDict.items() if len(ngram.split(" ")) == 4}
fivegramDict = {ngram: count for ngram, count in ngramDict.items() if len(ngram.split(" ")) == 5}

### Data Visualization

In [43]:
# histogram of word counts
from collections import OrderedDict
# https://stackoverflow.com/questions/20577840/python-dictionary-sorting-in-descending-order-based-on-values
# words ordered from highest to lowest count
d_descending = OrderedDict(sorted(wordCountDict.items(), key=lambda kv: kv[1], reverse=True))
# The first positional parameter of px.bar is data_frame, so the words
# have to be passed as x= to label the bars.
fig = px.bar(
    x=list(d_descending.keys()),
    y=list(d_descending.values()),
    log_y=True,
    title='Counts of top words',
    template='plotly_white',
)
fig.show()

In [44]:
# histogram of top tagged word counts
# https://stackoverflow.com/questions/20577840/python-dictionary-sorting-in-descending-order-based-on-values
# tagged words ordered from highest to lowest count
d_descending_tagged = OrderedDict(sorted(wordCountDictTag.items(), key=lambda kv: kv[1], reverse=True))
# The first positional parameter of px.bar is data_frame, so the words
# have to be passed as x= to label the bars.
fig = px.bar(
    x=list(d_descending_tagged.keys()),
    y=list(d_descending_tagged.values()),
    log_y=True,
    title='Counts of top tagged words',
    template='plotly_white',
)
fig.show()

In [35]:
# bar chart of the N most frequent words
N = 20
topNWords = list(d_descending.keys())[:N]
topNValues = list(d_descending.values())[:N]
fig = px.bar(
    x=topNWords,
    y=topNValues,
    log_y=False,
    title=f'Counts of top {N} words',
    template='plotly_white',
)
fig.show()

In [45]:
# bar chart of the N most frequent tagged words
N = 20
topNWords = list(d_descending_tagged.keys())[:N]
topNValues = list(d_descending_tagged.values())[:N]
fig = px.bar(
    x=topNWords,
    y=topNValues,
    log_y=False,
    title=f'Counts of top {N} tagged words',
    template='plotly_white',
)
fig.show()

In [19]:
# histogram of term_freq
# term_freq per document hash, ordered from highest to lowest value
termFreqDict = dict(df["term_freq"])
termFreq_descending = OrderedDict(sorted(termFreqDict.items(), key=lambda kv: kv[1], reverse=True))
# title spelling corrected ("frequeny" -> "frequency")
fig = px.bar(
    y=list(termFreq_descending.values()),
    log_y=True,
    title='Counts of ngram term frequency in underlying manuscript',
    template='plotly_white',
)
fig.show()

In [20]:
# histogram of number of ngrams per document

# Pair every index label with the length of that row's tagged-ngram list.
# Iterating the column directly avoids df["ngram_lc_tagged"][x], which
# uses an integer position on a Series that is labelled by hash.
lenDict = {
    docHash: len(ngrams)
    for docHash, ngrams in zip(df.index, df["ngram_lc_tagged"])
}
lenDict_descending = OrderedDict(sorted(lenDict.items(), key=lambda kv: kv[1], reverse=True))
fig = px.bar(
    y=list(lenDict_descending.values()),
    log_y=True,
    title='Counts of ngrams associated with a particular manuscript',
    template='plotly_white',
)
fig.show()

In [69]:
# bar chart of the N most frequent bigrams
N = 20
bigramsByCount = sorted(bigramDict.items(), key=lambda pair: pair[1], reverse=True)
bigramDict_descending = OrderedDict(bigramsByCount)
topNBigrams = list(bigramDict_descending.keys())[:N]
topNBigramValues = list(bigramDict_descending.values())[:N]
fig = px.bar(
    x=topNBigrams,
    y=topNBigramValues,
    log_y=False,
    title=f'Counts of top {N} bigrams associated with a particular manuscript',
    template='plotly_white',
)
fig.show()

In [71]:
# bar chart of the N most frequent trigrams
N = 10
trigramsByCount = sorted(trigramDict.items(), key=lambda pair: pair[1], reverse=True)
trigramDict_descending = OrderedDict(trigramsByCount)
topNTrigrams = list(trigramDict_descending.keys())[:N]
topNTrigramValues = list(trigramDict_descending.values())[:N]
fig = px.bar(
    x=topNTrigrams,
    y=topNTrigramValues,
    log_y=False,
    title=f'Counts of top {N} trigrams associated with a particular manuscript',
    template='plotly_white',
)
fig.show()

In [72]:
# bar chart of the N most frequent 4grams
N = 10
fourgramsByCount = sorted(fourgramDict.items(), key=lambda pair: pair[1], reverse=True)
fourgramDict_descending = OrderedDict(fourgramsByCount)
topNFourgrams = list(fourgramDict_descending.keys())[:N]
topNFourgramValues = list(fourgramDict_descending.values())[:N]
fig = px.bar(
    x=topNFourgrams,
    y=topNFourgramValues,
    log_y=False,
    title=f'Counts of top {N} 4grams associated with a particular manuscript',
    template='plotly_white',
)
fig.show()

In [73]:
# bar chart of the N most frequent 5grams
N = 10
fivegramsByCount = sorted(fivegramDict.items(), key=lambda pair: pair[1], reverse=True)
fivegramDict_descending = OrderedDict(fivegramsByCount)
topNFivegrams = list(fivegramDict_descending.keys())[:N]
topNFivegramValues = list(fivegramDict_descending.values())[:N]
fig = px.bar(
    x=topNFivegrams,
    y=topNFivegramValues,
    log_y=False,
    title=f'Counts of top {N} 5grams associated with a particular manuscript',
    template='plotly_white',
)
fig.show()