# General Index Unit Test
Small scale testing of general index data & processing

---
Created 6/3/22 by Ian Hay

### Imports

In [3]:
from time import time
import pandas as pd
import numpy as np
import itertools
import networkx as nx
from pyvis.network import Network

### Functions

In [4]:
# Utility

def loadTextFileIntoDataframe(filepath, splittingChar="\t"):
    """
    Opens the given filepath into a pandas dataframe.
    Splits the list by the denoted character, by default tab.
    """
    with open(filepath) as file:
        data = file.readlines()
    df = pd.DataFrame()
    for line in data:
        lineSplit = line.split(splittingChar)
        df = df.append([lineSplit])
    return df

def numSimilarStrings(stringList1, stringList2):
    """
    Given two lists of strings, returns the number of strings they both share.
    In other words, the size of the subset intersection of stringList1 and stringList2.
    """
    count = 0
    for string in stringList1:
        if string in stringList2:
            count = count + 1
    return count

def getUniqueWordsColumn(df, column, newColumnName, nonWords=[]):
    """
    Given a dataframe and column, constructs a new column with name newColumnName
    of the unique words in  df[column]
    The object in  df[column]  must be a list of strings
    """
    df[newColumnName] = df[column]
    for row in range(len(df[newColumnName])):
        df[newColumnName][row] = df[column].iloc[row]
        string_list = []
        for string in df[newColumnName].iloc[row]:
            string_list.append(string.split(" "))
        string_list = list(itertools.chain(*string_list)) # concatenates nested list into 1D list
        string_list = list(set(string_list)) # grabs only unique string items
        for nonword in nonWords:
            if nonword in string_list:
                string_list.remove(nonword)
        df[newColumnName].iloc[row] = string_list
    return df


# performance building adjMatrix on combined test set:
# * time for 100 iterations:  430.5321 seconds
# *     time per iteration: 4.305321 seconds
#
# *: timed on 16-thread CPU
def buildAdjacencyMatrixByColumn(df, column):
    """
    Given a dataframe and a column, constructs an adjacency matrix
    of size [n x n] where  n  is the number of rows of the dataframe.
    The adjacency matrix represents the number of alike elements.
    The object in  df[column]  must be a list of values.
    """
    n = len(df[column])
    adjMatrix = np.zeros((n, n))
    for n1 in range(n):
        ngram1 = df[column].iloc[n1]
        for n2 in range(n):
            ngram2 = df[column].iloc[n2]
            numSimilar = numSimilarStrings(ngram1, ngram2)
            adjMatrix[n1][n2] = numSimilar
            if n1 == n2:
                adjMatrix[n1][n2] = 0 # removes recursive edges
            if numSimilar == 1:
                adjMatrix[n1][n2] = 0 # removes edges with 1 similarity to reduce complexity
    return adjMatrix


def visualizeNetworkHTML(adjMatrix, filename):
    """
    Given an adjacency matrix and the filename to save to, builds an HTML
    graph of that network.
    Uses pyvis to build an interactive HTML graph of a network from its adjacency matrix.
    Uses NetworkX to work with adjacency matrix.
    """
    G = nx.from_numpy_matrix(adjMatrix)
    net = Network(width="1920px", height="1080px", notebook=True)
    net.barnes_hut()
    net.from_nx(G)
    net.show(filename)

### Building Dataframe

In [5]:
# hard coded things
columnDict = {0: "hash", 1: "ngram", 2: "ngram_lc", 3: "ngram_num_tokens", 4: "ngram_count", 5: "term_freq", 6: "doc_count"}
non_words = ["a", "at", "an", "am", "and", "that", "like", "for", "by", "i", "in", "of", "or", "be", "use", "as", "on", "the", "to", "with", "-pron-"]

In [6]:
# data in "data" folder derived from: 
filenameAnte = "data/doc_ngrams/sample.fgrep.antediluvian.txt"
filepathHennig = "data/doc_ngrams/sample.fgrep.Hennig86.txt"

df_antedivulian = loadTextFileIntoDataframe(filepath=filenameAnte)
df_hennig = loadTextFileIntoDataframe(filepath=filepathHennig)
df = pd.concat([df_antedivulian, df_hennig])
df.drop(7, axis=1, inplace=True)
df.rename(columns=columnDict, inplace=True)
df = df.groupby("hash").agg(list)

df = getUniqueWordsColumn(df, "ngram_lc", "ngram_words", nonWords=non_words)
df.head()

Unnamed: 0_level_0,ngram,ngram_lc,ngram_num_tokens,ngram_count,term_freq,doc_count,ngram_words
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3002e8a37ec9d00a67bdf0004b8628c35d72068d,"[antediluvian, antediluvian humanity]","[antediluvian, antediluvian humanity]","[1, 2]","[1, 1]","[0.0000097323600973236, 0.0000097323600973236]","[1, 1]","[antediluvian, humanity]"
3005b3bf055ddcb3c25e4742a72ee16728934efd,"[antediluvian, antediluvian refrain, follow by...","[antediluvian, antediluvian refrain, follow by...","[1, 2, 4, 5]","[1, 1, 1, 1]","[0.00028050490883590464, 0.0002805049088359046...","[1, 1, 1, 1]","[antediluvian, refrain, follow]"
3005ebfe5508340797dbfcce8454f3d3f6f76eb1,"[antediluvian, antediluvian dream, cave of -PR...","[antediluvian, antediluvian dream, cave of -pr...","[1, 2, 4, 5, 5]","[1, 1, 1, 1, 1]","[0.00009109127345600292, 0.0000910912734560029...","[1, 1, 1, 1, 1]","[cave, antediluvian, dream, mammoth]"
30064ae161de1e9a96992be108c195796f13e72a,"[Hennig86 program, routine in the Hennig86, ro...","[hennig86 program, routine in the hennig86, ro...","[2, 4, 5, 1]","[1, 1, 1, 1]","[0.00019790223629527012, 0.0001979022362952701...","[1, 1, 1, 1]","[program, hennig86, routine]"
30136ab3788ab8e8be6b939901ec669a41ef896a,[antediluvian],[antediluvian],[1],[1],[0.00005075111652456354],[1],[antediluvian]


### Performance Testing

In [7]:
# t = Timer("buildAdjacencyMatrixByColumn(df, \"ngram_words\")", "from __main__ import buildAdjacencyMatrixByColumn, df")
# t.timeit(number=100)

### ML

In [8]:
# to do

### Visualizations

In [9]:
cols_to_try = ["ngram_lc", "ngram_words"]
filename = "GIgraph_test_"
for col in cols_to_try:
    adjmatrix = buildAdjacencyMatrixByColumn(df, col)
    visualizeNetworkHTML(adjMatrix=adjmatrix, filename=filename+col+".html")