<a href="https://colab.research.google.com/github/john-decker/lkp_project/blob/main/LKP_Connectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import networkx as nx
from networkx import NetworkXException
import matplotlib.pyplot as plt
from operator import itemgetter

#for Bokeh plot, see: http://docs.bokeh.org/en/1.3.2/docs/user_guide/graph.html
from bokeh.plotting import figure, show, from_networkx
from bokeh.models import BoxZoomTool, Circle, HoverTool, MultiLine, Plot, Range1d, ResetTool, EdgesAndLinkedNodes, NodesAndLinkedEdges, LayoutDOM, ColumnDataSource, DataTable, TableColumn
from bokeh.layouts import row, column, layout, gridplot
from bokeh.palettes import YlOrBr, RdGy, Blues8, Reds8, Category20c
from bokeh.transform import linear_cmap
from bokeh.io import output_file, save
from bokeh.resources import CDN
from bokeh.embed import file_html


In [14]:
#entity csv inputs: each path is bound right next to the dataframe read from it

#appearance records (later joined to the people table on person_id)
base_file_path = '/content/person_appear_in_doc.csv'
base_df = pd.read_csv(base_file_path)

#people (later joined to the occupation table on occupation_id)
person_file_path = '/content/person.csv'
person_df = pd.read_csv(person_file_path)

#occupations
occupation_file_path = '/content/occupation.csv'
occupation_df = pd.read_csv(occupation_file_path)

In [15]:
#build the joined dataframes with the needed information using two left joins
#merge technique, see: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
#left join: every person row is kept, even when no occupation matches its occupation_id
person_occupation_df = person_df.merge(occupation_df, how="left", on="occupation_id")
#left join: every appearance row is kept and widened with the person/occupation columns
person_appearance_df = base_df.merge(person_occupation_df, how="left", on="person_id")

In [16]:
def get_full_name(person_first, person_last):
	"""Combine a first and a last name into one display string.

	Returns "first last" when both parts are truthy, the single truthy
	part when only one is given, and an empty string when neither is.
	"""
	if person_first and person_last:
		return person_first + " " + person_last
	if person_first:
		return person_first
	if person_last:
		return person_last
	return ""

In [17]:
#categories to capture high, medium, and low status workers
#use to test networked status vs social status
#an occupation_id found in neither list is treated as low status
high_status_id = [2, 10, 12, 15, 21, 23, 28, 32, 36, 38, 39, 43, 48, 49, 50, 51]
medium_status_id = [3, 4, 5, 7, 8, 11, 14, 16, 17, 18, 19, 20, 24, 25, 26, 27, 29, 31, 34, 37, 41, 42, 45, 52]

#each list collects (person_id, capitalized occupation_desc, full name) tuples
high_status_individuals=[]
medium_status_individuals=[]
low_status_individuals=[]

#blank out missing names so they can be compared against '' below
person_occupation_df.person_fname = person_occupation_df.person_fname.fillna('')
person_occupation_df.person_lname = person_occupation_df.person_lname.fillna('')
for index, entry in person_occupation_df.iterrows():
	person_id = entry['person_id']
	occupation_id = entry['occupation_id']
	occupation_desc = entry['occupation_desc']
	person_fname = entry['person_fname']
	person_lname = entry['person_lname']
	#str() keeps each name part a string even if the cell holds another type
	person_first = str(person_fname) if person_fname != '' else ''
	person_last = str(person_lname) if person_lname != '' else ''

	person_full_name = get_full_name(person_first, person_last)

	#person_occupation_df comes from a left merge, so occupation_desc is NaN
	#(a float, which has no .capitalize()) for a person without a matching
	#occupation row; use an empty label instead of raising AttributeError
	if pd.isna(occupation_desc):
		occupation_label = ''
	else:
		occupation_label = str(occupation_desc).capitalize()

	person = person_id, occupation_label, person_full_name

	#the two id lists share no values, so each person lands in exactly one list
	if occupation_id in high_status_id:
		high_status_individuals.append(person)
	elif occupation_id in medium_status_id:
		medium_status_individuals.append(person)
	else:
		low_status_individuals.append(person)

In [18]:
#columns carried over from the joined csv files, with their planned graph roles:
#  person_id                                   -> graph nodes
#  person_fname, person_lname, occupation_desc -> node attributes
#  fol_num                                     -> graph edges
#  manuscript                                  -> edge attribute
graph_columns = ['person_id', 'person_fname', 'person_lname', 'occupation_desc', 'manuscript', 'fol_num']
graph_df = pd.DataFrame(person_appearance_df, columns=graph_columns)

#replace NAN with empty string using method from: https://stackoverflow.com/questions/26837998/pandas-replace-nan-with-blank-empty-string
#NOTE(review): only person_fname is blanked; person_lname and occupation_desc keep any NaN - confirm this is intended
graph_df['person_fname'] = graph_df['person_fname'].fillna('')

In [19]:
#start building nodes
#blank attribute template; its keys name the graph_df columns copied onto each node
node_dict = {'person_fname': '',
	'person_lname': '',
	'occupation_desc': ''}

#(person_id, attribute dict) tuples, the node format passed to networkx below
node_dict_list = []

for row_number in range(len(graph_df)):
	#must convert id from numpy.int64 to regular int for Json serialization to work in Bokeh
	person = int(graph_df.loc[row_number, "person_id"].item())
	#a fresh dict per row, so no two nodes share one attribute dict
	attributes = {column: graph_df.loc[row_number, column] for column in node_dict}
	node_dict_list.append((person, attributes))

In [20]:
#function for creating edge relationships
def create_edge_rels(fol_list, number):
	"""Pair up the entries of one folio as (source, target, attributes) edge tuples.

	Every entry except the last is used as a source and is paired with every
	entry of fol_list, itself included, so the result contains self-pairs and
	both orientations of pairs whose members are both sources.

	Returns a single-key dict {number: [(source, target, {'fol_num': number}), ...]};
	the list is empty when fol_list has fewer than two entries.
	"""
	size = len(fol_list)
	edge_list = [
		(fol_list[src], fol_list[dst], {'fol_num': number})
		for src in range(size - 1)
		for dst in range(size)
	]
	return {number: edge_list}

#collect the folio of every row from the column labeled 'fol_num'
entry_list = list(graph_df['fol_num'])

#de-duplicate through a set, then sort so folios are processed in a fixed order
final_list = sorted(set(entry_list))

#one {fol_num: edge tuples} dict per folio, built from the person_id values of its rows
full_edge_list = []
for entry in final_list:
	on_folio = graph_df['fol_num'] == entry
	id_list = list(graph_df.loc[on_folio, 'person_id'].values)
	#send to function for pairing all entries to form edge relationships
	full_edge_list.append(create_edge_rels(id_list, entry))

In [21]:
#tutorial for networkx at: https://networkx.org/documentation/stable/tutorial.html
#undirected graph: nodes (with their attribute dicts) first, then the per-folio edges
G = nx.Graph()
G.add_nodes_from(node_dict_list)

#each full_edge_list entry is a dict whose values are lists of edge tuples
for folio_edges in full_edge_list:
	for edge_tuples in folio_edges.values():
		G.add_edges_from(edge_tuples)

#the edge tuples pair each person with themselves as well, so drop those self loops
#method from https://stackoverflow.com/questions/49427638/removing-self-loops-from-undirected-networkx-graph
G.remove_edges_from(nx.selfloop_edges(G))

#get degree for single node
#use for looking at 'locality'
# print(G.degree(132))

221


In [22]:
#using approach from https://programminghistorian.org/en/lessons/exploring-and-analyzing-network-data-with-python
#map every node to its degree, then rank the (node, degree) pairs from most to least connected
degree_dict = {node: node_degree for node, node_degree in G.degree(G.nodes())}
sorted_degree = sorted(degree_dict.items(), key=lambda pair: pair[1], reverse=True)

In [23]:
def get_connection_level(start, stop, search_dict):
	"""Return a new list holding the entries of search_dict[start:stop].

	Despite its name, search_dict must support slicing (the calls in this
	notebook pass the sorted list of (node, degree) pairs). stop is
	exclusive, as in any slice.
	"""
	return list(search_dict[start:stop])

#create top, medium, low, and minimal connectors
#top = 50+ connections
#medium = 30-49 connections
#low = 10-29 connections
#minimal = 0-9 connections
TOP_MIN_DEGREE = 50
MEDIUM_MIN_DEGREE = 30
LOW_MIN_DEGREE = 10

#sorted_degree is ordered from highest to lowest degree, so the number of
#entries at or above a threshold is also the index at which that band ends.
#Cut points are derived from the thresholds instead of fixed positions:
#slice stops are exclusive, so fixed slices such as 0-22 / 23-72 / 73-384 /
#385 to -1 leave the entries at 22, 72, 384 and the last one in no band,
#and fixed positions only fit one particular dataset.
top_end = sum(1 for _, node_degree in sorted_degree if node_degree >= TOP_MIN_DEGREE)
medium_end = sum(1 for _, node_degree in sorted_degree if node_degree >= MEDIUM_MIN_DEGREE)
low_end = sum(1 for _, node_degree in sorted_degree if node_degree >= LOW_MIN_DEGREE)

#contiguous bands: each starts exactly where the previous one stopped
top_connectors = get_connection_level(0, top_end, sorted_degree)
medium_connectors = get_connection_level(top_end, medium_end, sorted_degree)
low_connectors = get_connection_level(medium_end, low_end, sorted_degree)
minimal_connectors = get_connection_level(low_end, len(sorted_degree), sorted_degree)

In [24]:
def get_connect_counts(target_list):
	"""Add up the second item of every entry in target_list.

	The callers in this notebook pass (node, degree) pairs, so the result is
	the total number of connections in the list. Returns 0 for an empty list.
	"""
	return sum(entry[1] for entry in target_list)

#total number of connections held by each connector band
top_count = get_connect_counts(top_connectors)
med_count = get_connect_counts(medium_connectors)
low_count = get_connect_counts(low_connectors)
minim_count = get_connect_counts(minimal_connectors)

def _average_connections(total, members):
	"""Return total / len(members), or NaN when members is empty."""
	#an empty band has no meaningful average; NaN avoids a ZeroDivisionError
	return total / len(members) if members else float('nan')

#average number of connections per person in each band
top_ratio = _average_connections(top_count, top_connectors)
med_ratio = _average_connections(med_count, medium_connectors)
low_ratio = _average_connections(low_count, low_connectors)
minim_ratio = _average_connections(minim_count, minimal_connectors)

In [25]:
def get_connectors_by_level(level_list, connectors_list, status):
	"""Join status-group members with their degree from a connector list.

	level_list entries are indexed as (id, second field, third field) and
	connectors_list entries as (id, degree). For every member, in level_list
	order, each connector whose id equals the member's id yields one tuple
	(id, second field, third field, degree, status.capitalize()).

	Returns the list of those tuples; members without a matching connector
	are left out.
	"""
	return [
		(member[0], member[1], member[2], connector[1], status.capitalize())
		for member in level_list
		for connector in connectors_list
		if connector[0] == member[0]
	]

In [26]:
#cross each social-status group with each connector band
#naming: <social status>_<connector band>_connections, where the band word "high" means top_connectors
#NOTE(review): no list is built for high_status_individuals x top_connectors, and
#minimal_connectors is not crossed with any status group - confirm this is intended
low_high_connections = get_connectors_by_level(low_status_individuals, top_connectors, "low")
low_medium_connections = get_connectors_by_level(low_status_individuals, medium_connectors, "low")
low_low_connections = get_connectors_by_level(low_status_individuals, low_connectors, "low")
medium_high_connections = get_connectors_by_level(medium_status_individuals, top_connectors, "medium")
medium_medium_connections = get_connectors_by_level(medium_status_individuals, medium_connectors, "medium")
medium_low_connections = get_connectors_by_level(medium_status_individuals, low_connectors, "medium")
high_low_connections = get_connectors_by_level(high_status_individuals, low_connectors, "high")
high_medium_connections = get_connectors_by_level(high_status_individuals, medium_connectors, "high")

#same call and arguments as medium_medium_connections above
med_med_connections = get_connectors_by_level(medium_status_individuals, medium_connectors, "medium")

In [27]:
#create dataframe for data table
#bokeh docs at: https://docs.bokeh.org/en/latest/docs/reference/models/widgets/tables.html?highlight=data%20table#bokeh.models.DataTable.autosize_mode
#using underlying approach from: https://www.youtube.com/watch?v=s01lMojtl4Y
#every connector table uses the same five columns, in the order of the tuples
#produced by get_connectors_by_level
CONNECTOR_COLUMNS = ['person_id', 'occupation_desc', 'name', 'degree', 'social_status']

connector_low_high_df = pd.DataFrame(low_high_connections, columns=CONNECTOR_COLUMNS)
connector_low_med_df = pd.DataFrame(low_medium_connections, columns=CONNECTOR_COLUMNS)
connector_low_low_df = pd.DataFrame(low_low_connections, columns=CONNECTOR_COLUMNS)
connector_med_high_df = pd.DataFrame(medium_high_connections, columns=CONNECTOR_COLUMNS)
connector_med_med_df = pd.DataFrame(medium_medium_connections, columns=CONNECTOR_COLUMNS)
connector_med_low_df = pd.DataFrame(medium_low_connections, columns=CONNECTOR_COLUMNS)
connector_high_low_df = pd.DataFrame(high_low_connections, columns=CONNECTOR_COLUMNS)
connector_high_medium_df = pd.DataFrame(high_medium_connections, columns=CONNECTOR_COLUMNS)

#rows of the high-status / low-connector table with at least 20 connections
high_low_at_least_20 = connector_high_low_df['degree'] >= 20
connector_high_low_20_df = connector_high_low_df.loc[high_low_at_least_20]

In [82]:
#stack the eight status-by-connector tables into one frame
#pd.concat is called without ignore_index, so each frame's own row labels are kept and repeat
working_frames = [connector_low_high_df, connector_low_med_df, connector_low_low_df, connector_med_high_df, connector_med_med_df, connector_med_low_df, connector_high_medium_df, connector_high_low_df]
connector_table_data = pd.concat(working_frames)

#wrap the combined frame as the Bokeh data source backing the table
source_data = ColumnDataSource(connector_table_data)


#one table column per dataframe field, each with a display title
columns = [
	TableColumn(field="person_id", title="Person ID"),
	TableColumn(field="occupation_desc", title="Occupation"),
	TableColumn(field="name", title="Name"),
	TableColumn(field="degree", title="Connections"),
	TableColumn(field="social_status", title="Social Status"),
	]

#sortable, reorderable table over all rows
#index_position=None presumably hides the index column - confirm against the Bokeh DataTable docs
data_table_all = DataTable(source=source_data, columns=columns,
	index_position=None, autosize_mode='fit_viewport',
	sizing_mode='scale_width', sortable=True, reorderable=True)

#grid layout holding the single table (the other table rows are disabled)
layout_connection_tables = gridplot(
    [
    	[data_table_all],
        # [data_table1],[data_table4],
        # [data_table3,data_table4],
    ],
    # toolbar_location="left",
)

#NOTE(review): a fixed width, "min" width/height policies and sizing_mode="stretch_width"
#are all set on the same layout - confirm which of these Bokeh ends up applying
layout_connection_tables.width=600
layout_connection_tables.max_width=600
layout_connection_tables.width_policy="min"
layout_connection_tables.height_policy="min"
layout_connection_tables.sizing_mode="stretch_width"

#last expression of the cell, so the layout's class is what the cell displays
type(layout_connection_tables)

#disabled below: rendering the layout and saving it as a static html file
# show(layout_connection_tables)

# output_file(filename="LKP_Connectors_Table_All.html", title="Static HTML file")

# save(layout_connection_tables)

In [75]:
#bare expression: displays the combined connector table as the cell output
connector_table_data

Unnamed: 0,person_id,occupation_desc,name,degree,social_status
Loading ITables v2.2.1 from the internet... (need help?),,,,,


In [83]:
#from: https://marc-wouts.medium.com/pandas-dataframes-as-interactive-html-datatables-9737c7266abf
!pip install itables
from itables import init_notebook_mode, show
init_notebook_mode(all_interactive=True)



In [84]:
#`show` is imported from both bokeh.plotting and itables in this notebook, so the bare
#name depends on which cell ran last; call the itables function through its module
import itables
itables.show(connector_table_data)

Unnamed: 0,person_id,occupation_desc,name,degree,social_status
Loading ITables v2.2.1 from the internet... (need help?),,,,,


In [85]:
#from: https://stackoverflow.com/questions/65328948/export-a-pandas-dataframe-to-a-sortable-table-in-html
#produces interactive table in html!!!
import panel as pn
#the widget gets its own name so connector_table_data stays a dataframe: rebinding it
#meant a second run of this cell wrapped a widget instead of the dataframe, and the
#earlier table cells could no longer be re-run
connector_tabulator = pn.widgets.Tabulator(connector_table_data)

connector_tabulator.save("connectors_table.html")