In [None]:
from pathlib import Path

import networkx as nx
import pandas as pd

from networkx.algorithms.traversal import bfs_tree

"""
Load data from file
"""

# Input and output locations, all resolved relative to the parent of the working directory
base_directory = Path.cwd().parent.absolute()
source_data_path = base_directory.joinpath("data", "raw", "etymologies.tsv")
processed_directory = base_directory.joinpath("data", "processed")
output_graph_yaml_file = processed_directory.joinpath("graph_output.yaml")
output_graph_dot_file = processed_directory.joinpath("graph_output.dot")
output_graph_png = processed_directory.joinpath("graph_output.png")

# The source file is tab-separated and has no header row, so the column names are supplied here
edge_columns = ["source_node", "edge_type", "target_node"]
cleaned_df = pd.read_csv(source_data_path, sep="\t", names=edge_columns)



# Cleaning notes

## Relationship types found in source data

Relationships are recorded bidirectionally. So a root word will link to its derivatives, and each derivative will link back to the root. To simplify the graph,
I will drop edges that point from derivatives to roots, since that information is already encoded in the root-to-leaf edge and networkx can handle bidirectional
traversal without requiring multiple edges to link the same pair of nodes.

Below are the types of relationships extracted from the data, with comments indicating their directionality.


For a row `A  <relationship>  B` (columns `source_node`, `edge_type`, `target_node`):

`<-` A is the source of B

`->` B is the source of A


- "rel:etymology"               ->
- "rel:etymological_origin_of"  <-
- "rel:is_derived_from"         ->
- "rel:has_derived_form"        <-
- "rel:etymologically_related"  <->
- "rel:variant:orthography"     <->

The source data also includes a handful of malformed values, which should be dropped or replaced (malformed value -> intended value; here the arrow means "replace with", not directionality):
- "rel:etymologically" -> "rel:etymologically_related"
- "rel:derived" -> "rel:is_derived_from"



In [None]:

"""
Clean data
"""

# Keep a single direction per relationship so each root/derivative pair is linked by one edge.
# The malformed tags listed in the cleaning notes would normally be repaired first, but they belong
# to edge types that are dropped here anyway, so we only keep the edge types that point from
# root words to derived words.
root_first_rel_types = ["rel:etymological_origin_of", "rel:has_derived_form"]
# .copy() makes the filtered frame independent of the raw one, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
cleaned_df = cleaned_df.loc[cleaned_df["edge_type"].isin(root_first_rel_types)].copy()

# Node labels look like "<language>: <word>". n=1 caps the split at two pieces and reindex()
# pins the column count, so the two-column assignment cannot fail with a length mismatch.
source_parts = (
    cleaned_df["source_node"].str.split(": ", n=1, expand=True).reindex(columns=range(2))
)
cleaned_df[["source_language", "source_word"]] = source_parts

# A handful of target nodes include strange characters or a :Category: tag that introduces a third
# piece. That data is uninteresting, so it is parked in "crud" and not included in the graph when
# we construct it. n=2 plus reindex() guarantees exactly three columns whatever the data holds.
target_parts = (
    cleaned_df["target_node"].str.split(": ", n=2, expand=True).reindex(columns=range(3))
)
cleaned_df[["target_language", "target_word", "crud"]] = target_parts


In [None]:

"""
Get list of unique languages in data set.

Not super necessary but helpful for understanding the likely subgraph structure. I would guess that the individual
languages will be highly connected/clustered. I suspect the boundaries will blur a bit around the proto- and ancient languages, particularly for languages with
many ancestors in the data set, e.g., Latin
"""

# Languages can appear on either end of an edge, so combine both columns
source_languages = set(cleaned_df["source_language"].unique())
target_languages = set(cleaned_df["target_language"].unique())
unique_languages = source_languages | target_languages
print(sorted(unique_languages))


In [None]:

"""
Construct networkx graph
"""

# Start with a directed graph so edge direction (root -> derivative) is preserved; it can still be
# converted to undirected later to use networkx's undirected algorithms.
graph = nx.from_pandas_edgelist(
    cleaned_df,
    source="source_node",
    target="target_node",
    edge_attr=[
        "edge_type",
        "source_language",
        "source_word",
        "target_language",
        "target_word",
    ],
    create_using=nx.DiGraph,
)
# nx.info() was removed in NetworkX 3.0; on those versions print the graph's own summary string.
print(nx.info(graph) if hasattr(nx, "info") else graph)



In [None]:
"""
Undirected graph analysis

Construct undirected graph
"""

# Convert to an undirected copy so the undirected-only algorithms can be applied
undirected_graph = graph.to_undirected()
# nx.info() was removed in NetworkX 3.0; on those versions print the graph's own summary string.
print(nx.info(undirected_graph) if hasattr(nx, "info") else undirected_graph)


In [None]:
"""
Undirected graph analysis

Check for connected components
"""

from networkx.algorithms.components import connected_components

# Materialize the connected components (one node set each) and report how many there are.
# This may be a way to prune subgraphs that do not relate to English etymology.
conn_components = [*connected_components(undirected_graph)]
connected_component_count = len(conn_components)
print(connected_component_count)


In [None]:
"""
Filter to English nodes and their BFS trees


Interestingly, these results do not seem to match what was produced by the DAG root approach above.


Type: DiGraph
Number of nodes: 912653
Number of edges: 715717
Average in degree:   0.7842
Average out degree:   0.7842

Type: DiGraph
Number of nodes: 433240
Number of edges: 425233
Average in degree:   0.9815
Average out degree:   0.9815



"""

# grab the nodes that have the eng tag, then build the connected graph from those
bfs_english_nodes = [n for n in graph.nodes() if n[0:3] == 'eng']

# Collect every node reachable from any English node in ONE multi-source traversal.
# Building a separate bfs_tree per English node re-walks shared descendants over and over and
# accumulates duplicates; the resulting node set is the same either way. Visit order is
# irrelevant here because only the set of reachable nodes is used.
reachable_from_english = set(bfs_english_nodes)
frontier = list(bfs_english_nodes)
while frontier:
    current = frontier.pop()
    for successor in graph.successors(current):
        if successor not in reachable_from_english:
            reachable_from_english.add(successor)
            frontier.append(successor)
bfs_nodes_to_add = list(reachable_from_english)

bfs_english_graph = graph.subgraph(bfs_nodes_to_add)
# nx.info() was removed in NetworkX 3.0; on those versions print the graph's own summary string.
print(nx.info(bfs_english_graph) if hasattr(nx, "info") else bfs_english_graph)


In [None]:
"""
Check most connected nodes

So... this does not match the DAG results, but in an intriguing way. These are exclusively English words, whereas the DAG roots included a mix of other languages as well.

Some nodes are the same (e.g., "eng: non-"). But the degrees are slightly different. 5746 here versus 5748 above for "non-", for example.

I wonder why this approach is less inclusive than the above...

"""

# (node, degree) pairs ordered from highest to lowest degree; ties keep the graph's iteration order
bfs_nodes_by_degree = sorted(
    bfs_english_graph.degree, key=lambda node_degree: -node_degree[1]
)
print(bfs_nodes_by_degree[:99])

In [None]:
"""
Let's try to plot something, because why not?


import matplotlib.pyplot as plt

highly_connected_node = nodes_by_degree[-1]
# nodes_in_sub = bfs_tree(graph, source="eng: leaf")
nodes_in_sub = nx.descendants(graph, "eng: leaf")
graph_to_plot = graph.subgraph(nodes_in_sub)

nx.draw(graph_to_plot, with_labels=True)

nx.draw_shell(graph_to_plot)
plt.show()

Well that looks like my 3rd grade art work...

Let's try with edges and such

nx.draw_networkx(graph_to_plot, arrows=True, with_labels=True)
nx.draw_shell(graph_to_plot)
plt.show()

Ok time to be lazy and ~~steal~~ borrow from the networkx example drawings

Need to keep it to a smaller graph because it just plots as giant unreadable mess with many nodes


"""

import matplotlib.pyplot as plt
import numpy as np

# Undirected copy of the subgraph induced by the nodes reachable from "eng: leaf"
leaf_descendants = nx.descendants(graph, "eng: leaf")
undirected_graph_to_plot = graph.subgraph(leaf_descendants).to_undirected()

# Node degrees, largest first
degree_sequence = sorted(
    (degree for _, degree in undirected_graph_to_plot.degree()), reverse=True
)
dmax = max(degree_sequence)

fig = plt.figure("Degree of node 'eng: leaf' descendants", figsize=(8, 8))
axgrid = fig.add_gridspec(5, 4)

# Top panel (rows 0-2, full width): spring layout of the largest connected component
ax0 = fig.add_subplot(axgrid[:3, :])
largest_component = max(nx.connected_components(undirected_graph_to_plot), key=len)
Gcc = undirected_graph_to_plot.subgraph(largest_component)
pos = nx.spring_layout(Gcc, seed=10396953)  # fixed seed keeps the layout reproducible
nx.draw_networkx_nodes(Gcc, pos, ax=ax0, node_size=20)
nx.draw_networkx_edges(Gcc, pos, ax=ax0, alpha=0.4)
ax0.set_title("Connected components of 'eng: leaf' descendants")
ax0.set_axis_off()

# Bottom-left panel: degree of each node against its rank
ax1 = fig.add_subplot(axgrid[3:, :2])
ax1.plot(degree_sequence, "b-", marker="o")
ax1.set(title="Degree Rank Plot", ylabel="Degree", xlabel="Rank")

# Bottom-right panel: how many nodes have each degree
ax2 = fig.add_subplot(axgrid[3:, 2:])
degree_values, node_counts = np.unique(degree_sequence, return_counts=True)
ax2.bar(degree_values, node_counts)
ax2.set(title="Degree histogram", xlabel="Degree", ylabel="# of Nodes")

fig.tight_layout()
plt.show()

In [None]:
"""
Maybe let's try a few ego graphs to see if that can highlight why the two approaches are different
"""
# Ego graph (the node plus its direct neighbours) around the highest-degree node of the BFS graph
ego_node = bfs_nodes_by_degree[0][0]
bfs_ego = nx.ego_graph(bfs_english_graph, ego_node)

# FIXME: `english_graph` and `nodes_by_degree` belong to the DAG-root approach, which is not
# defined anywhere in this notebook. Guard the lookup so a fresh Restart & Run All does not die
# with a NameError; restore the DAG-root cells to get the comparison graph back.
try:
    dag_ego = nx.ego_graph(english_graph, nodes_by_degree[0][0])
except NameError:
    dag_ego = None
    print("Skipping DAG ego graph: `english_graph` / `nodes_by_degree` are not defined.")

# Draw graph
pos = nx.spring_layout(bfs_ego, seed=20532)  # Seed layout for reproducibility
nx.draw(bfs_ego, pos, node_color="b", node_size=50, with_labels=True)

# Draw ego as large and red
options = {"node_size": 300, "node_color": "r"}
nx.draw_networkx_nodes(bfs_ego, pos, nodelist=[ego_node], **options)
plt.show()