In [10]:
"""
Set up imports and constants
"""

from pathlib import Path

from polyphasia.loader import load_to_pandas, clean_data_frame
from polyphasia.constants import RELATIVE_PATH_TO_SOURCE
from polyphasia.graph import DirectedGraph

# set up some paths for ins and outs
base_directory = Path.cwd().parent.absolute()
source_data_path = base_directory / RELATIVE_PATH_TO_SOURCE

In [2]:
"""
Load and clean data
"""

data_frame = load_to_pandas(source_data_path)
data_frame = clean_data_frame(data_frame)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [3]:
"""
Construct networkx graph
"""

# start with directed so we can preserve directionality data, retaining the option to convert to undirected later to use networkx undirected algorithms
graph = DirectedGraph(data_frame)
print(graph.info)

Name: 
Type: DiGraph
Number of nodes: 2743118
Number of edges: 2692096
Average in degree:   0.9814
Average out degree:   0.9814


# What next?

Now that we've loaded the data into a graph, it's time to do some actual analysis. But to do so, we need to define the problem more clearly. Otherwise, there is
an intractable amount of data for many graph algorithms. Pruning to relevant subgraphs would be desirable as an initial post-processing step.

I am primarily interested in English language entries. However, ~~many~~ all of the English words are derived from non-English words. It would be good to prune
entries that are not etymological roots of English words.

My initial instinct is to trim any descendant nodes of English words that are in other languages. Then, we can run BFS from each English node to gather its
ancestors, knowing the descendant nodes have already been trimmed. This could accidentally exclude relevant data if there are derivation paths that jump from
English to another language and then back, but that's an interesting question in and of itself and might be worth investigating as preliminary matter before
pursuing this approach.

However, I suspect the BFS approach may be extremely inefficient and that it will be necessary to reduce the size of the search space to something more
tractable.

So. What exactly are we looking for?

- Nodes with the "eng:" prefix
- Nodes that are direct and indirect ancestors of the English nodes



In [4]:
"""
DAG analysis

Check if graph is directed acyclic
"""

is_dag = graph.is_dag
print(is_dag)


False


In [5]:
"""
DAG analysis

I expected that the graph would be acyclic, because logically it doesn't make sense for a word to be an ancestor of a word that is also the first word's ancestor.
However, is_directed_acyclic_graph returned false, so I am going to check for cycles and see if there's something in the source data that can be cleaned up to
enable DAG analysis
"""
cycles = graph.get_cycles()
cycle_count = len(cycles)
print(cycle_count)


3305


In [6]:
"""
DAG analysis

So... looks like all the cycles are self-connected single nodes. I'm guessing if I prune those it will be acyclic.

Tried this, and still there were some loops left. they were in somewhat obscure languages so I decided to just drop them all.
graph.remove_edges_from(nx.selfloop_edges(graph))

"""
graph.remove_cycles()
is_dag = graph.is_dag
print(is_dag)


True


In [7]:
"""
DAG analysis

Ok we have a DAG, time to see what kinds of fun patterns we can find...

"""
longest_path = graph.get_longest_path()
print(longest_path)


['peo: 𐏋', 'pal: 𐭱𐭠𐭤', 'fas: شاه', 'fas: چک', 'ara: صکّ', 'lat: scacus', 'fro: eschec', 'eng: check', 'eng: checked', 'eng: unchecked', 'eng: unch', 'eng: unches']


In [8]:
"""
DAG analysis

Create English-related subgraph

Let's see if we can exploit the DAG properties to filter to chains with English words only
"""
roots = graph.roots()
print(len(roots))
english_nodes = graph.language_nodes("eng")
print(len(english_nodes))
english_graph = graph.language_subgraph("eng", english_nodes)
DirectedGraph.subgraph_info(english_graph)


342736
96374


'Name: \nType: DiGraph\nNumber of nodes: 912653\nNumber of edges: 715717\nAverage in degree:   0.7842\nAverage out degree:   0.7842'

In [9]:
"""
English graph analysis

Inspect highest degree nodes
"""
nodes_by_degree = DirectedGraph.nodes_by_degree(english_graph)
print(nodes_by_degree[0:99])


[('eng: non-', 5748), ('eng: -ly', 5439), ('eng: un-', 4206), ('eng: -like', 2438), ('eng: -er', 2346), ('eng: -less', 1974), ('ita: -mente', 1666), ('eng: -able', 1428), ('eng: time', 1322), ('eng: anti-', 1293), ('eng: -y', 1227), ('eng: poly-', 1004), ('ita: -abile', 880), ('eng: -ic', 812), ('eng: -ed', 754), ('eng: multi-', 715), ('eng: over-', 710), ('spa: -mente', 681), ('fra: -ure', 665), ('eng: -ian', 625), ('eng: -ish', 588), ('eng: -ally', 520), ('eng: -ize', 500), ('eng: dog', 496), ('eng: disease', 493), ('deu: Stein', 485), ('eng: inter-', 461), ('fra: -er', 451), ('eng: hyper-', 445), ('eng: bird', 444), ('fra: -ique', 442), ('ita: -oso', 441), ('eng: be-', 428), ('eng: in-', 426), ('fra: -age', 424), ('ita: -tura', 419), ('eng: micro-', 401), ('eng: de-', 397), ('eng: post-', 395), ('eng: -an', 392), ('eng: -ing', 383), ('eng: -ity', 383), ('eng: -ship', 380), ('gle: -acht', 376), ('ita: -mento', 356), ('eng: Chinese', 346), ('eng: semi-', 327), ('eng: back', 326), ('it