In [None]:
"""
Set up imports and constants
"""

from pathlib import Path

from polyphasia.loader import load_to_pandas, clean_data_frame
from polyphasia.constants import RELATIVE_PATH_TO_SOURCE
from polyphasia.graph import UndirectedGraph

# Input/output locations are resolved relative to the parent of the current
# working directory, so the notebook is expected to be launched from a
# sub-directory of the project (presumably the notebooks folder — TODO confirm).
base_directory = Path.cwd().parent.absolute()
source_data_path = base_directory.joinpath(RELATIVE_PATH_TO_SOURCE)

In [2]:
"""
Load and clean data
"""

# Read the source file into pandas and clean it in one expression; only the
# cleaned frame is bound to a name, so re-running this cell always starts again
# from the file on disk.
# NOTE(review): the saved output of this cell shows a pandas
# SettingWithCopyWarning. This cell only calls the two helpers below, so the
# chained assignment is presumably inside one of them — confirm and fix it there.
data_frame = clean_data_frame(load_to_pandas(source_data_path))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [3]:
"""
Construct networkx graph
"""

# Build the project's UndirectedGraph wrapper from the cleaned data frame and
# print its summary (the recorded output reports "Type: Graph").
# NOTE(review): this cell constructs an undirected graph directly. If edge
# direction needs to be preserved for later analysis, a directed graph would
# have to be built instead — confirm which is intended.
graph = UndirectedGraph(data_frame)
print(graph.info)


Name: 
Type: Graph
Number of nodes: 2743118
Number of edges: 2690896
Average degree:   1.9619


In [4]:
"""
Undirected graph analysis

Check for connected components
"""
# Count the connected components of the graph. Components unrelated to English
# etymology may be candidates for pruning later on.
# len() is applied to the result, so connected_components is presumably an
# already-materialised, sized collection rather than a generator — TODO confirm.
conn_components = graph.connected_components
connected_component_count = len(conn_components)
print(connected_component_count)


209375


In [None]:
"""
Filter to English nodes and their BFS trees


Interestingly, these results do not seem to match what was produced by the DAG root approach.


Type: DiGraph
Number of nodes: 912653
Number of edges: 715717
Average in degree:   0.7842
Average out degree:   0.7842

Type: DiGraph
Number of nodes: 433240
Number of edges: 425233
Average in degree:   0.9815
Average out degree:   0.9815



"""
# Select the nodes for language code "eng", then build the subgraph for them.
bfs_english_nodes = graph.language_nodes("eng")
bfs_english_graph = graph.language_subgraph("eng", bfs_english_nodes)

# Summarise the subgraph with the class-level helper and print the summary.
bfs_english_info = UndirectedGraph.subgraph_info(bfs_english_graph)
print(bfs_english_info)


In [None]:
"""
Check most connected nodes

So... this does not match the DAG results, but in an intriguing way. These are exclusively English words, whereas the DAG roots included a mix of other languages as well.

Some nodes are the same (e.g., "eng: non-"), but the degrees differ slightly: for "non-", 5746 here versus 5748 in the DAG results.

I wonder why this approach is less inclusive than the DAG approach...

"""
# Degree listing for the English subgraph (presumably sorted with the highest
# degree first — TODO confirm against UndirectedGraph.nodes_by_degree).
nodes_by_degree = UndirectedGraph.nodes_by_degree(bfs_english_graph)

# Print the leading entries. The slice 0:99 yields at most 99 items.
# NOTE(review): if the top 100 were intended, the upper bound should be 100.
most_connected_nodes = nodes_by_degree[0:99]
print(most_connected_nodes)