In [1]:
%%capture
import pickle
import tqdm
#import pandas as pd
import networkx as nx
import cudf as pd
import nx_cugraph as cnx

In [3]:
# Set up folders
import os
if not os.path.exists('data'):
    os.makedirs('data')
if not os.path.exists('figs'):
    os.makedirs('figs')
if not os.path.exists('pickles'):
    os.makedirs('pickles')

In [4]:
def check_cached(path):
    if os.path.exists(path):
        return True
    return None

In [5]:
# Load data
df = pd.read_csv('data/SummaryIndividuals.csv')
df.head(5)

Unnamed: 0,ID,parent,created,lifeSpan,speed,maxEnergy,kidEnergy,sensors,ancestor,nkids,pgmDeath
0,355,12,0,2,2.0,3,4,5,355,2,20
1,82,2,0,2,2.0,4,1,5,82,3,20
2,100,2,0,2,2.0,3,1,5,100,3,20
3,108,2,0,2,2.0,3,1,5,108,3,20
4,114,2,0,2,2.0,3,1,5,114,3,20


In [6]:
# Create graph
G = nx.DiGraph(backend='cugraph')
G.add_edges_from(df.to_pandas()[['parent', 'ID']].itertuples(index=False), backend='cugraph')

AttributeError: module 'networkx' has no attribute 'info'

In [7]:
print(f'Number of nodes: {G.number_of_nodes()}')
print(f'Number of edges: {G.number_of_edges()}')

Number of nodes: 34112607
Number of edges: 34112456


In [10]:
# Step 2: Find the longest path for each node
def longest_path_length(G, node):
    lengths = nx.single_source_dijkstra_path_length(G, node)
    return max(lengths.values()) if lengths else 0

path_cache = 'pickles/path_lengths.pkl'

if check_cached(path_cache) is None:
    print('Calculating path lengths')
    path_lengths = {node: longest_path_length(G, node) for node in tqdm.tqdm(G.nodes)}
    with tables.open_file(path_cache, mode='w') as f:
        pickle.dump(path_lengths, f, protocol=pickle.HIGHEST_PROTOCOL)
else:
    print('Loading path lengths')
    with tables.open_file(path_cache, mode='r') as f:
        path_lengths = pickle.load(f)

Calculating path lengths



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

NameError: name 'tables' is not defined

In [12]:
with open(path_cache, mode='wb') as f:
    pickle.dump(path_lengths, f, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Step 3: Sort nodes by path length and select the top 10%
top_10_cache = 'pickles/top_10_percent_nodes.pkl'

if check_cached(top_10_cache) is None:
    print('Sorting nodes')
    sorted_nodes = sorted(path_lengths, key=path_lengths.get, reverse=True)
    top_10_percent_length = int(len(sorted_nodes) * 0.1)
    top_10_percent_nodes = set(sorted_nodes[:top_10_percent_length])
    with open(top_10_cache, mode='wb') as f:
        pickle.dump(top_10_percent_nodes, f, protocol=pickle.HIGHEST_PROTOCOL)
else:
    print('Loading nodes')
    with open(top_10_cache, mode='rb') as f:
        top_10_percent_nodes = pickle.load(f)

Sorting nodes


In [21]:
# Step 4: Extract all nodes involved in these top 10% longest lineages
def extract_lineage_iterative(G, nodes):
    extracted = set()
    stack = list(nodes)
    
    while stack:
        node = stack.pop()
        if node not in G:
            extracted.add(node)
            continue
        extracted.add(node)
        stack.extend(G.successors(node))
    
    return extracted

lineage_cache = 'pickles/extracted_nodes.pkl'

# Load or compute the extracted nodes
if check_cached(lineage_cache) is None:
    print('Extracting lineages')
    extracted_nodes = set()
    for node in tqdm.tqdm(top_10_percent_nodes):
        extracted_nodes.update(extract_lineage_iterative(G, [node]))
    with open(lineage_cache, mode='wb') as f:
        pickle.dump(extracted_nodes, f, protocol=pickle.HIGHEST_PROTOCOL)
else:
    print('Loading lineages')
    with open(lineage_cache, mode='rb') as f:
        extracted_nodes = pickle.load(f)

Extracting lineages



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [22]:
filtered_cache = 'data/filtered.csv'

if check_cached(filtered_cache) is not None:
    print('Loading filtered data')
    with open(filtered_cache, mode='r') as f:
        filtered_df = pd.read_csv(filtered_cache)
else:
    print('Creating filtered data')
    filtered_df = df[df['ID'].isin(extracted_nodes)]
    with open(filtered_cache, mode='w') as f:
        filtered_df.write_csv(filtered_cache, index=False)

Creating filtered data


AttributeError: DataFrame object has no attribute write_csv