In [1]:
import pandas as pd
import networkx as nx
import pickle
import tqdm
import tables

In [2]:
# Set up folders
import os
if not os.path.exists('data'):
    os.makedirs('data')
if not os.path.exists('figs'):
    os.makedirs('figs')
if not os.path.exists('pickles'):
    os.makedirs('pickles')

In [3]:
def check_cached(filename):
    if os.path.exists(filename):
        return pd.read_pickle(filename)
    return None

In [4]:
# Load data
df = pd.read_csv('data/SummaryIndividuals.csv')
df.head(5)

Unnamed: 0,ID,parent,created,lifeSpan,speed,maxEnergy,kidEnergy,sensors,ancestor,nkids,pgmDeath
0,355,12,0,2,2.0,3,4,5,355,2,20
1,82,2,0,2,2.0,4,1,5,82,3,20
2,100,2,0,2,2.0,3,1,5,100,3,20
3,108,2,0,2,2.0,3,1,5,108,3,20
4,114,2,0,2,2.0,3,1,5,114,3,20


In [9]:
with tables.open_file('pickles/G.h5', mode='w') as f:
    pickle.dump(G, f)

TypeError: file must have a 'write' attribute

In [6]:
# Create a directed graph
G_cache = 'pickles/G.h5'
if not check_cached(G_cache):
    print('Creating graph')
    G = nx.DiGraph()
    for _, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
        G.add_edge(row['parent'], row['ID'])
    with tables.open_file(G_cache, mode='w') as f:
        pickle.dump(G, f)
else:
    print('Loading graph')
    with tables.open_file(G_cache, mode='r') as f:
        G = nx.read_gpickle(f)

Creating graph


100%|██████████| 34112456/34112456 [06:54<00:00, 82281.99it/s]


AttributeError: module 'networkx' has no attribute 'write_gpickle'

In [None]:
# Step 2: Find the longest path for each node
def longest_path_length(G, node):
    lengths = nx.single_source_dijkstra_path_length(G, node)
    return max(lengths.values()) if lengths else 0

path_cache = 'pickles/path_lengths.pkl'

if check_cached(path_cache) is None:
    print('Calculating path lengths')
    path_lengths = {node: longest_path_length(G, node) for node in tqdm.tqdm(G.nodes)}
    with tables.open_file(path_cache, mode='w') as f:
        pickle.dump(path_lengths, f, protocol=pickle.HIGHEST_PROTOCOL)
else:
    print('Loading path lengths')
    with tables.open_file(path_cache, mode='r') as f:
        path_lengths = pickle.load(f)

In [None]:
# Step 3: Sort nodes by path length and select the top 10%
top_10_cache = 'pickles/top_10_percent_nodes.pkl'

if check_cached(top_10_cache) is None:
    print('Sorting nodes')
    sorted_nodes = sorted(path_lengths, key=path_lengths.get, reverse=True)
    top_10_percent_length = int(len(sorted_nodes) * 0.1)
    top_10_percent_nodes = set(sorted_nodes[:top_10_percent_length])
    with tables.open_file(top_10_cache, mode='w') as f:
        pickle.dump(top_10_percent_nodes, f, protocol=pickle.HIGHEST_PROTOCOL)
else:
    print('Loading nodes')
    with tables.open_file(top_10_cache, mode='r') as f:
        top_10_percent_nodes = pickle.load(f)

In [None]:
# Step 4: Extract all nodes involved in these top 10% longest lineages
def extract_lineage(G, node, extracted):
    if node not in G:
        extracted.add(node)
        return
    extracted.add(node)
    for child in G.successors(node):
        extract_lineage(G, child, extracted)

lineage_cache = 'pickles/extracted_nodes.pkl'

if check_cached(lineage_cache) is None:
    print('Extracting lineages')
    extracted_nodes = set()
    for node in tqdm.tqdm(top_10_percent_nodes):
        extract_lineage(G, node, extracted_nodes)
    with tables.open_file(lineage_cache, mode='w') as f:
        pickle.dump(extracted_nodes, f, protocol=pickle.HIGHEST_PROTOCOL)
else:
    print('Loading lineages')
    with tables.open_file(lineage_cache, mode='r') as f:
        extracted_nodes = pickle.load(f)

In [None]:
filtered_cache = 'data/filtered.csv'

if check_cached(filtered_cache) is not None:
    print('Loading filtered data')
    with tables.open_file(filtered_cache, mode='r') as f:
        filtered_df = pd.read_csv(filtered_cache)
else:
    print('Creating filtered data')
    filtered_df = df[df['ID'].isin(extracted_nodes)]
    with tables.open_file(filtered_cache, mode='w') as f:
        filtered_df.write_csv(filtered_cache, index=False)