In [10]:
import os
import sys
import numpy as np
import networkx as nx
import pandas as pd
sys.path.append(os.path.join(os.getcwd(), 'data'))
from common_localities import locality_to_common_locality


# Node table: the first CSV column becomes the index (used below as the node id).
nodes_df = pd.read_csv("../data/nodes.csv", index_col=0)
# Edge list; its endpoint columns are addressed as "# source" and " target".
edge_df = pd.read_csv("../data/edges.csv")
# Rows whose " is_host" flag is 0 (note the leading space in the column name).
parasites_df = nodes_df.loc[nodes_df[" is_host"].eq(0)]


# Attribute-only graph: one node per parasite row, no edges yet.
G = nx.Graph()

# Index the edge list once, so that each parasite's neighbours are two
# dictionary lookups instead of two full scans of edge_df per parasite.
# A parasite id may sit in either endpoint column, hence one index per column.
sources_by_target = {}
targets_by_source = {}
for source_id, target_id in zip(edge_df["# source"], edge_df[" target"]):
    sources_by_target.setdefault(target_id, []).append(source_id)
    targets_by_source.setdefault(source_id, []).append(target_id)

for id_parasite, parasite in parasites_df.iterrows():
    # Ids of every node linked to this parasite: first the sources of edges
    # pointing at it, then the targets of edges starting from it.
    animals_idx = sources_by_target.get(id_parasite, []) + targets_by_source.get(id_parasite, [])

    # Author's notes on the data:
    # - the group labels look swapped: the parasite group is stored on the
    #   animal rows and the animal group on the parasite rows;
    # - it is unknown how that assignment was decided;
    # - unlike the original data, there is no information on further groups.
    #
    # DataFrame.mode() works column by column; row 0 holds the first mode of
    # each column, i.e. the most frequent locality and group among neighbours.
    # NOTE(review): a parasite with no edges yields an empty mode() frame and
    # would presumably fail on .loc[0] - confirm every parasite has an edge.
    most_frequent_locality, most_frequent_parasite_group = (
        nodes_df.loc[animals_idx, [' locality', ' group']].mode().loc[0]
    )
    # Collapse the raw locality label onto its common name.
    most_frequent_locality = locality_to_common_locality[most_frequent_locality]
    # Because of the swap noted above, the parasite row carries the animals' group.
    animals_group = parasite[' group']
    G.add_node(
        id_parasite,
        name=parasite[' name'],
        most_frequent_locality=most_frequent_locality,
        most_frequent_parasite_group=most_frequent_parasite_group,
        animals_group=animals_group
    )
    
# Two steps:
# first create the nodes with attributes only, then connect them.
 

In [12]:
import os

# Folder holding the trimmed edge lists, one CSV per network
directory = '../data/trimmed_networks'

# Loaded edge lists, keyed by the CSV file name (extension included)
dataframes = {}

# Read every CSV found directly in the folder; any other entry is skipped
for filename in os.listdir(directory):
    if not filename.endswith('.csv'):
        continue
    filepath = os.path.join(directory, filename)
    dataframes[filename] = pd.read_csv(filepath)

# Report which files were picked up
print("Loaded dataframes:", list(dataframes))


# Where the per-network GraphML files are written; created on demand so a
# fresh checkout does not fail on the first write.
output_dir = '../data/trimmed_graphmls'
os.makedirs(output_dir, exist_ok=True)

# For each trimmed network: start from a fresh copy of the attribute-only
# graph G, add that network's edges, and save the result as GraphML.
for filename, df in dataframes.items():
    G_copy = G.copy()

    # Drop self loops (src == trg) with one vectorised comparison, then hand
    # the remaining (src, trg) pairs to networkx in a single call. Reading the
    # two columns directly also keeps their own dtype, whereas iterrows() can
    # upcast ids to float when another column of the frame is float.
    # NOTE(review): endpoints missing from G are created by networkx as new,
    # attribute-less nodes - confirm every src/trg is a parasite id.
    edges = df.loc[df['src'] != df['trg'], ['src', 'trg']]
    G_copy.add_edges_from(edges.itertuples(index=False, name=None))

    # Total edge count of the copy; in the cells shown here G only ever
    # receives nodes, so this equals the distinct edges taken from this file.
    print("Number of edges added:", G_copy.number_of_edges())

    # splitext strips only the final extension, so dots elsewhere in the file
    # name are preserved (split('.')[0] would cut at the first dot).
    name = os.path.splitext(filename)[0]
    nx.write_graphml(G_copy, os.path.join(output_dir, name + '.graphml'))

Loaded dataframes: ['1_simple_disparity_filter.csv', '2_simple_disparity_filter.csv', 'resource_allocation_naive.csv']
Number of edges added: 443701
Number of edges added: 336278
Number of edges added: 15588


In [14]:
# Attribute dict of node 1 in G_copy, i.e. the graph left over from the last
# iteration of the export loop.
G_copy.nodes[1]

{'name': 'Acanthocephaloides incrassatus',
 'most_frequent_locality': 'Atlantic Ocean',
 'most_frequent_parasite_group': 'Trematoda',
 'animals_group': 'Teleostei'}