In [2]:
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities
from networkx.algorithms.community import modularity
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import os


In [71]:


directory = '/Users/sunechristiansen/sune/network_analysis/project/parasites_network_analysis/projections_with_backbonings/projection_parasites/resource_allocation_naive.csv'

In [80]:
df = pd.read_csv(directory)
df.rename(columns={'src': 'source', 'trg': 'target', 'nij': 'weight'}, inplace=True)
df.drop(columns=['Unnamed: 0'], inplace=True)

In [82]:
# Create the graph with weights
G = nx.from_pandas_edgelist(df, source='source', target='target', edge_attr='weight')

Merging two dataframes:
- we use locality from nodes.csv and edges.csv
- groups we take from edges.csv and nodes.csv

First Part: extract most frequent localities from nodes and edges .csv

In [93]:
nodes_csv = pd.read_csv('../data/nodes.csv')
edges_csv = pd.read_csv('../data/edges.csv')
nodes_csv.head()

Unnamed: 0,# index,name,full_name,locality,group,is_host,_pos
0,0,Squalus blainville,"Proleptus robustus (van Beneden, 1871)",Northeast Atlantic,Nematoda,1,"array([32.67422356, 46.0460785 ])"
1,1,Acanthocephaloides incrassatus,,,Teleostei,0,"array([39.44431659, 25.20409242])"
2,2,Gobius bucchichi,Deretrema (Spinoderetrema) scorpaenicola Barto...,Western Mediterranean,Trematoda,1,"array([43.9399518 , 28.96108572])"
3,3,Acanthocephaloides propinquus,,,Teleostei,0,"array([37.69593368, 27.68332545])"
4,4,Gobius cruentatus,"Acanthocephaloides propinquus (Dujardin, 1845)",Atlantic Ocean,Acanthocephala,1,"array([30.49701772, 30.25827091])"


In [94]:

edges_csv.head()

Unnamed: 0,# source,target
0,0,1
1,0,1
2,0,3301
3,0,10918
4,0,10918


In [98]:
import sys
parasites_df = nodes_csv[nodes_csv[" is_host"] == 0].copy()

sys.path.append(os.path.join(os.getcwd(), '../', 'data'))
from common_localities import locality_to_common_locality
most_frequent_localities = []
for id_parasite, parasite in parasites_df.iterrows():
    animals_idx = list(edges_csv[edges_csv[" target"] == id_parasite]["# source"]) + list(
        edges_csv[edges_csv["# source"] == id_parasite][" target"]
    )
    # so unfortunately, we have parasites group assigned to animals rows, and vice versa.
    # so, unfortunately, we don't know how they decided on this assignment.
    # also, unfortunately, as opposed to original data: there is no info on more groups!
    most_frequent_locality = nodes_csv.loc[animals_idx][' locality'].mode().loc[0]
    most_frequent_locality = locality_to_common_locality[most_frequent_locality]
    most_frequent_localities.append(most_frequent_locality)

 

In [99]:
parasites_df.drop(columns=[' full_name', ' locality', ' group', ' is_host', ' _pos'], inplace=True)
parasites_df.rename(columns={"# index": "nodes_index"}, inplace=True)

In [103]:
parasites_df.head()

Unnamed: 0,nodes_index,name
1,1,Acanthocephaloides incrassatus
3,3,Acanthocephaloides propinquus
9,9,Acanthogyrus lizae
11,11,Neoechinorhynchus agilis
14,14,Rhadinorhynchus pristis


In [104]:
parasites_df['most_frequent_locality'] = most_frequent_localities
parasites_df

Unnamed: 0,nodes_index,name,most_frequent_locality
1,1,Acanthocephaloides incrassatus,Atlantic Ocean
3,3,Acanthocephaloides propinquus,Atlantic Ocean
9,9,Acanthogyrus lizae,Atlantic Ocean
11,11,Neoechinorhynchus agilis,Atlantic Ocean
14,14,Rhadinorhynchus pristis,Atlantic Ocean
...,...,...,...
30508,30508,Tamerlania meruli,Europe
30509,30509,Tanaisia elliptica,Europe
30513,30513,Cynodiplostomum namrui,Africa
30514,30514,Fasciola nyanzae,Africa


Second part: extract correct groups of parasties and most frequent groups of animals species from original data

In [87]:
original_data = pd.read_csv('../data/helminths.csv', encoding='ISO-8859-1')
original_data.head()

Unnamed: 0.1,Unnamed: 0,Host,Parasite,ParasiteFull,group,locality,hostgroup
0,2,Squalus blainville,Acanthocephaloides incrassatus,"Acanthocephaloides incrassatus (Molin, 1858)",Acanthocephala,Adriatic Sea,Chondrichthyes
1,3,Gobius bucchichi,Acanthocephaloides propinquus,"Acanthocephaloides propinquus (Dujardin, 1845)",Acanthocephala,Adriatic Sea,Teleostei
2,4,Gobius cruentatus,Acanthocephaloides propinquus,"Acanthocephaloides propinquus (Dujardin, 1845)",Acanthocephala,Adriatic Sea,Teleostei
3,5,Gobius niger,Acanthocephaloides propinquus,"Acanthocephaloides propinquus (Dujardin, 1845)",Acanthocephala,Adriatic Sea,Teleostei
4,6,Lesueurigobius friesii,Acanthocephaloides propinquus,"Acanthocephaloides propinquus (Dujardin, 1845)",Acanthocephala,Adriatic Sea,Teleostei


In [113]:
group_hostgroup_original_data = original_data.groupby(by=['Parasite', 'group'])['hostgroup'].agg(pd.Series.mode).to_frame().reset_index()
group_hostgroup_original_data.rename(columns={'hostgroup': 'most_frequent_hostgroup'}, inplace=True)
len(group_hostgroup_original_data)

18643

In [114]:
group_hostgroup_original_data.head()

Unnamed: 0,Parasite,group,most_frequent_hostgroup
0,Abbreviata abbreviata,Nematoda,Reptilia
1,Abbreviata achari,Nematoda,Reptilia
2,Abbreviata affinis,Nematoda,[]
3,Abbreviata anomala,Nematoda,Reptilia
4,Abbreviata antarctica,Nematoda,Reptilia


In [91]:
parasites_df

Unnamed: 0_level_0,name,most_frequent_locality
# index,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Acanthocephaloides incrassatus,Atlantic Ocean
3,Acanthocephaloides propinquus,Atlantic Ocean
9,Acanthogyrus lizae,Atlantic Ocean
11,Neoechinorhynchus agilis,Atlantic Ocean
14,Rhadinorhynchus pristis,Atlantic Ocean
...,...,...
30508,Tamerlania meruli,Europe
30509,Tanaisia elliptica,Europe
30513,Cynodiplostomum namrui,Africa
30514,Fasciola nyanzae,Africa


In [115]:
# Merge the DataFrames based on the conditions
merged_df = parasites_df.merge(
    group_hostgroup_original_data,
    left_on=[' name'],  # Columns in attribute_df
    right_on=['Parasite'],  # Columns in extra_attributes
    how='inner',  # Use 'inner' to keep only matching rows
)

# Display the merged DataFrame
merged_df.drop(columns=['Parasite'], inplace=True)
merged_df.head()

Unnamed: 0,nodes_index,name,most_frequent_locality,group,most_frequent_hostgroup
0,1,Acanthocephaloides incrassatus,Atlantic Ocean,Acanthocephala,Teleostei
1,3,Acanthocephaloides propinquus,Atlantic Ocean,Acanthocephala,Teleostei
2,9,Acanthogyrus lizae,Atlantic Ocean,Acanthocephala,Maxillopoda
3,11,Neoechinorhynchus agilis,Atlantic Ocean,Acanthocephala,Teleostei
4,14,Rhadinorhynchus pristis,Atlantic Ocean,Acanthocephala,Teleostei


In [116]:
merged_df.to_csv('../data/final_metadata.csv')

In [None]:

for i in range(len(files)):
    # Read the CSV file
    df = pd.read_csv('/Users/sunechristiansen/sune/network_analysis/project/parasites_network_analysis/projections_with_backbonings/' + files[i])

    # Rename columns
    df.rename(columns={'src': 'source', 'trg': 'target', 'nij': 'weight'}, inplace=True)

    # Create the graph with weights
    G = nx.from_pandas_edgelist(df, source='source', target='target', edge_attr='weight')

    print(f'name: {files[i]}, edges: {len(G.edges(data=True))}, nodes: {len(G.nodes)}')


name: simple_disparity_filter.csv, edges: 598944, nodes: 17426
name: pearson_noise_corrected.csv, edges: 109730, nodes: 15369
name: hyperbolic_high_salience_skeleton.csv, edges: 598944, nodes: 17426
name: resource_allocation_naive.csv, edges: 598944, nodes: 17426
name: simple_noise_corrected.csv, edges: 598944, nodes: 17426


In [2]:
resource_all_df = pd.read_csv('resource_allocation_naive.csv')
# Rename columns
resource_all_df.rename(columns={'src': 'source', 'trg': 'target', 'nij': 'weight'}, inplace=True)

# Create the graph with weights
G = nx.from_pandas_edgelist(resource_all_df, source='source', target='target', edge_attr='weight')

In [None]:


""" # Find all connected components
connected_components = list(nx.connected_components(filtered_G))

# Find the largest connected component
largest_component = max(connected_components, key=len)

# Create a subgraph of the largest connected component
largest_subgraph = filtered_G.subgraph(largest_component) """

# Print details
print(f"Number of nodes in largest component: {len(largest_subgraph.nodes)}")
print(f"Number of edges in largest component: {len(largest_subgraph.edges)}")


Number of nodes in largest component: 15899
Number of edges in largest component: 438649


In [3]:
# Brute force find highest modularity resource allocation plus threshold:
for i in range(100):
    threshold = i/100
    # Get all edge weights
    edge_weights = nx.get_edge_attributes(G, 'weight')

    # Filter edges with weights above the threshold
    filtered_edges = [(u, v, w) for (u, v), w in edge_weights.items() if w >= threshold]

    # Create a new graph with the filtered edges
    filtered_G = nx.Graph()  # Use nx.DiGraph() if the original graph is directed
    filtered_G.add_weighted_edges_from(filtered_edges)

    communities = greedy_modularity_communities(filtered_G, weight=None)

    modularity_r = modularity(filtered_G, communities)
    print("threshold: ", threshold, " & modularity: ", modularity_r)
    

KeyboardInterrupt: 