In [3]:
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities
from networkx.algorithms.community import modularity
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import os


In [4]:
mapping = {
    "Northeast Atlantic": "Atlantic Ocean",
    None: None,
    "Western Mediterranean": "Atlantic Ocean",
    "Atlantic Ocean": "Atlantic Ocean",
    "Lake Malawi": "Water Africa",
    "United States": "North America",
    "Wisconsin": "North America",
    "Zimbabwe": "Africa",
    "Yugoslavia": "Europe",
    "Western Europe": "Europe",
    "Sea of Okhotsk": "Pacific Ocean",
    "Canada": "North America",
    "Japan": "Asia",
    "Pacific Ocean": "Pacific Ocean",
    "Yukon + Northwest Territories": "North America",
    "Utah": "North America",
    "Ontario": "North America",
    "Mexico": "North America",
    "Lake Baikal": "Water Asia",
    "Ukraine, incl. Moldavia": "Europe",
    "Southwest Atlantic": "Atlantic Ocean",
    "Oceanic Islands": "Pacific Ocean",
    "Southwest Indian Ocean": "Indian Ocean",
    "Antarctic Ocean": "Southern Ocean",
    "Philippines": "Asia",
    "Southern S. America": "South America",
    "Queensland": "Australia",
    "Indian Ocean": "Indian Ocean",
    "Tyrrhenian Sea": "Atlantic Ocean",
    "Northern Indian Ocean": "Indian Ocean",
    "Australia": "Australia",
    "Norwegian Sea": "Atlantic Ocean",
    "Uruguay": "South America",
    "Puerto Rico": "North America",
    "Thailand": "Asia",
    "Red Sea": "Indian Ocean",
    "Persian Gulf": "Indian Ocean",
    "Southeast Atlantic": "Atlantic Ocean",
    "South China Sea": "Pacific Ocean",
    "Oriental": "Asia",
    "Pakistan": "Asia",
    "Western coast of India": "Asia",
    "Arctic Ocean": "Arctic Ocean",
    "Western coast of British Isles": "Europe",
    "West Coast of Norway": "Europe",
    "Russia (Asian)": "Asia",
    "South America": "South America",
    "Virginia": "North America",
    "Wyoming": "North America",
    "Armenia": "Asia",
    "Italy": "Europe",
    "Black Sea": "Atlantic Ocean",
    "Sea of Marmara": "Atlantic Ocean",
    "NW African coast": "Africa",
    "Yakut ASSR": "Asia",
    "Caribbean Sea": "Atlantic Ocean",
    "Venezuela": "South America",
    "Gulf of Mexico": "Atlantic Ocean",
    "Southern Indian Ocean": "Indian Ocean",
    "Caspian Sea": "Water Asia",
    "New Brunswick + Nova Scotia": "North America",
    "Finland": "Europe",
    "Poland": "Europe",
    "Lithuania": "Europe",
    "South Korea": "Asia",
    "North Sea": "Atlantic Ocean",
    "Georgia (USSR)": "Europe",
    "Uzbekistan": "Asia",
    "Tunisia": "Africa",
    "Washington": "North America",
    "Tasmania": "Australia",
    "South Australia": "Australia",
    "Victoria": "Australia",
    "Western Australia": "Australia",
    "New South Wales": "Australia",
    "Balearic Islands": "Europe",
    "Vietnam": "Asia",
    "Austria": "Europe",
    "Middle East + Asia Minor": "Asia",
    "Czechoslovakia": "Europe",
    "United Arab Emirates": "Asia",
    "Taiwan": "Asia",
    "Southeast Asia": "Asia",
    "Central America": "North America",
    "Cuba": "North America",
    "Antilles": "North America",
    "Bulgaria": "Europe",
    "Turkmenistan": "Asia",
    "Spain + Andalusia": "Europe",
    "Hungary": "Europe",
    "Manitoba": "North America",
    "Lake Superior": "Water North America",
    "West Virginia": "North America",
    "Lake Huron": "Water North America",
    "Saskatchewan": "North America",
    "Quebec": "North America",
    "Uganda + Burundi + Rwanda": "Africa",
    "Zambia": "Africa",
    "Sudan": "Africa",
    "Zaire": "Africa",
    "Central Africa": "Africa",
    "Egypt": "Africa",
    "Kenya": "Africa",
    "Costa Rica": "North America",
    "Nicaragua": "North America",
    "Chad": "Africa",
    "Ghana": "Africa",
    "Far East": "Asia",
    "Mongolia": "Asia",
    "Russia (European)": "Europe",
    "Tadzhikistan": "Asia",
    "Tanzania": "Africa",
    "Ethiopia (incl. Dhibouti)": "Africa",
    "Sea of Japan": "Pacific Ocean",
    "Finno-Karelian ASSR": "Europe",
    "Greece": "Europe",
    "South Sandwich Islands": "Antarctica",
    "South Georgia": "Antarctica",
    "Kazakstan": "Asia",
    "Kirgizia": "Asia",
    "Panama": "North America",
    "North Africa": "Africa",
    "Namibia": "Africa",
    "South Africa": "Africa",
    "New Zealand": "Australia",
    "Nigeria": "Africa",
    "Senegal + The Gambia + Guinea-Bassau": "Africa",
    "Sri Lanka": "Asia",
    "Papua New Guinea": "Australia",
    "Sardinia": "Europe",
    "Somalia": "Africa",
    "Turkey": "Asia",
    "Lake Tanganyika": "Water Africa",
    "Byelorussia": "Europe",
    "Lesser Antilles": "North America",
    "Dominican Republic + Haiti": "North America",
    "Iceland": "Europe",
    "Greenland": "North America",
    "Switzerland + Lichtenstein": "Europe",
    "Benin + Togo + Ghana": "Africa",
    "Azov Sea": "Atlantic Ocean",
    "Norway": "Europe",
    "Eastern Mediterranean": "Atlantic Ocean",
    "Azerbaidzan": "Asia",
    "Sabah": "Asia",
    "Lake Michigan": "Water North America",
    "Transvaal": "Africa",
    "Ivory Coast": "Africa",
    "Gabon + Congo + Equatorial Guinea": "Africa",
    "Surinam": "South America",
    "Sumatra": "Asia",
    "Togo": "Africa",
    "Sierra Leone": "Africa",
    "Society Islands (incl. Tahiti)": "Pacific Ocean",
    "Malawi": "Africa",
    "Saudi Arabia": "Asia",
    "Syria": "Asia",
    "Southern Yemen + Yemen": "Asia",
    "Mozambique": "Africa",
    "Spitzbergen (Svalbard)": "Europe",
    "Sweden": "Europe",
    "Nova Scotia": "North America",
    "Bahamas": "North America",
    "Haiti": "North America",
    "Cayman Islands": "North America",
    "Jamaica": "North America",
    "Mali + Burkina Faso": "Africa",
    "Botswana": "Africa",
    "Newfoundland": "North America",
    "Central African Republic": "Africa",
    "Honduras": "North America",
    "Guatemala": "North America",
    "Corsica": "Europe",
    "Hong Kong": "Asia",
    "Gibraltar": "Europe",
    "Guinea + Sierra Leone + Liberia": "Africa",
    "Samoa Islands": "Pacific Ocean",
    "Solomon Islands": "Pacific Ocean",
    "Seychelles": "Africa",
    "West Irian": "Asia",
    "Portugal": "Europe",
    "Labrador + Newfoundland": "North America",
    "Denmark": "Europe",
    "Liberia": "Africa",
    "The Gambia": "Africa",
}

#Update any keys with 'Water' to just the continent they reference
updated_mapping = {key: value.replace("Water ", "") if value and "Water " in value else value for key, value in mapping.items()}

mapping = updated_mapping




In [5]:
import pandas as pd

# Step 1: Read the Nodes Data
nodes_df = pd.read_csv('nodes.csv')  # Replace 'nodes.csv' with your actual file name

# Ensure 'is_host' is an integer (0 or 1)
nodes_df[' is_host'] = nodes_df[' is_host'].astype(int)

# Create mappings
node_is_host = nodes_df.set_index('# index')[' is_host'].to_dict()
node_locality = nodes_df.set_index('# index')[' locality'].to_dict()

# Step 2: Read the Edges Data
edges_df = pd.read_csv('edges.csv')  # Replace 'edges.csv' with your actual file name

# Step 3: Process the Edges
parasite_to_localities = {}

for idx, row in edges_df.iterrows():
    source = row['# source']
    target = row[' target']
    
    # Get 'is_host' status; default to None if not found
    source_is_host = node_is_host.get(source)
    target_is_host = node_is_host.get(target)
    
    # Skip if either node is not in the nodes data
    if source_is_host is None or target_is_host is None:
        continue
    
    # Check if one node is host and the other is parasite
    if source_is_host != target_is_host:
        if source_is_host == 1:  # Source is host
            host_node = source
            parasite_node = target
        else:  # Target is host
            host_node = target
            parasite_node = source
        
        # Get locality of the host
        host_locality = node_locality.get(host_node)
        if host_locality:
            # Initialize the set if not already done
            parasite_to_localities.setdefault(parasite_node, set()).add(host_locality)
    else:
        # Both nodes are hosts or both are parasites; skip
        continue

# Step 4: Create the Output Data
output_data = []
for parasite_node, localities in parasite_to_localities.items():
    output_data.append({
        'parasite_index': parasite_node,
        'connected_localities': ';'.join(localities)
    })

output_df = pd.DataFrame(output_data)

# Step 5: Save the Data to CSV
output_df.to_csv('parasite_localities.csv', index=False)


In [6]:
parasites_localities_df = pd.read_csv('parasite_localities.csv')  # Replace with your actual file name
import numpy as np
from collections import Counter

def map_localities(localities, mapping):
    # Split the localities by semicolon
    if pd.isna(localities):
        return None  # Handle NaN values
    localities_list = localities.split(';')
    
    # Map each locality using the mapping dictionary
    mapped_localities = [mapping.get(loc.strip(), loc) for loc in localities_list]
    
    # Count the occurrences of each locality
    locality_counts = Counter(mapped_localities)
    
    # Find the most common locality
    most_common_locality = locality_counts.most_common(1)[0][0] if locality_counts else None
    return most_common_locality

parasites_localities_df['connected_localities'] = parasites_localities_df['connected_localities'].apply(
    lambda x: map_localities(x, mapping)
)

# Step 4: Save the updated DataFrame to a new CSV file
parasites_localities_df.to_csv('parasites_localities_mapped.csv', index=False)

In [7]:
import pandas as pd
import networkx as nx

# Read edge list and create the graph
resource_all_df = pd.read_csv('resource_allocation_naive.csv')
resource_all_df.rename(columns={'src': 'source', 'trg': 'target', 'nij': 'weight'}, inplace=True)
G = nx.from_pandas_edgelist(resource_all_df, source='source', target='target', edge_attr='weight')

# Read parasites data
parasites = pd.read_csv('parasites_localities_mapped.csv')

# Ensure no NaN values in 'mapped_locality'
parasites['connected_localities'].fillna('Unknown', inplace=True)

# Create locality_dict
locality_dict = parasites.set_index('parasite_index')['connected_localities'].to_dict()

# Verify all graph nodes exist in locality_dict
unmatched_nodes = [node for node in G.nodes if node not in locality_dict]
if unmatched_nodes:
    print("Nodes in G but not in locality_dict:", unmatched_nodes)

# Add attributes to the graph nodes
nx.set_node_attributes(G, locality_dict, 'locality')

# Verify nodes with missing attributes
missing_attributes = [node for node, attributes in G.nodes(data=True) if 'locality' not in attributes or attributes['locality'] is None]
if missing_attributes:
    print("Nodes missing locality attribute:", missing_attributes)

# Print each node and its attributes
for node, attributes in G.nodes(data=True):
    print(f"Node {node}: {attributes}")

from networkx import attribute_assortativity_coefficient

edge_weights = nx.get_edge_attributes(G, 'weight')

# Filter edges with weights above the threshold
filtered_edges = [(u, v, w) for (u, v), w in edge_weights.items() if w >= 0.01745]

filtered_G = nx.Graph()  # Use nx.DiGraph() if the original graph is directed
filtered_G.add_weighted_edges_from(filtered_edges)    

# Transfer 'locality' attributes to filtered_G
for node in filtered_G.nodes:
    if node in G.nodes and 'locality' in G.nodes[node]:
        filtered_G.nodes[node]['locality'] = G.nodes[node]['locality']
    else:
        filtered_G.nodes[node]['locality'] = 'Unknown'  # Assign a default value if locality is missing

# Check locality diversity
localities = set(nx.get_node_attributes(filtered_G, 'locality').values())
print("Unique localities in filtered_G:", localities)

# Compute assortativity
if len(localities) > 1 and filtered_G.number_of_edges() > 0:
    result = attribute_assortativity_coefficient(filtered_G, 'locality')
    print("Assortativity coefficient for filtered_G:", result)
else:
    print("Cannot compute assortativity: Not enough diversity or edges.")






""" edge_weights = nx.get_edge_attributes(G, 'weight')

# Filter edges with weights above the threshold
filtered_edges = [(u, v, w) for (u, v), w in edge_weights.items() if w >= 0.01745]

filtered_G = nx.Graph()  # Use nx.DiGraph() if the original graph is directed
filtered_G.add_weighted_edges_from(filtered_edges)

from networkx import attribute_assortativity_coefficient

result = attribute_assortativity_coefficient(G, 'locality')

print(result) """


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  parasites['connected_localities'].fillna('Unknown', inplace=True)


Node 1: {'locality': 'Atlantic Ocean'}
Node 3301: {'locality': 'Pacific Ocean'}
Node 10918: {'locality': 'Atlantic Ocean'}
Node 16: {'locality': 'Atlantic Ocean'}
Node 54: {'locality': 'Europe'}
Node 58: {'locality': 'Europe'}
Node 72: {'locality': 'Atlantic Ocean'}
Node 89: {'locality': 'North America'}
Node 383: {'locality': 'North America'}
Node 454: {'locality': 'Europe'}
Node 503: {'locality': 'Europe'}
Node 517: {'locality': 'Europe'}
Node 647: {'locality': 'Europe'}
Node 2789: {'locality': 'Atlantic Ocean'}
Node 2881: {'locality': 'Europe'}
Node 2982: {'locality': 'Europe'}
Node 3067: {'locality': 'Atlantic Ocean'}
Node 3698: {'locality': 'Atlantic Ocean'}
Node 9506: {'locality': 'Europe'}
Node 9509: {'locality': 'Atlantic Ocean'}
Node 9518: {'locality': 'Atlantic Ocean'}
Node 9528: {'locality': 'Atlantic Ocean'}
Node 9559: {'locality': 'Europe'}
Node 9591: {'locality': 'Atlantic Ocean'}
Node 9730: {'locality': 'Atlantic Ocean'}
Node 10146: {'locality': 'Atlantic Ocean'}
Node 10

" edge_weights = nx.get_edge_attributes(G, 'weight')\n\n# Filter edges with weights above the threshold\nfiltered_edges = [(u, v, w) for (u, v), w in edge_weights.items() if w >= 0.01745]\n\nfiltered_G = nx.Graph()  # Use nx.DiGraph() if the original graph is directed\nfiltered_G.add_weighted_edges_from(filtered_edges)\n\nfrom networkx import attribute_assortativity_coefficient\n\nresult = attribute_assortativity_coefficient(G, 'locality')\n\nprint(result) "

In [8]:

from networkx.algorithms.community import greedy_modularity_communities
from sklearn.metrics import normalized_mutual_info_score
import numpy as np

communities = greedy_modularity_communities(filtered_G, weight=None)


# Detect communities
communities = list(greedy_modularity_communities(filtered_G, weight=None))

# Create a node-to-community mapping
community_mapping = {}
for i, community in enumerate(communities):
    for node in community:
        community_mapping[node] = i

# Assign each node to its community
nx.set_node_attributes(filtered_G, community_mapping, 'community')

# Extract attributes for mutual information calculation
node_list = list(filtered_G.nodes)
community_labels = [filtered_G.nodes[node]['community'] for node in node_list]
locality_labels = [filtered_G.nodes[node]['locality'] for node in node_list]

# Convert locality labels to numeric values
locality_to_numeric = {loc: i for i, loc in enumerate(set(locality_labels))}
numeric_locality_labels = [locality_to_numeric[loc] for loc in locality_labels]

# Compute normalized mutual information
nmi = normalized_mutual_info_score(numeric_locality_labels, community_labels)

print("Normalized Mutual Information between communities and localities:", nmi)


Normalized Mutual Information between communities and localities: 0.3445229781351275


In [13]:
from networkx.algorithms.community import modularity
result = modularity(filtered_G, communities)

print(result)

from sklearn.metrics import adjusted_mutual_info_score

# Calculate adjusted mutual information
ami = adjusted_mutual_info_score(numeric_locality_labels, community_labels)

print("Adjusted Mutual Information between communities and localities:", ami)



0.8145017750309694
Adjusted Mutual Information between communities and localities: 0.30716600805437827
