In [1]:
import os
import pandas as pd
import numpy as np
import re
from typing import Tuple, Optional

In [2]:
# Directory holding the "binary" and "weighted" network sub-folders that are read below.
networks_dir = "/groups/itay_mayrose/halabikeren/plant_pollinator_networks/networks/web_of_life/"
# CSV with the exotic-species records (loaded into `exotic_species`).
exotic_species_path = "../raw_CSVs/exotic_species.csv"
# Directory that the annotated per-network CSV files are written to.
processed_networks_dir = "/groups/itay_mayrose/halabikeren/tmp/plant_pollinator_inter/Stouffer2014/raw_CSVs/networks/"

In [3]:
# Load the exotic-species table from the CSV at `exotic_species_path`.
exotic_species = pd.read_csv(exotic_species_path)
def get_split(record: pd.Series) -> Tuple[str, str]:
    """Split the first field of a row into (first two words, remaining words).

    Only the first value of ``record`` is read. It is split on single spaces;
    the first two tokens are re-joined as the first element of the result and
    every token after them is re-joined as the second element.
    """
    tokens = record.values[0].split(" ")
    leading, trailing = tokens[:2], tokens[2:]
    return " ".join(leading), " ".join(trailing)
    
def _clean_location(raw_location: str):
    """Normalise the free-text location that follows the species name.

    The last space-separated token is dropped, the text is lower-cased, anything
    from the first "(" onwards is discarded and the word "islands" is removed.
    The result is stripped of surrounding whitespace, because leftover spaces
    (e.g. the one preceding "(") would prevent a later substring match against a
    network's location. An empty result is returned as NaN so that it is treated
    as "no location" rather than as a string contained in every location.
    NOTE(review): the meaning of the dropped last token is not visible here — confirm.
    """
    cleaned = " ".join(raw_location.split(" ")[:-1]).lower().split("(")[0].replace("islands", "").strip()
    return cleaned if cleaned else np.nan


# Split the raw text of each row into a two-word species name and the remaining location text.
exotic_species[["species", "location"]] = exotic_species.apply(get_split, axis=1, result_type="expand")
# The raw "Species" column is no longer needed once it has been split.
exotic_species = exotic_species.drop(columns="Species")
exotic_species["location"] = exotic_species["location"].apply(_clean_location)
# Lower-case species names so they compare equal to the lower-cased plant names of the networks.
exotic_species["species"] = exotic_species["species"].str.lower()

In [4]:
# Distinct species names, in order of first appearance, as a plain Python list.
exotic_species_names = exotic_species["species"].unique().tolist()

In [5]:
# Map each species name to its location.
# NOTE(review): a species listed in several rows keeps only the location of its last row.
exotic_species_to_location = dict(zip(exotic_species["species"], exotic_species["location"]))

In [6]:
def correction_location(location: str):
    """Return the canonical country name for a known alias, else ``location`` unchanged.

    The lookup is case-sensitive: only the exact spellings "UK", "USA" and
    "England" are rewritten.
    """
    aliases = {
        "UK": "United Kingdom",
        "USA": "United States",
        "England": "United Kingdom",
    }
    return aliases.get(location, location)

def _locality_to_country(locality):
    """Derive a lower-cased country name from a "Locality of Study" value.

    The text after the last comma is taken as the country, stripped, passed
    through ``correction_location`` and lower-cased. A missing locality yields
    NaN instead of raising, so it can be handled as "location unknown" downstream.
    """
    if pd.isna(locality):
        return np.nan
    return correction_location(locality.split(",")[-1].strip()).lower()


# Stack the reference tables of the binary and the weighted networks.
network_metadata = pd.concat([
    pd.read_csv(f"{networks_dir}/binary/references.csv"),
    pd.read_csv(f"{networks_dir}/weighted/references.csv"),
])
network_metadata["location"] = network_metadata["Locality of Study"].apply(_locality_to_country)

In [7]:
# Preview the first rows, including the derived `location` column.
network_metadata.head()

Unnamed: 0,ID,Species,Interactions,Connectance,Type of interactions,Type of data,Reference,Locality of Study,Latitude,Longitude,location
0,M_PL_004,114,167,,Pollination,1,"Barrett, S. C. H., and K. Helenurm. 1987. The ...","Central New Brunswick, Canada",46.553731,-66.071245,canada
1,M_PL_006,78,146,,Pollination,1,"Dicks, LV, Corbet, SA and Pywell, RF 2002. Com...","Hickling, Norfolk, UK",52.762395,1.575532,united kingdom
2,M_PL_007,52,85,,Pollination,1,"Dicks, LV, Corbet, SA and Pywell, RF 2002. Com...","Shelfanger, Norfolk, UK",52.413173,1.097873,united kingdom
3,M_PL_013,65,103,,Pollination,1,"Ollerton, J., S. D. Johnson, L. Cranmer, and S...","KwaZulu-Natal region, South Africa",-29.616667,30.133333,south africa
4,M_PL_017,104,299,,Pollination,1,Memmott J. 1999. The structure of a plant-poll...,"Bristol, England",51.574994,-2.589902,united kingdom


In [8]:
# Map each network ID to its location.
# NOTE(review): if an ID occurs in more than one metadata row, the last row wins.
network_id_to_location = dict(zip(network_metadata["ID"], network_metadata["location"]))

In [9]:
def is_exotic(name: str, network_location: Optional[str], species_location: Optional[str]) -> bool:
    """Tell whether a species' listed location occurs within a network's location.

    Args:
        name: Species name. Not used in the decision; kept for the callers' signature.
        network_location: Location of the network, or a missing value (None/NaN).
        species_location: Location recorded for the species, or a missing value.

    Returns:
        True when ``species_location`` (ignoring surrounding whitespace) is a
        non-empty substring of ``network_location``; False otherwise, including
        when either location is missing.
    """
    if pd.isna(network_location) or pd.isna(species_location):
        return False
    species_location = species_location.strip()
    # An empty string is a substring of every string, so without this guard a
    # species with a blank location would be flagged in every network.
    if not species_location:
        return False
    return species_location in network_location
    

# Only network files whose numeric index is at most this value are processed.
# NOTE(review): the reason for the cut-off at 59 is not visible here — confirm.
MAX_NETWORK_INDEX = 59

# Raw string so that "\d" is a regex digit class rather than an invalid string escape.
# "\d+" requires an index to be present and "\." matches a literal dot.
network_index_regex = re.compile(r"M_PL_(\d+)_*\d*\.csv")


def _network_index(filename: str) -> Optional[int]:
    """Return the numeric index embedded in a network file name, or None if there is none."""
    match = network_index_regex.search(filename)
    return int(match.group(1)) if match else None


def _list_network_paths(directory: str) -> list:
    """List the paths of the "M_PL*" files in ``directory`` whose index is within the cut-off."""
    selected = []
    for filename in os.listdir(directory):
        if not filename.startswith("M_PL"):
            continue
        index = _network_index(filename)
        # Files without a parsable index are skipped instead of raising.
        if index is not None and index <= MAX_NETWORK_INDEX:
            selected.append(os.path.join(directory, filename))
    return selected


networks_paths = _list_network_paths(os.path.join(networks_dir, "binary")) + \
                 _list_network_paths(os.path.join(networks_dir, "weighted"))

# Keys are file base names, so a file name that exists in both directories is kept
# once; the "weighted" table wins because its path comes later in the list.
networks = {
    os.path.basename(path): pd.read_csv(path).rename(columns={"Unnamed: 0": "plant_name"})
    for path in networks_paths
}

for filename, network in networks.items():
    # Lower-case plant names so they compare equal to the lower-cased exotic species names.
    network["plant_name"] = network["plant_name"].str.lower()
    if network["plant_name"].isin(exotic_species_names).any():
        print(f"exotic species are present in newtwork {filename}")
    # The location depends only on the network, so look it up once rather than once per plant.
    network_location = network_id_to_location.get(filename.replace(".csv", ""), np.nan)
    network["is_exotic"] = network["plant_name"].apply(
        lambda name: is_exotic(name, network_location, exotic_species_to_location.get(name, np.nan))
    )
    network.to_csv(os.path.join(processed_networks_dir, filename))

exotic species are present in newtwork M_PL_044.csv
exotic species are present in newtwork M_PL_055.csv
exotic species are present in newtwork M_PL_057.csv
exotic species are present in newtwork M_PL_054.csv
exotic species are present in newtwork M_PL_019.csv
exotic species are present in newtwork M_PL_006.csv
exotic species are present in newtwork M_PL_007.csv
exotic species are present in newtwork M_PL_017.csv
exotic species are present in newtwork M_PL_058.csv
exotic species are present in newtwork M_PL_047.csv
exotic species are present in newtwork M_PL_042.csv
exotic species are present in newtwork M_PL_046.csv
exotic species are present in newtwork M_PL_043.csv
exotic species are present in newtwork M_PL_026.csv
exotic species are present in newtwork M_PL_018.csv
exotic species are present in newtwork M_PL_021.csv
exotic species are present in newtwork M_PL_036.csv
exotic species are present in newtwork M_PL_037.csv
exotic species are present in newtwork M_PL_005.csv
exotic speci

In [10]:
# File names of the networks in which at least one plant was flagged as exotic.
networks_with_exotic_species = [
    name for name, frame in networks.items() if frame["is_exotic"].any()
]
len(networks_with_exotic_species)

17

In [31]:
# Flag every network that contains at least one plant marked as exotic.
network_to_class = {name: np.any(frame["is_exotic"]) for name, frame in networks.items()}

# One row per network: the dict keys become the "network" column and the flags "is_exotic".
network_classification = pd.DataFrame.from_dict(network_to_class, orient="index")
network_classification = network_classification.reset_index()
network_classification = network_classification.rename(columns={"index": "network", 0: "is_exotic"})
network_classification.to_csv("network_classification.csv", index=False)