In [101]:
import pandas as pd
import numpy as np
import os
import re

pd.set_option('display.max_colwidth', None)

from geopy.geocoders import Nominatim
from geopy.geocoders import GoogleV3

from time import sleep

In [102]:
networks_metadata_path = "../../data/networks/networks_metadata.csv"
parsed_geodata_path = "../../data/networks/all/networks_unparsed_geodata.csv"
bioclim_data_path = "../../data/networks/all/networks_bioclim_data.csv"

complementary_location_data_path = "../../data/networks/metadata/complementary_location_data.csv"
oik_db_path = "../../data/networks/metadata/OIK-07303_database.csv"
oik_loc_path = "../../data/networks/metadata/OIK-07303_original_studies.xlsx"
oik_indices_path = "../../data/networks/metadata/OIK_07303_network_indices.csv"
loc_to_ecoregion_for_mapping_path = "../../data/networks/metadata/networks_for_loc_to_ecoregion_mapping_pre_r.csv"
loc_to_ecoregion_for_complementing_path = "../../data/networks/metadata/networks_for_loc_to_ecoregion_mapping_post_r.csv"

ecoregion_ids_map_path = "../../data/metadata/network_level/ecoregions.csv"
ecoregion_metadata_path_1 = "../../data/metadata/network_level/ecoregions_metadata_rice_2019.xlsx"
ecoregion_metadata_path_2 = "../../data/metadata/network_level/ecoregions_polyfreq_rice_2019.xlsx"
ecoregion_metadata_path_3 = "../../data/metadata/network_level/ecoregions_data.csv"

varnames = {"BioClim_1": "AnnualMeanTemperature",
"BioClim_2": "MeanDiurnalTempRange",
"BioClim_3": "Isothermality",
"BioClim_4": "TemperatureSeasonality",
"BioClim_5": "MaxTemperatureofWarmestMonth",
"BioClim_6": "MinTemperatureofColdestMonth",
"BioClim_7": "TemperatureAnnualRange",
"BioClim_8": "MeanTemperatureofWettestQuarter",
"BioClim_9": "MeanTemperatureofDriestQuarter",
"BioClim_10": "MeanTemperatureofWarmestQuarter",
"BioClim_11": "MeanTemperatureofColdestQuarter",
"BioClim_12": "AnnualPrecipitation",
"BioClim_13": "PrecipitationofWettestMonth",
"BioClim_14": "PrecipitationofDriestMonth",
"BioClim_15": "PrecipitationSeasonality",
"BioClim_16": "PrecipitationofWettestQuarter",
"BioClim_17": "PrecipitationofDriestQuarter",
"BioClim_18": "PrecipitationofWarmestQuarter",
"BioClim_19": "PrecipitationofColdestQuarter",
"species_richness": "species_richness",
"EcoRegion_name": "EcoRegion_name",
"biome": "biome",
"area": "area"}

ecoregion_metadata_colnames_map = {"BIO1": "AnnualMeanTemperature",
"BIO2": "MeanDiurnalTempRange",
"BIO3": "Isothermality",
"BIO4": "TemperatureSeasonality",
"BIO5": "MaxTemperatureofWarmestMonth",
"BIO6": "MinTemperatureofColdestMonth",
"BIO7": "TemperatureAnnualRange",
"BIO8": "MeanTemperatureofWettestQuarter",
"BIO9": "MeanTemperatureofDriestQuarter",
"BIO10": "MeanTemperatureofWarmestQuarter",
"BIO11": "MeanTemperatureofColdestQuarter",
"BIO12": "AnnualPrecipitation",
"BIO13": "PrecipitationofWettestMonth",
"BIO14": "PrecipitationofDriestMonth",
"BIO15": "PrecipitationSeasonality",
"BIO16": "PrecipitationofWettestQuarter",
"BIO17": "PrecipitationofDriestQuarter",
"BIO18": "PrecipitationofWarmestQuarter",
"BIO19": "PrecipitationofColdestQuarter",
"Paleo1": "AnnualMeanTemperatureatLGM",
"Paleo2": "MeanDiurnalTempRangeatLGM",
"Paleo3": "IsothermalityatLGM",
"Paleo4": "TemperatureSeasonalityatLGM",
"Paleo5": "MaxTemperatureofWarmestMonthatLGM",
"Paleo6": "MinTemperatureofColdestMonthatLGM",
"Paleo7": "TemperatureAnnualRangeatLGM",
"Paleo8": "MeanTemperatureofWettestQuarteratLGM",
"Paleo9": "MeanTemperatureofDriestQuarteratLGM",
"Paleo10": "MeanTemperatureofWarmestQuarteratLGM",
"Paleo11": "MeanTemperatureofColdestQuarteratLGM",
"Paleo12": "AnnualPrecipitationatLGM",
"Paleo13": "PrecipitationofWettestMonthatLGM",
"Paleo14": "PrecipitationofDriestMonthatLGM",
"Paleo15": "PrecipitationSeasonalityatLGM",
"Paleo16": "PrecipitationofWettestQuarteratLGM",
"Paleo17": "PrecipitationofDriestQuarteratLGM",
"Paleo18": "PrecipitationofWarmestQuarteratLGM",
"Paleo19": "PrecipitationofColdestQuarter",
"Polyploid percent": "polyploid_percent"}

In [112]:
networks_metadata = pd.read_csv(networks_metadata_path)

# parse time data

In [429]:
# compte year of collection based on the reference - NOT NECESSALRITY ACCURATE
def get_ref_year(ref: str) -> int:
    if pd.isna(ref) or "10.1111/oik.07303" in ref:
        return np.nan
    year_matches = list(set([int(m) for m in re.findall("\d{4}", ref) if int(m) > 1900 and int(m) < 2023])) # assumption: there are no studies before 1900
    if len(year_matches) == 0:
        return np.nan
    elif len(year_matches) == 1:
        return year_matches[0]
    # print(f"multiple matches for {ref} = {year_matches}")
    return min(year_matches)
        
networks_metadata.loc[networks_metadata.reference_year.isna(), "reference_year"] = networks_metadata.reference.apply(get_ref_year)

# check for duplicated networks

In [467]:
def get_duplicated_networks(network_type: str, network_ids: list):
    duplicated_network_ids = []
    if len(network_ids) <= 1 or type(network_ids) != list:
        return duplicated_network_ids
    networks_id_to_path = networks_metadata.loc[(networks_metadata.network_type == network_type) & (networks_metadata.network_index.isin(network_ids))].set_index("network_index").processed_path.to_dict()
    network_id_to_content = {}
    for network_id in networks_id_to_path:
        net = pd.read_csv(networks_id_to_path[network_id])
        plants = set(net.Plant)
        pollinators = set(net.columns)-{"Plant"}
        network_id_to_content[network_id] = {"plants":plants, "pollinators":pollinators}
    for i in range(len(network_ids)):
        for j in range(i, len(network_ids)):
            n1, n2 = network_ids[i], network_ids[j] 
            if network_id_to_content[n1]["plants"] == network_id_to_content[n2]["plants"] and network_id_to_content[n1]["pollinators"] == network_id_to_content[n2]["pollinators"]:
                duplicated_network_ids.append((n1, n2))
    return duplicated_network_ids

# detect potentially duplicated networks by size
w_dup_candidates = networks_metadata.loc[(~networks_metadata.mangal_duplicate_from_another_db) & (networks_metadata.network_type == "weighted")].groupby(["num_plants", "num_pollinators"]).network_index.apply(lambda network_ids: get_duplicated_networks(network_type="weighted", network_ids=network_ids)).reset_index()
w_dup_candidates["has_dups"] = w_dup_candidates.network_index.apply(lambda n: len(n) > 0)
print(f"are there any weighted duplicated networks = {np.any(w_dup_candidates.has_dups)}")

b_dup_candidates = networks_metadata.loc[(~networks_metadata.mangal_duplicate_from_another_db) & (networks_metadata.network_type == "binary")].groupby(["num_plants", "num_pollinators"]).network_index.apply(lambda network_ids: get_duplicated_networks(network_type="binary", network_ids=network_ids)).reset_index()
b_dup_candidates["has_dups"] = b_dup_candidates.network_index.apply(lambda n: len(n) > 0)
print(f"are there any binary duplicated networks = {np.any(b_dup_candidates.has_dups)}")

are there any weighted duplicated networks = False
are there any binary duplicated networks = False


# parse existing data

In [63]:
drop = [c for c in networks_metadata.columns if "Unnamed" in c or c.startswith("geom") or c.startswith("X")]
networks_metadata = networks_metadata[[c for c in networks_metadata.columns if c not in drop and not "Unnamed" in c]]

raw_geodata = pd.read_csv(parsed_geodata_path).drop(["EcoRegion_name.y"], axis=1).rename(columns={"EcoRegion_name.x": "EcoRegion_name"}).rename(columns=varnames)
area_biome_score = (raw_geodata.groupby(["Obs_ID", "area", "biome"])[["area", "biome"]].size() / raw_geodata.groupby(["Obs_ID"])["area"].count())
raw_geodata["area_biome_score"] = raw_geodata.apply(lambda rec: area_biome_score.loc[(rec.Obs_ID, rec.area, rec.biome)], axis=1)
raw_geodata.sort_values(by=["area_biome_score"], ascending=[False], inplace=True)
raw_geodata.drop_duplicates(subset=list(set(raw_geodata.columns)-{"area", "area_biome_score", "biome", "bio_source", "EcoRegion_name"}), keep="first", inplace=True)

ecoregion_to_area = raw_geodata.set_index("EcoRegion")["area"].to_dict()
ecoregion_to_biome = raw_geodata.set_index("EcoRegion")["biome"].to_dict()

bioclim_data = pd.read_csv(bioclim_data_path).drop(['Unnamed: 0'], axis=1).rename(columns=varnames)
raw_geodata = pd.concat([raw_geodata, bioclim_data.loc[~bioclim_data.Obs_ID.isin(raw_geodata.Obs_ID)]])
all_rel_cols = set(raw_geodata.columns)-{"Obs_ID"}
raw_geodata = raw_geodata.dropna(subset=list(all_rel_cols), how="all")

In [64]:
networks_metadata.set_index("ID", inplace=True)
raw_geodata.set_index("Obs_ID", inplace=True)
networks_metadata.update(raw_geodata)
networks_metadata.reset_index(inplace=True)
raw_geodata.reset_index(inplace=True)

In [72]:
raw_geodata.loc[raw_geodata.EcoRegion_name.str.lower().str.contains("valdivian", na=False)].mean()

  raw_geodata.loc[raw_geodata.EcoRegion_name.str.lower().str.contains("valdivian", na=False)].mean()


Obs_ID                               1266.473684
AnnualMeanTemperature                   7.098685
MeanDiurnalTempRange                   12.172551
Isothermality                           2.209314
TemperatureSeasonality               1247.789474
MaxTemperatureofWarmestMonth          232.578947
MinTemperatureofColdestMonth           36.578947
TemperatureAnnualRange                 57.990111
MeanTemperatureofWettestQuarter       570.736842
MeanTemperatureofDriestQuarter        146.473684
MeanTemperatureofWarmestQuarter       146.473684
MeanTemperatureofColdestQuarter       536.789474
AnnualPrecipitation                    10.507885
PrecipitationofWettestMonth            49.663098
PrecipitationofDriestMonth            404.907848
PrecipitationSeasonality               19.227698
PrecipitationofWettestQuarter          -1.951184
PrecipitationofDriestQuarter           21.178882
PrecipitationofWarmestQuarter           2.772807
PrecipitationofColdestQuarter          12.172551
biome               

In [65]:
networks_metadata = networks_metadata.merge(raw_geodata[["Obs_ID"] + list(all_rel_cols-set(networks_metadata.columns))], left_on="ID", right_on="Obs_ID", how="left")
networks_metadata = networks_metadata.drop(["Obs_ID"], axis=1)

networks_metadata["area"] = networks_metadata.apply(lambda rec: rec.area if pd.notna(rec.area) else ecoregion_to_area.get(rec.EcoRegion, np.nan), axis=1)
networks_metadata["biome"] = networks_metadata.apply(lambda rec: rec.biome if pd.notna(rec.biome) else ecoregion_to_biome.get(rec.EcoRegion, np.nan), axis=1)

### manually complement missing relevant geodata

In [84]:
raw_geodata_by_ecoregion = raw_geodata.drop(["Obs_ID"], axis=1).groupby("EcoRegion_name").mean().reset_index()

In [98]:
networks_metadata.loc[networks_metadata.path.str.contains("vazquez_and_simberloff_2002_Llao_Llao"), "EcoRegion_name"] = "Valdivian temperate forests"
networks_metadata.loc[networks_metadata.path.str.contains("vazquez_and_simberloff_2002_Cerro_Lopez"), "EcoRegion_name"] = "Valdivian temperate forests"
networks_metadata.loc[networks_metadata.path.str.contains("vazquez_and_simberloff_2002_Mascardi"), "EcoRegion_name"] = "Valdivian temperate forests"
networks_metadata.loc[networks_metadata.path.str.contains("1_Llao-Llao"), "EcoRegion_name"] = "Valdivian temperate forests"
networks_metadata.loc[networks_metadata.path.str.contains("vazquez_and_simberloff_2002_Quetrihue"), "EcoRegion_name"] = "Valdivian temperate forests"
networks_metadata.loc[networks_metadata.path.str.contains("mangal/weighted/952.csv"), "EcoRegion_name"] = "Alto Paraná Atlantic forests"
networks_metadata.loc[networks_metadata.path.str.contains("1_Challhuaco"), "EcoRegion_name"] = "Valdivian temperate forests"
networks_metadata.loc[networks_metadata.path.str.contains("1_CerroOtto"), "EcoRegion_name"] = "Valdivian temperate forests"
#"vazquez_and_simberloff_2002_Safariland" - couldn't find the ecoregion
#"vazquez_and_simberloff_2002_Arroyo_Goye" - couldn't find the ecoregion
for c in set(raw_geodata_by_ecoregion.columns)-{"EcoRegion_name"}:
    d = raw_geodata_by_ecoregion.set_index("EcoRegion_name")[c].to_dict()
    networks_metadata.loc[(networks_metadata.EcoRegion_name.notna()) & (networks_metadata[c].isna()), c] = networks_metadata.EcoRegion_name.apply(lambda n: d.get(n, np.nan))

In [99]:
networks_metadata[["Longitude", "Latitude"]+list(set(varnames.values())-{"area", "biome"})].notna().sum() / networks_metadata.shape[0]

Longitude                          0.857232
Latitude                           0.945376
PrecipitationofWarmestQuarter      0.959032
MaxTemperatureofWarmestMonth       0.959032
AnnualPrecipitation                0.959032
Isothermality                      0.959032
MeanDiurnalTempRange               0.959032
EcoRegion_name                     0.949100
PrecipitationofDriestMonth         0.959032
MeanTemperatureofColdestQuarter    0.959032
PrecipitationofWettestQuarter      0.959032
MeanTemperatureofWarmestQuarter    0.959032
AnnualMeanTemperature              0.959032
PrecipitationofWettestMonth        0.959032
PrecipitationofDriestQuarter       0.959032
MeanTemperatureofDriestQuarter     0.959032
MinTemperatureofColdestMonth       0.959032
PrecipitationofColdestQuarter      0.959032
PrecipitationSeasonality           0.959032
MeanTemperatureofWettestQuarter    0.959032
TemperatureSeasonality             0.959032
species_richness                   0.947238
TemperatureAnnualRange          

In [64]:
networks_metadata["coord"] = networks_metadata.apply(lambda rec: f"({np.round(rec.Longitude,3)},{np.round(rec.Latitude,3)})", axis=1)
coord_to_loc = networks_metadata.dropna(subset=["coord", "location"]).set_index("coord")["location"].to_dict()
networks_metadata.location = networks_metadata.apply(lambda rec: rec.location if pd.notna(rec.location) else coord_to_loc.get(rec.coord, np.nan), axis=1)
networks_metadata = networks_metadata.drop(["coord"], axis=1)

In [70]:
print(f"# netwokrs with no location = {networks_metadata.query('Longitude.isna() and Latitude.isna()').shape[0]:,}")
missing_ecoregions_data = networks_metadata.query("EcoRegion_name.isna() and Longitude.notna()")[["Longitude", "Latitude", "location", "EcoRegion_name"]]
print(f"# coordinates with no ecoregions but available location = {missing_ecoregions_data.shape[0]:,}")
missing_ecoregions_data = missing_ecoregions_data.drop_duplicates()
print(f"# coordinates with no ecoregions = {missing_ecoregions_data.shape[0]:,}")

# netwokrs with no location = 942
# coordinates with no ecoregions but available location = 182
# coordinates with no ecoregions = 17


In [100]:
networks_metadata.to_csv(networks_metadata_path, index=False)

## manually completement the ecoregions of samples with missing ecoregion and available coordinates

In [190]:
# manual coplementation
networks_metadata = pd.read_csv(networks_metadata_path)
networks_metadata.loc[(networks_metadata.Longitude >= 124) & (networks_metadata.Longitude <= 125) & (networks_metadata.Latitude >= 1) & (networks_metadata.Latitude <= 2), "location"] = "Desa/Dusun/Kelurahan Bunaken Kepulauan, Manado, Sulawesi Utara 95231, Bunaken Kepulauan Manado Indonesia"

In [192]:
# manual cxomplementation
location_to_ecoregion = {"Desa/Dusun/Kelurahan Bunaken Kepulauan, Manado, Sulawesi Utara 95231, Bunaken Kepulauan Manado Indonesia": "Wallacea",
                        "Morne Seychellois National Park, Mahé": "Seychelles moist forests",
                        "Black River Gorges National Park, Mauritius": "Madagascar and Indian Ocean Islands forests",
                        "Denmark": "North Atlantic moist mixed forests",
                        "Galapagos": "Galapagos Islands xeric scrub"}
networks_metadata.EcoRegion_name = networks_metadata.apply(lambda rec: rec.EcoRegion_name if pd.notna(rec.EcoRegion_name) else location_to_ecoregion.get(rec.location, np.nan), axis=1)

In [7]:
networks_metadata.loc[436, "location"] = "The western-most island of the Azores"
networks_metadata.loc[436, "EcoRegion_name"] = "Macaronesia forests"
networks_metadata.loc[1485, "location"] = "The western-most island of the Azores"
networks_metadata.loc[1485, "EcoRegion_name"] = "Macaronesia forests"

In [45]:
networks_metadata.loc[networks_metadata.location == 'Black River Gorges National Park, Mauritius', 'EcoRegion'] = 30120
networks_metadata.loc[networks_metadata.location == 'Black River Gorges National Park, Mauritius', 'EcoRegion_name'] = 'Mascarene forests'
networks_metadata.loc[networks_metadata.EcoRegion_name == "North Atlantic moist mixed forests", 'EcoRegion'] = 80429
networks_metadata.loc[networks_metadata.EcoRegion_name == 'Arctic tundra', 'EcoRegion'] = 51114

In [75]:
networks_metadata.set_index(["Longitude","Latitude"], inplace=True)
missing_ecoregions_data.set_index(["Longitude","Latitude"], inplace=True)
networks_metadata.fillna(missing_ecoregions_data.reindex(networks_metadata.index), inplace=True)
networks_metadata.reset_index(inplace=True)

In [79]:
print(f"# networks with no location = {networks_metadata.query('Longitude.isna() and Latitude.isna()').shape[0]:,}")
print(f"# coordinates with no ecoregions = {networks_metadata.query('EcoRegion_name.isna()').shape[0]:,}")

# netwokrs with no location = 942
# coordinates with no ecoregions = 959


In [56]:
networks_metadata[["Longitude", "Latitude", "EcoRegion"]+list(varnames.values())].notna().sum() / networks_metadata.shape[0]

Longitude                                                     0.506289
Latitude                                                      0.506289
EcoRegion                                                     0.500000
Annual Mean Temperature                                       0.493711
Mean Diurnal Range (Mean of monthly (max temp - min temp))    0.493711
Isothermality (BioClim_2/BioClim_7) (×100)                    0.493711
Temperature Seasonality (standard deviation ×100)             0.493711
Max Temperature of Warmest Month                              0.493711
Min Temperature of Coldest Month                              0.493711
Temperature Annual Range (BioClim_5-BioClim_6)                0.493711
Mean Temperature of Wettest Quarter                           0.493711
Mean Temperature of Driest Quarter                            0.493711
Mean Temperature of Warmest Quarter                           0.493711
Mean Temperature of Coldest Quarter                           0.493711
Annual

## unite parsed data to all networks metadata

In [None]:
networks_metadata = pd.read_csv(networks_metadata_path)
mangal_metadata = pd.read_csv(parsed_rmangal_metadata_path)

In [None]:
mangal_networks_metadata = networks_metadata.query("source == 'mangal'").merge(mangal_metadata, on="network_index", how="left")

In [None]:
mangal_networks_metadata = mangal_networks_metadata[[c for c in mangal_networks_metadata.columns if "Unnamed" not in c]]

In [None]:
for c in set(mangal_networks_metadata.columns)-set(networks_metadata.columns):
    networks_metadata[c] = np.nan

networks_metadata.set_index(["network_type", "network_index"], inplace=True)
mangal_networks_metadata.set_index(["network_type", "network_index"], inplace=True)
networks_metadata.update(mangal_networks_metadata)
networks_metadata = networks_metadata[[c for c in networks_metadata.columns if "Unnamed" not in c]]
networks_metadata.reset_index(inplace=True)

In [None]:
networks_metadata.to_csv(networks_metadata_path)

In [4]:
networks_metadata_to_complement = pd.read_csv("/groups/itay_mayrose/halabikeren/continue_tomorrow.csv")

In [5]:
networks_metadata.set_index(["network_type", "network_index"], inplace=True)
networks_metadata_to_complement.set_index(["network_type", "network_index"], inplace=True)
networks_metadata.update(networks_metadata_to_complement)
networks_metadata.reset_index(inplace=True)

In [10]:
missing_locations = networks_metadata.query("location.notna() and Latitude.isna()").location.dropna().unique().tolist()
print(f"locations with no mapping to ecoregion or longitude / latitude = {len(missing_locations):,}")

locations with no mapping to ecoregion or longitude / latitude = 3


In [34]:
location_to_ecoregion = {'dolomite Ozark glades in Missouri, USA': "Central U.S. Interior Highlands",
                         'southern Germany within a radius of 50 km around the city of WÃ¼rzburg': "Central European Mixed Forests",
                         'Germany located in the Schorfheide-Chorin (Sch), Hainich-DÃ¼n (Hai), and SchwÃ¤bische Alb (Alb)': "Central European Mixed Forests",
                         'woodlands in Carlinville, Illinois': "Central USA Plains and Prairies",
                         'Gallatin National Forest, Montana, USA (45Â°142N, 110Â°332W) in the Thompson Creek (6979 acres burned in 1991) and Wicked Creek (22,195 acres burned in 2007) wildfires': "Northern Rockies",
                         'alpine tundra of the Beartooth Plateau located in the Custer and Shoshone National Forests of Carbon County, Montana and Park County, Wyoming, centered approximately at 45Â°002 N 109Â°252 W': "Rocky Mountain Subalpine Zone",
                         'Tyson Research Center (38.524718N, \x12 90.562494W), an 800-ha field station of Washington University of St. Louis that is located 40 km southwest from St. Louis, Missouri, USA': "Eastern Temperate Forests",
                         'Nothofagus dombeyi forest in and around Nahuel Huapi National Park, Argentina.': "Valdivian Rainforest"}
networks_metadata.loc[networks_metadata.EcoRegion_name.isna(), "EcoRegion_name"] = networks_metadata.location.apply(lambda loc: location_to_ecoregion.get(loc, np.nan))

doi_to_ecoregion = {"10.1111/ele.12821": "Central Valley", 
                    "10.1371/journal.pbio.0060031": "Valdivian Rainforest"}

networks_metadata.loc[networks_metadata.EcoRegion_name.isna(), "EcoRegion_name"] = networks_metadata.doi.apply(lambda doi: doi_to_ecoregion.get(doi, np.nan))

In [71]:
networks_metadata.to_csv(networks_metadata_path, index=False)

In [71]:
geolocator = Nominatim(user_agent="keren", timeout=5)

def get_coordinates(location: str)->list:
    location_coord = geolocator.geocode(location)
    sleep(2)
    if location_coord:
        return (location_coord.longitude, location_coord.latitude)   
    return (np.nan, np.nan)


sites = networks_metadata_to_complement.query("Latitude.isna()").site.dropna().unique().tolist()
print(f"# sites = {len(sites):,}")
site_to_coord = {site: get_coordinates(site) for site in sites}
networks_metadata_to_complement[["Longitude", "Latitude"]] = networks_metadata_to_complement.apply(lambda rec: site_to_coord.get(rec.site, (np.nan, np.nan)) if pd.notna(rec.site) else (np.nan, np.nan), axis=1, result_type="expand")

In [140]:
network_indices_map_path = "../../data/networks/metadata/OIK_07303_network_indices.csv"

def get_network(df: pd.DataFrame) -> pd.DataFrame:
    network = df.pivot_table(index='lower', columns='higher', values='freq', aggfunc='sum').fillna(0).reset_index().rename(columns={"lower": "Plant"}).set_index("Plant")
    return network
   

def get_net_index(rec: pd.Series) -> int:
    match = networks_metadata_to_complement.loc[networks_metadata_to_complement.network_name.str.startswith(f"{rec.study}_{rec.allSites}_{rec.sSite}"), "i"]
    if len(match) == 0:
        return np.nan
    return match.values[0]
        
    
if "net_oik_index" not in set(complementary_networks_geodata.columns) and not os.path.exists(network_indices_map_path):

    if not os.path.exists(network_indices_map_path):
        my_networks = {row.i: pd.read_csv(row.processed_path) for _, row in networks_metadata_to_complement.iterrows()}
        oik_networks_data = pd.read_csv(oik_db_path, encoding='latin-1').groupby(["study", "allSites", "sSite"])
        oik_networks = {n: get_network(oik_networks_data.get_group(n)) for n in oik_networks_data.groups.keys()}

        oik_net_name_to_i = {}
        for oik_net_name in oik_networks:
            oik_net = oik_networks[oik_net_name]
            for i in my_networks:
                net = my_networks[i].set_index("Plant")
                if set(oik_net.columns) == set(net.columns) and set(oik_net.index) == set(net.index):
                    oik_net_name_to_i[oik_net_name] = i

        oik_db = pd.read_csv(oik_db_path, encoding='latin-1')[["study", "allSites", "sSite"]].drop_duplicates()
        oik_db["net_oik_index"] = oik_db.apply(get_net_index, axis=1)
        oik_db.loc[oik_db.net_oik_index.isna(), "net_oik_index"] = oik_db.apply(lambda rec: oik_net_name_to_i.get((rec.study, rec.allSites, rec.sSite), np.nan), axis=1)
        oik_db = oik_db.dropna(subset=["net_oik_index"])
        oik_db.net_oik_index = oik_db.net_oik_index.astype(int)
        oik_db.to_csv("../../data/networks/metadata/OIK_07303_network_indices.csv")
    else:
        oik_db = pd.read_csv("../../data/networks/metadata/OIK_07303_network_indices.csv")
    
    if "net_oik_index" not in set(complementary_networks_geodata.columns):
        complementary_networks_geodata = pd.read_csv(complementary_location_data_path, encoding='latin-1')
        complementary_networks_geodata = complementary_networks_geodata.merge(oik_db, on=["study", "sSite"], how="outer")
        complementary_networks_geodata.to_csv(complementary_location_data_path, index=False)
    

In [229]:
complementary_networks_geodata = complementary_networks_geodata.dropna(subset=["net_oik_index"])
complementary_networks_geodata["network_name"] = complementary_networks_geodata.study + "_" + complementary_networks_geodata.allSites + "_" + complementary_networks_geodata.sSite + "_" + complementary_networks_geodata.net_oik_index.astype(int).astype(str)
complementary_networks_geodata["in_db"] = complementary_networks_geodata.network_name.apply(lambda x: x in set(networks_metadata_to_complement.network_name))
complementary_networks_geodata.loc[complementary_networks_geodata.in_db == False, "network_name"] = complementary_networks_geodata.net_oik_index.astype(int)
complementary_networks_geodata.to_csv(complementary_location_data_path, index=False)

In [238]:
net_name_to_ssite = complementary_networks_geodata.set_index("network_name")["sSite"].to_dict()
net_name_to_loc = complementary_networks_geodata.set_index("network_name")["location"].to_dict()
networks_metadata_to_complement.location = networks_metadata_to_complement.network_name.apply(lambda x: net_name_to_loc.get(x, np.nan))
networks_metadata_to_complement["site"] = networks_metadata_to_complement.network_name.apply(lambda x: net_name_to_ssite.get(x, np.nan))

# parse complementary data from Doi 10.1111/oik.07303

In [79]:
if not os.path.exists(loc_to_ecoregion_for_complementing_path):
    cols_translator = {"Data reference (DOI or URL)": "reference", "Country": "location", "approx.Latitude.of.study": "Latitude", "approx.Longitude.of.study": "Longitude"}
    oik_db = pd.read_csv(oik_db_path, encoding="latin1")[["study","cdate","sSite","allSites"]].drop_duplicates()
    oik_loc = pd.read_excel(oik_loc_path)
    oik_db = oik_db.merge(oik_loc, left_on="study", right_on="Study", how="left")
    oik_indices = pd.read_csv(oik_indices_path)
    oik_db = oik_db.merge(oik_indices, on=["study","allSites","sSite"], how="left").drop(["cdate"], axis=1).drop_duplicates()
    oik_db = oik_db.rename(columns=cols_translator)

    networks_metadata["net_oik_index"] =  networks_metadata.apply(lambda rec: np.nan if rec.doi != "10.1111/oik.07303" else int(rec.path.split("_")[-1].replace(".csv","")), axis=1)
    for c in cols_translator:
        c = cols_translator[c]
        d = oik_db.set_index("net_oik_index")[c].to_dict()
        networks_metadata[c] = networks_metadata.apply(lambda rec: rec[c] if pd.notna(rec[c]) else d.get(rec.net_oik_index, np.nan), axis=1)
    networks_metadata = networks_metadata.drop_duplicates(["network_type", "network_index"])
    
    df_for_mapping_loc_to_ecoregion = networks_metadata.loc[networks_metadata.Longitude.notna() & networks_metadata.Latitude.notna() & networks_metadata.EcoRegion_name.isna()]
    df_for_mapping_loc_to_ecoregion.to_csv(loc_to_ecoregion_for_mapping_path, index=False)

In [131]:
df_for_complementing_loc_to_ecoregion = pd.read_csv(loc_to_ecoregion_for_complementing_path)
comp = df_for_mapping_loc_to_ecoregion.drop(["EcoRegion", "EcoRegion_name"], axis=1).merge(df_for_complementing_loc_to_ecoregion, left_on="ID", right_on="Obs_ID", how="left")
comp.set_index(["network_type", "network_index"], inplace=True)
networks_metadata.set_index(["network_type", "network_index"], inplace=True)
joint_cols =  set(networks_metadata.columns)&set(comp.columns)
for c in joint_cols:
    networks_metadata[c].fillna(value=comp.dropna(subset=[c])[c].to_dict(), inplace=True)
networks_metadata.reset_index(inplace=True)

In [133]:
networks_metadata[["EcoRegion", "EcoRegion_name", "Longitude", "Latitude"]].notna().sum()

EcoRegion         1133
EcoRegion_name    1465
Longitude         1321
Latitude          1465
dtype: int64

In [134]:
networks_metadata.to_csv(networks_metadata_path, index=False)

# report geo stats on networks included in the analysis only

In [135]:
networks_metadata = pd.read_csv(networks_metadata_path)
networks_metadata = networks_metadata[[c for c in networks_metadata.columns if "Unnmamed" not in c]]

max_missing_threshold = 0.5
min_classified_threshold = 5
min_poly_threhold = 2
min_di_threshold = 2

networks_metadata["included_in_analysis"] = networks_metadata.apply(lambda rec: rec.is_polyploid_missing_frac <= max_missing_threshold and 
                                                                    rec.num_classified >= min_classified_threshold and 
                                                                    rec.num_polyploids >= min_poly_threhold and 
                                                                    rec.num_diploids >= min_di_threshold, axis=1)

In [136]:
analyzed_networks_metadata = networks_metadata.query("included_in_analysis")
weighted_analyzed_networks_metadata = analyzed_networks_metadata.loc[analyzed_networks_metadata.network_type == 'weighted'] 
binary_analyzed_networks_metadata = analyzed_networks_metadata.loc[analyzed_networks_metadata.network_type != 'weighted']
print(f"# networks included in the binary analysis = {binary_analyzed_networks_metadata.shape[0]:,}")
print(f"# networks included in the weighted analysis = {weighted_analyzed_networks_metadata.shape[0]:,}")

# networks included in the binary analysis = 305
# networks included in the weighted analysis = 291


In [137]:
print(f"# weighted networks with no location = {weighted_analyzed_networks_metadata.query('Longitude.isna() and Latitude.isna()').shape[0]:,}")
print(f"# binary networks with no location = {binary_analyzed_networks_metadata.query('Longitude.isna() and Latitude.isna()').shape[0]:,}")
print(f"# coordinates with no ecoregions = {binary_analyzed_networks_metadata.query('Longitude.notna() and EcoRegion_name.isna()').shape[0]:,}")

# weighted networks with no location = 2
# binary networks with no location = 2
# coordinates with no ecoregions = 0


In [138]:
networks_metadata.EcoRegion_name = networks_metadata.EcoRegion_name.str.lower()
eco_name_to_id = networks_metadata.set_index("EcoRegion_name")["EcoRegion"].to_dict()
networks_metadata.loc[networks_metadata.EcoRegion.isna(), "EcoRegion"] = networks_metadata.EcoRegion_name.apply(lambda x: eco_name_to_id.get(x, np.nan))
networks_metadata[["EcoRegion_name", "EcoRegion"]].notna().sum() /  networks_metadata.shape[0]  

EcoRegion_name    0.903763
EcoRegion         0.701419
dtype: float64

In [139]:
networks_metadata.to_csv(networks_metadata_path, index=False)

In [140]:
networks_metadata.loc[(networks_metadata.EcoRegion.isna()) & (networks_metadata.Longitude.notna())][["location", "EcoRegion_name", "EcoRegion",  "Longitude", "Latitude"]].EcoRegion_name.dropna().unique()

array(['peninsular malaysian rain forests', 'macaronesia forests',
       'north atlantic moist mixed forests',
       'galapagos islands xeric scrub',
       'central range montane rain forests', 'middle arctic tundra',
       'central usa plains and prairies',
       'central european mixed forests',
       'northern rockies conifer forests',
       'rocky mountain subalpine zone',
       'eastern australian temperate forests',
       'valdivian temperate forests'], dtype=object)

# merge by ecoregion with networks metadata

In [10]:
networks_metadata = pd.read_csv(networks_metadata_path)

In [16]:
ecoregion_df = pd.read_csv(ecoregion_ids_map_path).drop_duplicates("eco_names", keep="first")
ecoregion_df.eco_names = ecoregion_df.eco_names.str.lower() 
eco_id_to_name = ecoregion_df.set_index("eco_ids")["eco_names"].to_dict()
eco_name_to_id = ecoregion_df.set_index("eco_names")["eco_ids"].to_dict()

ecoregion_metadata_1 = pd.read_excel(ecoregion_metadata_path_1, header=6).rename(columns=ecoregion_metadata_colnames_map)
ecoregion_metadata_1["ECO_NAME"] = ecoregion_metadata_1['ECO ID'].apply(lambda n: eco_id_to_name.get(n, np.nan))
ecoregion_metadata_1.set_index("ECO NAME", inplace=True)

ecoregion_metadata_2 = pd.read_excel(ecoregion_metadata_path_2, header=1)
ecoregion_metadata_2["Ecoregion name"] = ecoregion_metadata_2["Ecoregion name"].str.lower() 

ecoregion_metadata_2.set_index("Ecoregion name", inplace=True)

In [17]:
ecoregionsd_net_name_to_format_name = {"valdivian rainforest": "valdivian temperate forests",
                                       "central valley": "california central valley grasslands",
                                       "peninsular malaysian rain forests": "peninsular malaysian rain forests",
                                       "north atlantic moist mixed forests": "north atlantic moist mixed forests",
                                         'central range montane rain forests': "central range montane rain forests",
                                         'central range papuan montane rain forests': "central range papuan montane rain forests",
                                         'arctic tundra': "middle arctic tundra",
                                         'central u.s. interior highlands': "central us forest-grasslands transition",
                                         'central european mixed forests': "central european mixed forests",
                                         'northern rockies': "northern rockies conifer forests",
                                         'eastern temperate forests': "eastern australian temperate forests"}
networks_metadata.EcoRegion_name = networks_metadata.EcoRegion_name.replace(ecoregionsd_net_name_to_format_name)
networks_metadata.EcoRegion = networks_metadata.EcoRegion.replace({'nan': np.nan}).apply(lambda n: str(int(n)) if pd.notna(n) else n) 
networks_metadata.loc[networks_metadata.EcoRegion.isna(), "EcoRegion"] = networks_metadata.EcoRegion_name.apply(lambda n: eco_name_to_id.get(n,np.nan))
networks_metadata.EcoRegion_name = networks_metadata.EcoRegion_name.str.lower()

In [18]:
networks_metadata.set_index("EcoRegion_name", inplace=True)
networks_metadata.update(ecoregion_metadata_1)
networks_metadata.update(ecoregion_metadata_2)
networks_metadata.reset_index(inplace=True)

In [19]:
networks_metadata.to_csv(networks_metadata_path, index=False)

In [20]:
max_missing_threshold = 0.5
min_classified_threshold = 5
min_poly_threhold = 0
min_di_threshold = 2

networks_metadata = pd.read_csv(networks_metadata_path)
networks_metadata["inlcuded_in_analysis"] = networks_metadata.apply(lambda rec: rec.is_polyploid_missing_frac <= max_missing_threshold and rec.num_classified > min_classified_threshold and rec.num_polyploids >= min_poly_threhold and rec.num_diploids >= min_di_threshold, axis=1)

In [21]:
networks_metadata[["EcoRegion_name", "EcoRegion", "Longitude", "Latitude","Annual Mean Temperature", "Climate_PC1", "Climate_PC2", "Climate_PC3","network_size", "Species richness", 'Perennial herb %', "Polyploid percent","highest_pollinator_rank_index"]].notna().sum()

EcoRegion_name                   1465
EcoRegion                        1452
Longitude                        1321
Latitude                         1465
Annual Mean Temperature          1331
Climate_PC1                      1312
Climate_PC2                      1312
Climate_PC3                      1312
network_size                     1621
Species richness                 1312
Perennial herb %                 1312
Polyploid percent                1312
highest_pollinator_rank_index    1531
dtype: int64

In [22]:
networks_metadata.query("included_in_analysis and network_type != 'binarized_weighted'")[["EcoRegion_name", "EcoRegion", "Longitude", "Latitude","Climate_PC1", "Climate_PC2", "Climate_PC3","network_size", "Species richness", 'Perennial herb %', "Polyploid percent","highest_pollinator_rank_index"]].notna().sum()

EcoRegion_name                   303
EcoRegion                        302
Longitude                        288
Latitude                         303
Climate_PC1                      276
Climate_PC2                      276
Climate_PC3                      276
network_size                     305
Species richness                 276
Perennial herb %                 276
Polyploid percent                276
highest_pollinator_rank_index    305
dtype: int64

In [24]:
networks_metadata.loc[networks_metadata['species_richness'].isna(), "species_richness"] = networks_metadata['Species richness']
networks_metadata = networks_metadata.drop(["Species richness"], axis=1)

## complement missing biome and area based on ecoregion, with a flag of low_accuracy

In [11]:
ecoregions_area_data = pd.read_csv(ecoregion_metadata_path_3).drop(["Unnamed: 0"], axis=1).sort_values(["ECO_ID", "BIOME"]).drop_duplicates(["ECO_ID"]).set_index("ECO_ID")
ecoregion_to_area = ecoregions_area_data.AREA.to_dict() 
ecoregion_to_biome = ecoregions_area_data.BIOME.to_dict() 
networks_metadata["low_area_accuracy"] = 0
networks_metadata.loc[networks_metadata.area.isna() & networks_metadata.EcoRegion.notna(), "low_area_accuracy"] = 1
networks_metadata.loc[networks_metadata.area.isna() & networks_metadata.EcoRegion.notna(), "area"] = networks_metadata.loc[networks_metadata.area.isna() & networks_metadata.EcoRegion.notna(), "EcoRegion"].apply(lambda e: ecoregion_to_area.get(int(e), np.nan))
networks_metadata.loc[networks_metadata.area.isna() & networks_metadata.EcoRegion.notna(), "biome"] = networks_metadata.loc[networks_metadata.area.isna() & networks_metadata.EcoRegion.notna(), "EcoRegion"].apply(lambda e: ecoregion_to_biome.get(int(e), np.nan))

In [13]:
networks_metadata.to_csv(networks_metadata_path, index=False)