In [3]:
import pandas as pd
import numpy as np

In [4]:
# Input files, given relative to the notebook's working directory.
networks_metadata_path = "../../data/networks/all/networks_metadata.csv"
parsed_geodata_path = "../../data/networks/all/networks_unparsed_geodata.csv"

# Descriptions of the BioClim variables, listed in order BioClim_1 .. BioClim_19.
bioclim_descriptions = [
    "Annual Mean Temperature",
    "Mean Diurnal Range (Mean of monthly (max temp - min temp))",
    "Isothermality (BioClim_2/BioClim_7) (×100)",
    "Temperature Seasonality (standard deviation ×100)",
    "Max Temperature of Warmest Month",
    "Min Temperature of Coldest Month",
    "Temperature Annual Range (BioClim_5-BioClim_6)",
    "Mean Temperature of Wettest Quarter",
    "Mean Temperature of Driest Quarter",
    "Mean Temperature of Warmest Quarter",
    "Mean Temperature of Coldest Quarter",
    "Annual Precipitation",
    "Precipitation of Wettest Month",
    "Precipitation of Driest Month",
    "Precipitation Seasonality (Coefficient of Variation)",
    "Precipitation of Wettest Quarter",
    "Precipitation of Driest Quarter",
    "Precipitation of Warmest Quarter",
    "Precipitation of Coldest Quarter",
]

# Maps the geodata column names to the descriptive column names used in the
# metadata table; the last two columns keep their names.
varnames = {
    f"BioClim_{number}": description
    for number, description in enumerate(bioclim_descriptions, start=1)
}
varnames.update({"species_richness": "species_richness",
                 "EcoRegion_name": "EcoRegion_name"})

## parse data

In [61]:
# Load the networks metadata and remove every column that is re-derived below
# (the descriptive geodata columns), together with "Unnamed*", "geom*" and "X*"
# columns, so that the merge in the next cell starts from a clean table.
# The previous version misspelled "Unnamed" in the drop list and compensated
# with a second filter; a single, correctly spelled filter does the same job.
networks_metadata = pd.read_csv(networks_metadata_path)
derived_columns = set(varnames.values())
drop = [c for c in networks_metadata.columns
        if "Unnamed" in c or c.startswith("geom") or c.startswith("X") or c in derived_columns]
networks_metadata = networks_metadata.drop(columns=drop)

raw_geodata = pd.read_csv(parsed_geodata_path)

In [62]:
# Attach the geodata columns listed in `varnames` (left join on ID == Obs_ID, so
# networks without a geodata record keep NaN) and give them descriptive names.
geodata_columns = ["Obs_ID", *varnames]
networks_metadata = (
    networks_metadata
    .merge(raw_geodata[geodata_columns], left_on="ID", right_on="Obs_ID", how="left")
    .rename(columns=varnames)
)

In [63]:
# Share of networks with a non-missing value in each geographic / climatic column.
geo_columns = ["Longitude", "Latitude"] + list(varnames.values())
networks_metadata[geo_columns].notna().sum() / len(networks_metadata)

Longitude                                                     0.506289
Latitude                                                      0.506289
Annual Mean Temperature                                       0.410901
Mean Diurnal Range (Mean of monthly (max temp - min temp))    0.410901
Isothermality (BioClim_2/BioClim_7) (×100)                    0.410901
Temperature Seasonality (standard deviation ×100)             0.410901
Max Temperature of Warmest Month                              0.410901
Min Temperature of Coldest Month                              0.410901
Temperature Annual Range (BioClim_5-BioClim_6)                0.410901
Mean Temperature of Wettest Quarter                           0.410901
Mean Temperature of Driest Quarter                            0.410901
Mean Temperature of Warmest Quarter                           0.410901
Mean Temperature of Coldest Quarter                           0.410901
Annual Precipitation                                          0.410901
Precip

In [64]:
# Back-fill missing `location` values from other networks that share the same
# coordinates (rounded to 3 decimals).
# Only rows with BOTH coordinates present take part: formatting a missing
# coordinate would give every such row the same "(nan,nan)" key, so networks
# without coordinates could inherit the location of an unrelated network.
rounded_coords = networks_metadata[["Longitude", "Latitude"]].round(3)
has_coords = rounded_coords.notna().all(axis=1)
coord = (
    "(" + rounded_coords["Longitude"].astype(str) + "," + rounded_coords["Latitude"].astype(str) + ")"
).where(has_coords)

# When several networks share a coordinate, the last one listed wins (the same
# precedence as building the dict from a coordinate-indexed Series).
has_known_location = has_coords & networks_metadata["location"].notna()
coord_to_loc = dict(zip(coord[has_known_location],
                        networks_metadata.loc[has_known_location, "location"]))

# Rows without coordinates have a NaN key, which maps to NaN and fills nothing.
networks_metadata["location"] = networks_metadata["location"].fillna(coord.map(coord_to_loc))

In [70]:
# Networks for which both coordinates are missing.
no_coordinates = networks_metadata["Longitude"].isna() & networks_metadata["Latitude"].isna()
print(f"# networks with no location = {networks_metadata[no_coordinates].shape[0]:,}")

# Networks that have a longitude but no ecoregion name. The frame is kept in
# `missing_ecoregions_data` because a later cell uses it for complementation.
ecoregion_missing = networks_metadata["EcoRegion_name"].isna() & networks_metadata["Longitude"].notna()
missing_ecoregions_data = networks_metadata.loc[
    ecoregion_missing, ["Longitude", "Latitude", "location", "EcoRegion_name"]
]
print(f"# coordinates with no ecoregions but available location = {missing_ecoregions_data.shape[0]:,}")

# Collapse to unique (Longitude, Latitude, location, EcoRegion_name) combinations.
missing_ecoregions_data = missing_ecoregions_data.drop_duplicates()
print(f"# coordinates with no ecoregions = {missing_ecoregions_data.shape[0]:,}")

# netwokrs with no location = 942
# coordinates with no ecoregions but available location = 182
# coordinates with no ecoregions = 17


In [71]:
# Persist the updated metadata. `index=False` keeps the positional index out of
# the file, so reloading it does not add an extra "Unnamed: 0" column.
networks_metadata.to_csv(networks_metadata_path, index=False)

# manually complement the ecoregions of samples with missing ecoregion and available coordinates

In [190]:
# Manual complementation: every network whose coordinates fall inside the box
# 124 <= Longitude <= 125, 1 <= Latitude <= 2 is assigned the Bunaken location.
networks_metadata = pd.read_csv(networks_metadata_path)
in_bunaken_box = (
    networks_metadata["Longitude"].between(124, 125)
    & networks_metadata["Latitude"].between(1, 2)
)
networks_metadata.loc[in_bunaken_box, "location"] = "Desa/Dusun/Kelurahan Bunaken Kepulauan, Manado, Sulawesi Utara 95231, Bunaken Kepulauan Manado Indonesia"

In [192]:
# Manual complementation: ecoregion names looked up by location, applied only
# where the ecoregion name is still missing.
location_to_ecoregion = {
    "Desa/Dusun/Kelurahan Bunaken Kepulauan, Manado, Sulawesi Utara 95231, Bunaken Kepulauan Manado Indonesia": "Wallacea",
    "Morne Seychellois National Park, Mahé": "Seychelles moist forests",
    "Black River Gorges National Park, Mauritius": "Madagascar and Indian Ocean Islands forests",
    "Denmark": "North Atlantic moist mixed forests",
    "Galapagos": "Galapagos Islands xeric scrub",
}
# Locations absent from the mapping (or missing) yield NaN and fill nothing.
ecoregion_from_location = networks_metadata["location"].map(location_to_ecoregion)
networks_metadata["EcoRegion_name"] = networks_metadata["EcoRegion_name"].fillna(ecoregion_from_location)

In [7]:
# Manual complementation of two individual rows, addressed by index label.
# NOTE(review): labels 436 and 1485 presumably refer to the default row index of
# the loaded CSV — confirm they still point at the intended networks if the
# file's row order changes.
for row_label in (436, 1485):
    networks_metadata.loc[row_label, "location"] = "The western-most island of the Azores"
    networks_metadata.loc[row_label, "EcoRegion_name"] = "Macaronesia forests"

In [45]:
# Black River Gorges (Mauritius): set both the ecoregion code and its name.
is_black_river = networks_metadata.location == 'Black River Gorges National Park, Mauritius'
networks_metadata.loc[is_black_river, 'EcoRegion'] = 30120
networks_metadata.loc[is_black_river, 'EcoRegion_name'] = 'Mascarene forests'

# Ecoregion codes assigned by ecoregion name.
ecoregion_code_by_name = {
    "North Atlantic moist mixed forests": 80429,
    'Arctic tundra': 51114,
}
for ecoregion_name, ecoregion_code in ecoregion_code_by_name.items():
    networks_metadata.loc[networks_metadata.EcoRegion_name == ecoregion_name, 'EcoRegion'] = ecoregion_code

In [75]:
# Fill missing cells of `networks_metadata` with the values recorded for the
# same (Longitude, Latitude) pair in `missing_ecoregions_data`.
# Both frames are indexed on copies instead of in place: the in-place version
# left `missing_ecoregions_data` without its coordinate columns, so re-running
# the cell raised a KeyError.
# NOTE(review): `missing_ecoregions_data` was selected on EcoRegion_name being
# null, so unless it was edited in between this can only back-fill `location`.
coordinate_columns = ["Longitude", "Latitude"]
networks_by_coordinates = networks_metadata.set_index(coordinate_columns)
fill_values = (
    missing_ecoregions_data
    .set_index(coordinate_columns)
    .reindex(networks_by_coordinates.index)
)
networks_metadata = networks_by_coordinates.fillna(fill_values).reset_index()

In [79]:
# Networks for which both coordinates are missing.
lacks_coordinates = networks_metadata["Longitude"].isna() & networks_metadata["Latitude"].isna()
print(f"# networks with no location = {int(lacks_coordinates.sum()):,}")

# Networks without an ecoregion name (whether or not they have coordinates).
lacks_ecoregion = networks_metadata["EcoRegion_name"].isna()
print(f"# coordinates with no ecoregions = {int(lacks_ecoregion.sum()):,}")

# netwokrs with no location = 942
# coordinates with no ecoregions = 959


In [56]:
# Share of networks with a non-missing value per column, now including EcoRegion.
coverage_columns = ["Longitude", "Latitude", "EcoRegion"] + list(varnames.values())
networks_metadata[coverage_columns].notna().sum() / len(networks_metadata)

Longitude                                                     0.506289
Latitude                                                      0.506289
EcoRegion                                                     0.500000
Annual Mean Temperature                                       0.493711
Mean Diurnal Range (Mean of monthly (max temp - min temp))    0.493711
Isothermality (BioClim_2/BioClim_7) (×100)                    0.493711
Temperature Seasonality (standard deviation ×100)             0.493711
Max Temperature of Warmest Month                              0.493711
Min Temperature of Coldest Month                              0.493711
Temperature Annual Range (BioClim_5-BioClim_6)                0.493711
Mean Temperature of Wettest Quarter                           0.493711
Mean Temperature of Driest Quarter                            0.493711
Mean Temperature of Warmest Quarter                           0.493711
Mean Temperature of Coldest Quarter                           0.493711
Annual

# report geo stats on networks included in the analysis only

In [72]:
# Reload the metadata and discard "Unnamed*" columns (index columns written by
# earlier saves). The filter previously looked for the misspelled "Unnmamed"
# and therefore removed nothing.
networks_metadata = pd.read_csv(networks_metadata_path)
networks_metadata = networks_metadata[[c for c in networks_metadata.columns if "Unnamed" not in c]]

# Inclusion criteria for the analysis.
max_missing_threshold = 0.5     # upper bound on is_polyploid_missing_frac
min_classified_threshold = 5    # lower bound on num_classified
min_poly_threhold = 2           # lower bound on num_polyploids
min_di_threshold = 2            # lower bound on num_diploids

# A network is included only if it satisfies all four criteria; a missing value
# in any of the four columns compares as False and excludes the network.
networks_metadata["included_in_analysis"] = (
    (networks_metadata["is_polyploid_missing_frac"] <= max_missing_threshold)
    & (networks_metadata["num_classified"] >= min_classified_threshold)
    & (networks_metadata["num_polyploids"] >= min_poly_threhold)
    & (networks_metadata["num_diploids"] >= min_di_threshold)
)

In [73]:
# Keep the networks flagged for analysis, then split them by network type:
# 'weighted' versus everything else (reported as binary).
analyzed_networks_metadata = networks_metadata[networks_metadata["included_in_analysis"]]
is_weighted = analyzed_networks_metadata["network_type"] == 'weighted'
weighted_analyzed_networks_metadata = analyzed_networks_metadata[is_weighted]
binary_analyzed_networks_metadata = analyzed_networks_metadata[~is_weighted]

print(f"# networks included in the binary analysis = {len(binary_analyzed_networks_metadata):,}")
print(f"# networks included in the weighted analysis = {len(weighted_analyzed_networks_metadata):,}")

# networks included in the binary analysis = 375
# networks included in the weighted analysis = 357


In [74]:
def _count_without_coordinates(frame):
    """Return the number of rows of `frame` where both Longitude and Latitude are missing."""
    return int((frame["Longitude"].isna() & frame["Latitude"].isna()).sum())


print(f"# weighted networks with no location = {_count_without_coordinates(weighted_analyzed_networks_metadata):,}")
print(f"# binary networks with no location = {_count_without_coordinates(binary_analyzed_networks_metadata):,}")

# Binary networks that have a longitude but no ecoregion name.
located_without_ecoregion = (
    binary_analyzed_networks_metadata["Longitude"].notna()
    & binary_analyzed_networks_metadata["EcoRegion_name"].isna()
)
print(f"# coordinates with no ecoregions = {int(located_without_ecoregion.sum()):,}")

# weighted networks with no location = 196
# binary networks with no location = 196
# coordinates with no ecoregions = 0


## unite parsed data to all networks metadata

In [None]:
# Reload the networks metadata and load the parsed rmangal metadata.
# NOTE(review): `parsed_rmangal_metadata_path` is not defined in any cell visible
# here — confirm it is assigned before this cell runs on a fresh kernel.
networks_metadata = pd.read_csv(networks_metadata_path)
mangal_metadata = pd.read_csv(parsed_rmangal_metadata_path)

In [None]:
# Restrict to the networks whose source is 'mangal' and attach their parsed
# metadata (left join on network_index, so unmatched networks are kept).
is_mangal = networks_metadata["source"] == 'mangal'
mangal_networks_metadata = networks_metadata[is_mangal].merge(
    mangal_metadata, on="network_index", how="left"
)

In [None]:
# Remove the "Unnamed*" columns from the merged mangal table.
unnamed_columns = [name for name in mangal_networks_metadata.columns if "Unnamed" in name]
mangal_networks_metadata = mangal_networks_metadata.drop(columns=unnamed_columns)

In [None]:
# Copy the mangal-specific values into the full metadata table, matching rows
# on (network_type, network_index).
index_columns = ["network_type", "network_index"]

# Create the mangal-only columns in the order they appear in the mangal table;
# iterating a set difference gave an arbitrary column order that could differ
# from run to run.
for c in [c for c in mangal_networks_metadata.columns if c not in networks_metadata.columns]:
    networks_metadata[c] = np.nan

# Index copies rather than mutating in place, so `mangal_networks_metadata`
# keeps its key columns and this cell can be re-run.
networks_metadata = networks_metadata.set_index(index_columns)
networks_metadata.update(mangal_networks_metadata.set_index(index_columns))
networks_metadata = networks_metadata[[c for c in networks_metadata.columns if "Unnamed" not in c]]
networks_metadata = networks_metadata.reset_index()

In [None]:
# Persist the united metadata. `index=False` keeps the positional index out of
# the file, so reloading it does not add an extra "Unnamed: 0" column.
networks_metadata.to_csv(networks_metadata_path, index=False)