In [12]:
import numpy as np
import pandas as pd
import os
pd.set_option('display.max_colwidth', None)

In [2]:
networks_metadata_path = "../../data/networks/networks_metadata.csv"
networks_dir = "../../data/networks/all/"

In [4]:
networks_metadata = pd.read_csv(networks_metadata_path)
networks_metadata = networks_metadata.rename(columns={"duplicated_network": "mangal_duplicated_network"})

# report allegendly duplicated networks from mangal based on the data url

In [8]:
debug = networks_metadata.loc[(networks_metadata.mangal_duplicated_network == True) & (networks_metadata.data_url.str.contains("interactionweb", na=False)) | (networks_metadata.data_url.str.contains("web-of-life", na=False))][["processed_path", "Longitude", "Latitude", "data_url", "num_plants", "num_pollinators"]].sort_values(["num_plants", "num_pollinators"]).drop_duplicates()
debug

Unnamed: 0,processed_path,Longitude,Latitude,data_url,num_plants,num_pollinators
785,../../data/networks/all/binary/65.csv,124.8640,-2.160885,https://www.nceas.ucsb.edu/interactionweb/data...,2,3
773,../../data/networks/all/binary/53.csv,124.7780,1.787000,https://www.nceas.ucsb.edu/interactionweb/data...,2,4
772,../../data/networks/all/binary/52.csv,124.7040,1.616000,https://www.nceas.ucsb.edu/interactionweb/data...,3,2
783,../../data/networks/all/binary/63.csv,124.7350,1.608000,https://www.nceas.ucsb.edu/interactionweb/data...,3,3
784,../../data/networks/all/binary/64.csv,124.7930,1.767000,https://www.nceas.ucsb.edu/interactionweb/data...,3,4
...,...,...,...,...,...,...
790,../../data/networks/all/binary/70.csv,-84.8000,9.298777,http://www.web-of-life.es/map.php?type=5,169,40
351,../../data/networks/all/binarized_weighted/351...,-48.4069,-23.489215,http://www.web-of-life.es/map.php?type=5,198,60
1252,../../data/networks/all/weighted/351.csv,-48.4069,-23.489215,http://www.web-of-life.es/map.php?type=5,198,60
290,../../data/networks/all/binarized_weighted/290...,-89.8818,38.029652,https://www.nceas.ucsb.edu/interactionweb/html...,366,963


# detect duplications by first selecting suspects of duplicated networks based on networl size and then directly checking if the networks are indentical within each group of suspects

In [46]:
def is_equal_net(net1, net2):
    same_plants = set(net1.Plant.str.lower()) == set(net2.Plant.str.lower())
    same_pollinators = set([s.lower() for s in net1.columns if s != "Plant"]) == set([s.lower() for s in net2.columns if s != "Plant"])
    try:
        same_inter = np.all(net1.set_index("Plant").to_numpy() == net2.set_index("Plant"))
    except:
        same_inter = False
    return same_plants & same_pollinators & same_inter
    
    
def get_duplicated_clusters(network_paths):
    path_to_net = {path:pd.read_csv(path) for path in network_paths}
    for p in path_to_net:
        if "Unnamed: 0" in path_to_net[p]:
            path_to_net[p] = path_to_net[p].drop(["Unnamed: 0"], axis=1)
    dups = []
    for i in range(len(network_paths)):
        for j in range(i+1, len(network_paths)):
            if is_equal_net(path_to_net[network_paths[i]], path_to_net[network_paths[j]]):
                dups.append((network_paths[i], network_paths[j]))
    return dups
                


debug = networks_metadata.groupby(["network_type", "num_plants", "num_pollinators"])[["network_index", "path", "processed_path"]].agg({"network_index": lambda n: len(n),
                                                                                                                                                         "path": lambda n: list(n),
                                                                                                                                                         "processed_path": lambda n: list(n)}).reset_index()
debug["from_oik07303"] = debug.path.astype(str).str.contains("oik.07303", na=False)
debug["duplicated_networks"] = debug.processed_path.apply(get_duplicated_clusters)

In [56]:
debug.loc[debug.duplicated_networks.astype(str) != "[]"]
paths_to_remove = debug.loc[debug.duplicated_networks.astype(str) != "[]"].path.apply(lambda x: x[1:]).tolist()
paths_to_remove = [item for sublist in paths_to_remove for item in sublist]
paths_to_remove

['../../data/networks//literature/weighted/1_Aigrettes_processed.csv',
 '../../data/networks//mangal/weighted/934.csv',
 '../../data/networks//web_of_life/weighted/M_PL_011.csv',
 '../../data/networks//literature/binary/10.1055_s-0037-1602098_processed.csv',
 '../../data/networks//literature/binary/2_processed.csv',
 '../../data/networks//literature/weighted/1_Aigrettes_processed.csv',
 '../../data/networks//mangal/weighted/934.csv',
 '../../data/networks//web_of_life/weighted/M_PL_011.csv']

In [57]:
networks_metadata = networks_metadata.loc[~networks_metadata.path.isin(paths_to_remove)]

In [58]:
print(f"# remaining networks per type: ")
networks_metadata.groupby("network_type")["network_index"].count()

# remaining networks per type: 


network_type
binarized_weighted    716
binary                179
weighted              716
Name: network_index, dtype: int64

In [59]:
print(f"# remaining unique networks of all original types = {networks_metadata.loc[networks_metadata.network_type != 'binarized_weighted'].shape[0]:,}")

# remaining unique networks of all original types = 895


In [60]:
networks_metadata.to_csv(networks_metadata_path, index=False)

# less duplicates were detected compared to the ones reported as duplicates from mangal, so allegedly duplicated from mangal will be checked manuall

In [37]:
mangal_dups = networks_metadata.loc[(networks_metadata.mangal_duplicated_network == True) & (networks_metadata.data_url.str.contains("web-of-life", na=False)) & (networks_metadata.network_type == "weighted")][["processed_path", "EcoRegion_name", "num_plants", "num_pollinators"]].sort_values(["num_plants", "num_pollinators"]).drop_duplicates()
mangal_dups.iloc[:30]

Unnamed: 0,processed_path,EcoRegion_name,num_plants,num_pollinators
1174,../../data/networks/all/weighted/273.csv,seychelles moist forests,4,9
1345,../../data/networks/all/weighted/444.csv,seychelles moist forests,5,12
1357,../../data/networks/all/weighted/456.csv,seychelles moist forests,5,14
1376,../../data/networks/all/weighted/475.csv,seychelles moist forests,5,17
1295,../../data/networks/all/weighted/394.csv,seychelles moist forests,5,19
1377,../../data/networks/all/weighted/476.csv,seychelles moist forests,6,10
1202,../../data/networks/all/weighted/301.csv,seychelles moist forests,6,11
1337,../../data/networks/all/weighted/436.csv,seychelles moist forests,6,11
1350,../../data/networks/all/weighted/449.csv,seychelles moist forests,6,12
1383,../../data/networks/all/weighted/482.csv,seychelles moist forests,6,14


In [36]:
wof_nets = networks_metadata.loc[(networks_metadata.source == "web_of_life") & (networks_metadata.network_type == "weighted")][["processed_path", "EcoRegion_name", "num_plants", "num_pollinators"]].sort_values(["num_plants", "num_pollinators"]).drop_duplicates()
wof_nets.iloc[:30]

Unnamed: 0,processed_path,EcoRegion_name,num_plants,num_pollinators
1400,../../data/networks/all/weighted/499.csv,central forest-grasslands transition,7,33
1426,../../data/networks/all/weighted/525.csv,baltic mixed forests,8,42
1412,../../data/networks/all/weighted/511.csv,azores temperate mixed forests,10,12
1418,../../data/networks/all/weighted/517.csv,baltic mixed forests,10,40
1416,../../data/networks/all/weighted/515.csv,canary islands dry woodlands and forests,11,38
1393,../../data/networks/all/weighted/492.csv,galápagos islands scrubland mosaic,12,6
1423,../../data/networks/all/weighted/522.csv,mascarene forests,14,13
1390,../../data/networks/all/weighted/489.csv,canary islands dry woodlands and forests,14,35
1394,../../data/networks/all/weighted/493.csv,kalaallit nunaat low arctic tundra,15,39
1395,../../data/networks/all/weighted/494.csv,north atlantic moist mixed forests,16,44
