In [5]:
import pandas as pd
import numpy as np

In [6]:
network_type_to_plant_features_paths = {"binary": ["../../data/_features/plant_features/binary/features.csv", "../../data/_features/plant_features/binary/hubbiness.csv"], 
                                        "weighted": ["../../data/_features/plant_features/weighted/features.csv", "../../data/_features/plant_features/weighted/hubbiness.csv"],
                                        "binarized_weighted": ["../../data/_features/plant_features/binarized_weighted/features.csv", "../../data/_features/plant_features/binarized_weighted/hubbiness.csv"]}
                        
metadata_features_path = "../../data/_features/plant_features/metadata/joined_metadata.csv"
taxonomic_features_path = "../../data/_features/taxonomic_features/taxonomic_features.csv"
name_resolution_path = "../..//data/name_resolution/resolved_plant_names.csv"

In [7]:
name_resolution = pd.read_csv(name_resolution_path)

In [8]:
metadata_features = pd.read_csv(metadata_features_path)
metadata_features.taxon = metadata_features.taxon.apply(lambda n: n[:-1] if pd.notna(n) and n.endswith(" ") else n)
metadata_features.genus = metadata_features.genus.apply(lambda n: n[:-1] if pd.notna(n) and n.endswith(" ") else n).str.lower()
metadata_features = metadata_features.drop("genus", axis=1)

In [9]:
taxonomic_features = pd.read_csv(taxonomic_features_path)
taxonomic_data_by_name = taxonomic_features.groupby("original_name")[["taxonomic_rank", "genus"]].agg({"taxonomic_rank": lambda x: x.dropna().unique()[0] if len(x.dropna()) > 0 else np.nan,
                                                                                                       "genus": lambda x: x.values[0]}).reset_index()

In [26]:
for network_type in network_type_to_plant_features_paths:
    plant_features = pd.read_csv(network_type_to_plant_features_paths[network_type][0]).rename(columns={"Plant": "original_name"})
    if "hubbiness_score" not in plant_features.columns:
        hubbiness_feature = pd.read_csv(network_type_to_plant_features_paths[network_type][1]).rename(columns={"plant": "original_name"}).drop(["network_type"], axis=1)
    plant_features = plant_features.merge(hubbiness_feature, on=["original_name", "network_id"], how="left")
    plant_features.original_name = plant_features.original_name.str.lower()
    if "taxonomic_rank" not in plant_features.columns:
        plant_features = plant_features.merge(taxonomic_data_by_name,
                                              on="original_name",
                                              how="left")
    
    if "resolved_name" not in plant_features.columns:
        plant_features = plant_features.merge(name_resolution[["original_name", "resolved_name"]], 
                                              on="original_name", 
                                              how="left")
    
    if not "mean_tm" in plant_features.columns:
        plant_features = plant_features.merge(metadata_features,
                                              left_on="original_name",
                                              right_on="taxon",
                                              how="left")

        plant_features.set_index("resolved_name", inplace=True)
        plant_features.update(metadata_features)
        plant_features.reset_index(inplace=True)
    
    cols = set(plant_features.columns)
    drop_cols = set([c for c in plant_features.columns if "Unnamed" in c or (c.startswith("hubbiness_score") and not c.endswith("_score"))])
    plant_features[cols-drop_cols].to_csv(network_type_to_plant_features_paths[network_type][0], index=False)

  plant_features[cols-drop_cols].to_csv(network_type_to_plant_features_paths[network_type][0], index=False)
  plant_features[cols-drop_cols].to_csv(network_type_to_plant_features_paths[network_type][0], index=False)
  plant_features[cols-drop_cols].to_csv(network_type_to_plant_features_paths[network_type][0], index=False)


In [28]:
metadata_features = metadata_features[[c for c in metadata_features.columns if "Unnamed" not in c]]

In [29]:
classified_plant_features = plant_features
classified_plant_features[metadata_features.columns].notna().sum()/classified_plant_features.shape[0] * 100

taxon               2.614841
family              2.614841
cot                 2.449941
hemisphere          2.367491
biome               1.319199
source              2.614841
references          2.167256
growth              2.438163
life_form           2.449941
si                  1.967020
pollination_mode    1.272085
selfing_mode        0.011779
flower_size         0.235571
mean_tm             2.614841
dtype: float64

In [30]:
missing = set(classified_plant_features.original_name)-set(metadata_features.taxon)
len(missing)

2311

In [31]:
list(missing)[:10]

['heracleum candicans',
 'ceiba aesculifolia',
 'lupinus lepidus confertus',
 'premna serratifolia',
 'alstroemeria aurea',
 'weigela decora',
 'dryas integrifolia',
 'dichorisandra thyrsiflora',
 'potentilla brevifolia',
 'scabiosa lucida']