In [31]:
import pandas as pd
import numpy as np

In [32]:
# Every raw metadata table sits in the same directory, so the prefix is written once.
_metadata_dir = "../../data/features/plant_features/metadata/"

metadata_path_1 = _metadata_dir + "Database_BioGeo_mating.csv"
metadata_path_2 = _metadata_dir + "SpeciesTmData_Whitehead_etal.csv"
metadata_path_3 = _metadata_dir + "Mixed mating outcrossing rate database.xls"
# Not loaded yet: "Dena_Grossenbacher_SI-SC_nph14534-sup-0002-tabless1-s3._with_counts.xlsx"
# (same directory) is a union of references, but it has not been parsed yet and requires OCR.
metadata_path_4 = _metadata_dir + "Outcrossing rates and floral display for species in Gleason and Cronquist.xls"

In [33]:
# Harmonised column name -> {metadata source number: column header used in that source}.
# The source numbers follow metadata_path_N / parse_metadata_N; parse_metadata_1 looks up
# entry 1 and parse_metadata_3 looks up entry 3. Entry 2 of "cot" is NaN, i.e. no header.
columns_map = dict(
    cot={1: "m.d.g", 2: np.nan, 3: "Monocot/Dicot/Gymno"},
    life_form={1: "life.history", 3: "Lform(0=ann,1=Hper,2=Wper)"},
)

In [35]:
def parse_metadata_1(path: str, columns_map: dict = columns_map) -> pd.DataFrame:
    """Load the first metadata table (CSV) and return it with harmonised columns.

    Steps:
      1. Read the CSV and build a lower-case ``taxon`` key from ``genus`` and
         ``species`` (underscores become spaces).
      2. Rename the source-specific headers listed under index ``1`` of
         ``columns_map`` to their harmonised names, then normalise every
         column name (``.`` -> ``_``, lower-case).
      3. Decode the integer codes of ``biome``, ``growth``, ``life_form`` and
         ``si`` into text labels.
      4. Keep a fixed subset of columns and tag every row with its source.

    Args:
        path: Location of the CSV file.
        columns_map: Mapping ``harmonised name -> {source number: header}``;
            only entry ``1`` of each inner mapping is read here.

    Returns:
        A new DataFrame with the selected columns plus a constant ``source``
        column set to ``"Database_BioGeo_mating"``.

    Raises:
        KeyError: If an expected column is missing from the file, or an inner
            mapping of ``columns_map`` has no entry ``1``.
    """
    biome_labels = {1: "Desert/arid scrub",
                    2: "Temperate grasslands",
                    3: "Shrubland/chaparral",
                    4: "Taiga",
                    5: "Tropical savannah",
                    6: "Temperate deciduous forest and rainforest",
                    7: "Tropical seasonal forest",
                    8: "Tropical rainforest"}
    # NOTE(review): "Schrub" looks like a misspelling of "Shrub"; the label is kept
    # unchanged because previously written outputs may already contain it.
    growth_labels = {1: "Herbaceous",
                     2: "Vine",
                     3: "Schrub",
                     4: "Woody",
                     5: "Varies"}
    life_form_labels = {1: "Annual",
                        2: "Biennial",
                        3: "Semelparous perennial",
                        4: "Iteroparous perennial",
                        5: "Varies"}
    si_labels = {0: "Self-incompatible",
                 1: "Self-compatible"}
    output_columns = ["taxon", "genus", "family", "cot", "latitude", "hemisphere",
                      "biome", "growth", "life_form", "si", "mean_tm", "references"]

    df = pd.read_csv(path, encoding="unicode_escape")
    df["taxon"] = (df["genus"] + " " + df["species"]).str.lower().str.replace("_", " ")

    # Source header -> harmonised name, for this source (index 1 in columns_map).
    source_to_harmonised = {columns_map[name][1]: name for name in columns_map}
    df = df.rename(columns=source_to_harmonised)
    df = df.rename(columns={col: col.replace(".", "_").lower() for col in df.columns})

    df["biome"] = df["biome"].replace(biome_labels)
    df["growth"] = df["growth"].replace(growth_labels)
    df["life_form"] = df["life_form"].replace(life_form_labels)
    df["si"] = df["si"].replace(si_labels)

    # Copy the column subset so that adding "source" writes to an independent frame
    # instead of a slice of the full table (avoids SettingWithCopyWarning).
    df = df[output_columns].copy()
    df["source"] = "Database_BioGeo_mating"
    return df

# Parse the first metadata source into its harmonised table.
metadata1 = parse_metadata_1(path=metadata_path_1)

In [36]:
def parse_metadata_2(path: str, columns_map: dict = columns_map) -> pd.DataFrame:
    """Load the second metadata table (CSV) and return its harmonised subset.

    The ``taxon`` key is the ``species`` column lower-cased with underscores
    replaced by spaces. Only ``taxon``, ``genus``, ``family`` and ``mean_tm``
    are kept.

    Args:
        path: Location of the CSV file.
        columns_map: Unused by this parser; retained so existing callers keep
            working.

    Returns:
        A new DataFrame with the selected columns plus a constant ``source``
        column set to ``"SpeciesTmData_Whitehead_etal"``.

    Raises:
        KeyError: If an expected column is missing from the file.
    """
    df = pd.read_csv(path)
    df["taxon"] = df["species"].str.lower().str.replace("_", " ")

    # Copy the column subset so that adding "source" writes to an independent frame
    # instead of a slice of the full table (avoids SettingWithCopyWarning).
    df = df[["taxon", "genus", "family", "mean_tm"]].copy()
    df["source"] = "SpeciesTmData_Whitehead_etal"
    return df

# Parse the second metadata source into its harmonised table.
metadata2 = parse_metadata_2(path=metadata_path_2)

In [37]:
def parse_metadata_3(path: str,  columns_map: dict = columns_map) -> pd.DataFrame:
    """Load the third metadata table (Excel) and return it with harmonised columns.

    Steps:
      1. Read the sheet and build a lower-case ``taxon`` key from the
         ``Genus species`` column (underscores become spaces).
      2. Rename the source-specific headers listed under index ``3`` of
         ``columns_map`` to their harmonised names, then normalise every
         column name (``.`` and ``-`` -> ``_``, lower-case).
      3. Decode ``life_form`` and the pollination codes into text labels and
         expand the abbreviations of the selfing mechanism.
      4. Keep a fixed subset of columns and tag every row with its source.

    Args:
        path: Location of the Excel file.
        columns_map: Mapping ``harmonised name -> {source number: header}``;
            only entry ``3`` of each inner mapping is read here.

    Returns:
        A new DataFrame with the selected columns plus a constant ``source``
        column set to ``"Mixed mating outcrossing rate database"``.

    Raises:
        KeyError: If an expected column is missing from the file, or an inner
            mapping of ``columns_map`` has no entry ``3``.
    """
    life_form_labels = {0: "Annual",
                        1: "Herbaceous",
                        2: "Woody"}
    pollination_labels = {0: "Animal",
                          1: "Wind",
                          2: "Water"}
    # NOTE(review): 'facil, auton, geit' maps to the same label as 'auton,geit', so the
    # 'facil' component is dropped — confirm this is intended.
    selfing_labels = {"auton": "Autonomous",
                      "geit": "Geitonogamous",
                      "none": "No selfing",
                      0: np.nan,
                      "cleis": "Cleistogamous",
                      'auton,geit': "Autonomous, Geitonogamous",
                      'facil, auton, geit': "Autonomous, Geitonogamous"}
    output_columns = ["taxon", "genus", "family", "cot", "life_form",
                      "pollination_mode", "selfing_mode", "mean_tm"]

    df = pd.read_excel(path)
    # Built before the renames below, while the header is still "Genus species".
    df["taxon"] = df["Genus species"].str.lower().str.replace("_", " ")

    # Source header -> harmonised name, for this source (index 3 in columns_map).
    source_to_harmonised = {columns_map[name][3]: name for name in columns_map}
    df = df.rename(columns=source_to_harmonised)
    df = df.rename(columns={col: col.replace(".", "_").replace("-", "_").lower() for col in df.columns})

    df["life_form"] = df["life_form"].replace(life_form_labels)
    # The headers below are already lower-cased by the rename above.
    df["pollination_mode"] = df["pollination(0=anim,1=wind,2=water)"].replace(pollination_labels)
    df["selfing_mode"] = df["mech of selfing"].replace(selfing_labels)

    # Copy the column subset so that adding "source" writes to an independent frame
    # instead of a slice of the full table (avoids SettingWithCopyWarning).
    df = df[output_columns].copy()
    df["source"] = "Mixed mating outcrossing rate database"
    return df
    
# Parse the third metadata source into its harmonised table.
metadata3 = parse_metadata_3(path=metadata_path_3)

In [42]:
def parse_metadata_4(path: str, columns_map: dict = columns_map) -> pd.DataFrame:
    """Load the fourth metadata table (Excel) and return its harmonised subset.

    The ``taxon`` key is the ``species`` column lower-cased with underscores
    replaced by spaces. ``outcrossing`` is renamed to ``mean_tm`` and
    ``fl size`` to ``flower_size``; only these three columns are kept.

    Args:
        path: Location of the Excel file.
        columns_map: Unused by this parser; retained so existing callers keep
            working.

    Returns:
        A new DataFrame with ``taxon``, ``mean_tm``, ``flower_size`` and a
        constant ``source`` column set to ``"Ferguson_2016"``.

    Raises:
        KeyError: If an expected column is missing from the file.
    """
    df = pd.read_excel(path)
    df["taxon"] = df["species"].str.lower().str.replace("_", " ")
    df = df.rename(columns={"outcrossing": "mean_tm", "fl size": "flower_size"})

    # Copy the column subset so that adding "source" writes to an independent frame
    # instead of a slice of the full table (avoids SettingWithCopyWarning).
    df = df[["taxon", "mean_tm", "flower_size"]].copy()
    df["source"] = "Ferguson_2016"
    return df

# Parse the fourth metadata source into its harmonised table.
metadata4 = parse_metadata_4(path=metadata_path_4)

In [43]:
# Stack the four per-source tables row-wise into one long table.
complete_metadata = pd.concat(
    objs=[metadata1, metadata2, metadata3, metadata4],
    axis=0,
)

In [46]:
categorigal_cols = ['genus', 'family', 'cot', 'hemisphere', 'biome', 'source', 'references', 'growth', 'life_form', 'si', 'pollination_mode', 'selfing_mode', 'flower_size']


def _collapse_unique(values: pd.Series):
    """Reduce one taxon's values of a column to a single cell.

    Returns NaN when every value is missing, the value itself when exactly one
    distinct non-missing value exists, and the array of distinct non-missing
    values otherwise.
    """
    distinct = values.dropna().unique()  # computed once per group
    if len(distinct) > 1:
        return distinct
    if len(distinct) == 1:
        return distinct[0]
    return np.nan


# One row per taxon: descriptive columns are collapsed to their distinct values,
# while the outcrossing rate is averaged over all sources reporting the taxon.
agg_dict = {c: _collapse_unique for c in categorigal_cols}
# The string alias selects the built-in groupby mean; passing np.mean is deprecated
# in recent pandas releases.
agg_dict["mean_tm"] = "mean"
complete_metadata_by_taxon = complete_metadata.groupby("taxon").agg(agg_dict).reset_index()

In [47]:
# Share of taxa with a non-missing value, per column of the joined table.
complete_metadata_by_taxon.notna().sum().div(complete_metadata_by_taxon.shape[0])

taxon               1.000000
genus               0.986945
family              0.986945
cot                 0.915144
hemisphere          0.626632
biome               0.480418
source              1.000000
references          0.532637
growth              0.642298
life_form           0.912533
si                  0.433420
pollination_mode    0.468668
selfing_mode        0.045692
flower_size         0.078329
mean_tm             1.000000
dtype: float64

In [48]:
# Persist the per-taxon table next to the raw metadata files (the index is written too).
complete_metadata_by_taxon.to_csv(
    path_or_buf="../../data/features/plant_features/metadata/joined_metadata.csv"
)