In [4]:
import pandas as pd
import numpy as np
from ete3 import Tree
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
do_weighted = True

relevant_features = ["partner.diversity", # shanon diversity
                     "d", # speciality index d',
                     "weighted.betweenness", # centrality,
                     "proportional.generality"] # generality

allmb_tree_path = "../../../data/trees/ALLMB.tre"
networks_dir = "../../../data/networks/all/"
species_features_paths = {"binary": "../../../data/features/plant_features/binary/features_with_classification.csv",
                          "binarized_weighted": "../../../data/features/plant_features/binarized_weighted/features_with_classification.csv",
                          "weighted": "../../../data/features/plant_features/weighted/features_with_classification.csv"}
classification_path = "../../../data/ploidy_classification/plant_classification.csv"
ploidb_path = "../../../data/ploidy_classification/ploidb_by_genus_without_missing.csv"

processed_data_path = f"../../../data/statistical_analysis/species_level/processed_features_with_ploidy_classification_on_{'un' if not do_weighted else ''}weighted_networks.csv"
processed_tree_path = f"../../../data/statistical_analysis/species_level/species_tree_on_{'un' if not do_weighted else ''}weighted_networks.nwk"
species_list_path = f"../../../data/statistical_analysis/species_level/species_list_{'un' if not do_weighted else ''}weighted_analysis.csv"

# process data

In [6]:
allmb_tree = Tree(allmb_tree_path, format=1)
for leaf in allmb_tree.get_leaves():
    leaf.name = leaf.name.lower().replace("_", " ")

In [7]:
relevant_columns = ["original_name", "network"] + relevant_features
weighted_data = pd.read_csv(species_features_paths["weighted"])[relevant_columns]
weighted_data["network_type"] = "weighted"
binarized_weighted_data = pd.read_csv(species_features_paths["binarized_weighted"])[relevant_columns]
binarized_weighted_data["network_type"] = "binarized_weighted"
binary_data = pd.read_csv(species_features_paths["weighted"])[relevant_columns]
binary_data["network_data"] = "binary"               
features = pd.concat([weighted_data,binarized_weighted_data,binary_data])
features.network = features.network.str.replace(".csv", "").astype(int)

  features.network = features.network.str.replace(".csv", "").astype(int)


In [8]:
classification = pd.read_csv(classification_path).rename(columns={"conservative_is_polyploid_by_resolved": "is_polyploid"})

In [9]:
features_with_classification = features.merge(classification[["original_name", "resolved_name", "is_polyploid"]], on="original_name", how="left")
features_with_classification.set_index("original_name", inplace=True)
features_with_classification["is_polyploid"].fillna(classification.set_index("original_name")["conservative_is_polyploid_by_original"].to_dict(), inplace=True)
features_with_classification.reset_index(inplace=True)

In [10]:
features_with_classification.groupby("is_polyploid", dropna=False)["original_name"].count() / features_with_classification.shape[0]*100

is_polyploid
0.0    25.201146
1.0    22.928686
NaN    51.870168
Name: original_name, dtype: float64

In [11]:
clean_features_with_classification = features_with_classification.dropna(subset=relevant_features+["is_polyploid"], how="any", axis=0)

In [12]:
clean_features_with_classification.groupby("is_polyploid", dropna=False)["original_name"].count() / clean_features_with_classification.shape[0]*100

is_polyploid
0.0    52.403139
1.0    47.596861
Name: original_name, dtype: float64

In [13]:
# missing_classifications = features_with_classification.loc[features_with_classification.is_polyploid.isna()].original_name.unique().tolist()

In [14]:
# # ask itay: maybe if we use less strict thresholds we will get more classifications
# ploidb = pd.read_csv(ploidb_path)
# ploidb.loc[ploidb.Taxon.isin(missing_classifications)][["Taxon", "Ploidy inference", "Ploidy transitions frequency"]]

In [15]:
print(f"# species from networks = {len(clean_features_with_classification.original_name.unique()):,}")
print(f"# species in tree = {len(allmb_tree.get_leaf_names()):,}")

# species from networks = 1,188
# species in tree = 356,305


## add network size data

In [16]:
if do_weighted:
    clean_features_with_classification = clean_features_with_classification.loc[clean_features_with_classification.network_type == "weighted"]
else:
    clean_features_with_classification = clean_features_with_classification.loc[clean_features_with_classification.network_type != "weighted"]

In [17]:
def get_network_size(net_path):
    try:
        net = pd.read_csv(net_path).set_index("Plant")
        num_plants = net.shape[0]
        num_pollinators = net.shape[1]
        return num_plants, num_pollinators
    except:
        return np.nan, np.nan

network_to_size = clean_features_with_classification[["network_type", "network"]].drop_duplicates()
network_to_size[["num_network_plants", "num_network_pollinators"]] = network_to_size.parallel_apply(lambda record: get_network_size(f"{networks_dir}{record.network_type}/{record.network}.csv"), axis=1, result_type="expand") 
    
clean_features_with_classification = clean_features_with_classification.merge(network_to_size, on=["network_type", "network"], how="left")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=22), Label(value='0 / 22'))), HBox…

In [18]:
tree_names = set(allmb_tree.get_leaf_names())
final_features_with_classification = clean_features_with_classification.loc[clean_features_with_classification.original_name.isin(tree_names) |
                                                                            clean_features_with_classification.resolved_name.isin(tree_names)]
final_features_with_classification["mapped_name"] = final_features_with_classification.apply(lambda record: record.original_name if record.original_name in tree_names else record.resolved_name, axis=1)
print(f"# species from networks for analysis = {len(final_features_with_classification.mapped_name.unique()):}")

# species from networks for analysis = 1026


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_features_with_classification["mapped_name"] = final_features_with_classification.apply(lambda record: record.original_name if record.original_name in tree_names else record.resolved_name, axis=1)


In [19]:
pruned_allmb_tree = allmb_tree.copy()
pruned_allmb_tree.prune(final_features_with_classification.mapped_name.unique().tolist(), preserve_branch_length=True)
print(f"# species in tree = {len(pruned_allmb_tree.get_leaf_names()):,}")

# species in tree = 1,026


In [20]:
final_features_with_classification["mapped_name"] = final_features_with_classification["mapped_name"].str.replace(" ", "_")
final_features_with_classification["sample_id"] = final_features_with_classification["mapped_name"] + "_" + final_features_with_classification["network_type"] + "_" + final_features_with_classification["network"].astype("str")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_features_with_classification["mapped_name"] = final_features_with_classification["mapped_name"].str.replace(" ", "_")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_features_with_classification["sample_id"] = final_features_with_classification["mapped_name"] + "_" + final_features_with_classification["network_type"] + "_" + final_features_with_classification["network"].astype("str")


In [21]:
final_features_with_classification.sample_id = final_features_with_classification.sample_id.str.replace(" ", "_")
for l in pruned_allmb_tree.get_leaves():
    l.name = l.name.replace(" ", "_")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_features_with_classification.sample_id = final_features_with_classification.sample_id.str.replace(" ", "_")


In [22]:
final_features_with_classification = final_features_with_classification.drop_duplicates("sample_id")

In [23]:
assert(len(final_features_with_classification.sample_id.unique().tolist()) == final_features_with_classification.shape[0])

In [24]:
# add bushes under each leaf
num_added_nodes =0
tree_leaves = pruned_allmb_tree.get_leaves()
for leaf in tree_leaves:
    children_names = final_features_with_classification.query(f"mapped_name == '{leaf.name}'").sample_id.tolist()
    if len(children_names) > 0:
        for child_name in children_names:
            leaf.add_child(name=child_name, dist=0.00001)
        num_added_nodes += len(children_names)
    else:
        print(f"no child for leaf {leaf.name}")
        leaf.dist = leaf.dist + 0.00001
    
print(f"# added nodes = {num_added_nodes:,}")

# added nodes = 3,470


In [25]:
print(f"# samples in data = {final_features_with_classification.shape[0]:,}")
print(f"# leaves in tree = {len(pruned_allmb_tree.get_leaf_names()):,}")

# samples in data = 3,470
# leaves in tree = 3,470


In [26]:
final_features_with_classification["network_id"] = final_features_with_classification.network_type.astype(str) + "_" + final_features_with_classification.network.astype(str)

In [27]:
final_features_with_classification.to_csv(processed_data_path)
pruned_allmb_tree.write(outfile=processed_tree_path)

In [28]:
species = final_features_with_classification.mapped_name.str.replace("_"," ").unique()
pd.Series(species).to_csv(species_list_path)