You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# --- Quality control and taxonomy aggregation ---
# Remove low-quality samples: the 6 samples with the lowest total counts.
# ds_otus[None] is the original counts table used to build the Dataset.
idx_samples_poorqc = ds_otus[None].sum(axis=1).sort_values().index[:6]
with sy.Suppress():
    sy.visuals.plot_compositional(X=ds_otus[None], y=ds_otus.y, class_colors=colors_status.class_colors, show_annotations=idx_samples_poorqc, attr_type="OTU", show_ygrid=False, style="seaborn-white", background_color="white", title="Quality Control Samples")
y_phenotype = df_meta_samples["Nutritional_Status"].drop(idx_samples_poorqc)

# Add versions
# QC of samples
ds_otus.add_version("QC(samples)", ds_otus[None].drop(idx_samples_poorqc))
# QC of samples, remove OTUs in fewer than 13 samples
ds_otus.add_version(("QC(samples)", ("prevalence", 13)), data=sy.utils.filter_compositional(ds_otus["QC(samples)"], tol_prevalence=13))
# QC of samples, prevalence filter, CLR normalization
ds_otus.add_version(("QC(samples)", ("prevalence", 13), "clr"), data=sy.transmute.transform_clr(ds_otus[("QC(samples)", ("prevalence", 13))]))
# QC of samples, prevalence filter, CLR, then Z-score normalization for multimodal associations
ds_otus.add_version(("QC(samples)", ("prevalence", 13), "clr", "zscore"), data=sy.transmute.normalize(ds_otus[("QC(samples)", ("prevalence", 13), "clr")], "zscore", axis=0))
# Set default version
ds_otus.set_default(("QC(samples)", ("prevalence", 13)))

# Counts of OTUs at various taxonomy levels
taxonomy_counts = dict()
taxonomy_counts["OTU"] = ds_otus["QC(samples)"]
taxonomy_counts.update(
    dict(map(lambda x: (x, taxonomy_counts["OTU"].groupby(ds_otus.metadata_attributes[x], axis=1).sum()),
             ['Life', 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']))
)
print(*map(lambda x: "{} {}".format(x[0], x[1].shape), taxonomy_counts.items()), sep=" | ")

# Prevalence of OTUs
sy.visuals.plot_prevalence(taxonomy_counts["OTU"], attr_type="OTU", obsv_type="sample", title="OTU Prevalence", show_prevalence=[5, 6, 13, 20], figsize=(13, 3), title_kws={"fontsize": 15})

# Reset the colors after dropping the poor-QC samples
colors_status = sy.Chromatic.from_classes(df_meta_samples.drop(idx_samples_poorqc)["Nutritional_Status"], name="Nutritional_Status", class_type="status", obsv_type="sample", palette="Set2")
print("Current default: {} {}".format(ds_otus.X_version, ds_otus.X.shape))
ds_otus
OTU (107, 388) | Life (107, 1) | Domain (107, 1) | Phylum (107, 15) | Class (107, 28) | Order (107, 40) | Family (107, 63) | Genus (107, 164) | Species (107, 227)
Current default: ('QC(samples)', ('prevalence', 13)) (107, 155)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/compositional/compositional.py:193: UserWarning: N=1 zeros detected in `X`. Masking zeros as NaN and will default to nan-robust functions if 'mean' or 'median' were provided for centroid
warnings.warn("N={} zeros detected in `X`. Masking zeros as NaN and will default to nan-robust functions if 'mean' or 'median' were provided for centroid".format(n_zeros))
Dataset| Gut Microbiome | (113, 388)
====================================
obsv_type: sample
attr_type: OTU
metric_type: None
description: SILVA_123_SSURef_Nr99_tax_silva
datasets: [None, 'QC(samples)', ('QC(samples)', ('prevalence', 13)), ('QC(samples)', ('prevalence', 13), 'clr'), ('QC(samples)', ('prevalence', 13), 'clr', 'zscore')]
attribute_subsets: [None]
observation_subsets: [None]
metadata_observations: 13
metadata_attributes: 10
default: ('QC(samples)', ('prevalence', 13)) | (107, 155)
2021-10-25 20:26:16
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
warnings.warn(msg, FutureWarning)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/distributions.py:2056: FutureWarning: The `axis` variable is no longer used and will be removed. Instead, assign variables directly to `x` or `y`.
warnings.warn(msg, FutureWarning)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
warnings.warn(msg, FutureWarning)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/distributions.py:2056: FutureWarning: The `axis` variable is no longer used and will be removed. Instead, assign variables directly to `x` or `y`.
warnings.warn(msg, FutureWarning)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
warnings.warn(msg, FutureWarning)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/distributions.py:2056: FutureWarning: The `axis` variable is no longer used and will be removed. Instead, assign variables directly to `x` or `y`.
warnings.warn(msg, FutureWarning)
MAM 22.46150016784668
SAM 30.520000457763672
WN 25.36849975585938
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 6.4% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 6.4% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
167
Relative abundance of Family-level taxonomy
def differential_abundance(X: pd.DataFrame, y: pd.Series, reference_class=None, method="ALDEx2", return_type="dataframe", aldex2_kws=None, random_state=0):
    """
    Run differential abundance analysis via ALDEx2 (through rpy2).

    Parameters
    ----------
    X : pd.DataFrame
        Counts table (samples x features); transposed internally for ALDEx2.
    y : pd.Series
        Class labels indexed identically to `X`.
    reference_class : hashable, optional
        Control condition; required when `y` has more than 2 classes.
    method : str
        Only "ALDEx2" is supported.
    return_type : str
        "dataframe" (multi-index columns) or "dict" ({query_class: results});
        only relevant for the multiclass case.
    aldex2_kws : dict, optional
        Overrides for the ALDEx2 keyword defaults.
    random_state : int
        Seed passed to R's set.seed for reproducibility.
    """
    # Avoid mutable default argument
    if aldex2_kws is None:
        aldex2_kws = dict()
    # Assertions
    assert_acceptable_arguments(return_type, {"dict", "dataframe"})
    assert method == "ALDEx2", "Currently, only `method='ALDEx2'` is available"
    assert X.shape[0] == y.size, "X.shape[0] != y.size"
    assert np.all(X.index == y.index), "X.index != y.index"

    # Wrappers
    def run_aldex2(X, y, kws):
        # Convert pandas objects to R, run aldex, convert results back.
        r_X = pandas_to_rpy2(X.T)
        r_y = pandas_to_rpy2(y)
        results = aldex2.aldex(
            reads=r_X,
            conditions=r_y,
            **kws,  # BUG FIX: original ignored `kws` and used the enclosing `_aldex2_kws` directly
        )
        return rpy2_to_pandas(results)

    # ALDEx2
    if method == "ALDEx2":
        ro.r('set.seed')(random_state)
        # Package
        aldex2 = R_package_retrieve("ALDEx2")
        _aldex2_kws = dict(test="t", effect=True, mc_samples=128, denom="all")
        _aldex2_kws.update(aldex2_kws)
        # Multiclass: run each query class against the reference class
        classes = set(y.unique())
        if len(classes) > 2:
            assert reference_class is not None, "Please provide a `reference_class` control condition"
            assert reference_class in classes, "`reference_class={}` is not in `y`".format(reference_class)
            multiclass_results = dict()
            for query_class in sorted(classes - {reference_class}):
                # Subset the samples to include `query_class` and `reference_class`
                y_subset = y[y.map(lambda id_sample: id_sample in {query_class, reference_class})]
                X_subset = X.loc[y_subset.index]
                # Run ALDEx2
                multiclass_results[query_class] = run_aldex2(X=X_subset, y=y_subset, kws=_aldex2_kws)
            # Return a dictionary object {query_class: results}
            if return_type == "dict":
                return multiclass_results
            # Return a multiindex pd.DataFrame
            if return_type == "dataframe":
                dataframes = list()
                for id_class, df in multiclass_results.items():
                    df.columns = df.columns.map(lambda metric: (id_class, reference_class, metric))
                    df.columns.names = ["Treatment", "Reference", "Metric"]
                    dataframes.append(df)
                return pd.concat(dataframes, axis=1)
        # 2 classes
        else:
            return run_aldex2(X=X, y=y, kws=_aldex2_kws)
# Run ALDEx2 per taxonomy level (only "OTU" and "Family" pass the filter below)
aldex2_results = dict()
denom = "all"
for taxon_level, df in sy.pv(taxonomy_counts.items(), "ALDEx2", unit=" Taxonomy level"):
    if taxon_level not in {'Life', 'Domain', 'Phylum', 'Class', 'Order', 'Species', 'Genus'}:
        # Prevalence filter before differential abundance
        X = sy.utils.filter_compositional(df, tol_prevalence=13)
        with sy.utils.Suppress(show_stdout=True):
            aldex2_results[taxon_level] = differential_abundance(X=X, y=y_phenotype[X.index], reference_class="WN", aldex2_kws={"test": "t", "denom": denom})
Pruned 233 attributes to match components (X.columns)
Inferred association as `dissimilarity`
(<Figure size 576x576 with 1 Axes>,
<matplotlib.axes._subplots.AxesSubplot at 0x7fb0ccd82dc0>,
<matplotlib.collections.PathCollection at 0x7fb0ccdcbf10>)
SAM p = 4.6888e-19
WN p = 8.0032e-05
MAM p = 0.012502
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/distributions.py:2056: FutureWarning: The `axis` variable is no longer used and will be removed. Instead, assign variables directly to `x` or `y`.
warnings.warn(msg, FutureWarning)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/seaborn/distributions.py:2056: FutureWarning: The `axis` variable is no longer used and will be removed. Instead, assign variables directly to `x` or `y`.
warnings.warn(msg, FutureWarning)
1.4238907404295364e-05
<ipython-input-22-d0bbdb925446>:3: FutureWarning: Index.__and__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__and__. Use index.intersection(other) instead
for id_class, df in ds_pathogens["aggregated"][ds_pathogens["aggregated"].columns & ds_multimodal["residuals"].columns].groupby(y_phenotype):
# --- Sample-specific perturbation networks (SSPN) ---
# Map the current sample identifiers back to the originals used when the SSPN was first run
new_to_old = sy.utils.dict_reverse(old_to_new.to_dict())
_X = ds_multimodal["residuals"].copy()
_X.index = _X.index.map(lambda x: new_to_old[x])
_y = ds_multimodal.y.copy()
_y.index = _y.index.map(lambda x: new_to_old[x])
t = time.time()
sspn = enx.SampleSpecificPerturbationNetwork(name="v5", node_type="multimodal", edge_type="correlation", observation_type="visit", assert_symmetry=False)
sspn.fit(
    X=_X,
    y=_y,
    reference="WN_",
    metric="pearson",
    stats_summary=[np.nanmean],
    stats_tests=None,
    stats_summary_initial=[np.nanmean],
    copy_ensemble=True,
)
# Compare each sample-specific network to the reference ensemble
µ_reference = sspn.ensemble_reference_.stats_["nanmean"]
µ_query = pd.DataFrame({id_sample: sspn.ensembles_samplespecific_[id_sample].stats_["nanmean"] for id_sample in sspn.ensembles_samplespecific_}).T
X_edges = µ_query - µ_reference
# Remove edges that are empty (zero in every sample), then drop edges with any NaN
idx_edges_empty = (X_edges == 0).sum(axis=0)[lambda x: x == X_edges.shape[0]].index
X_edges = X_edges.drop(idx_edges_empty, axis=1).dropna(how="any", axis=1)
# Create dataset for perturbation profiles
index = sy.io.read_object("./Data/sspn.index.list.pkl")  # Match the index of when it was initially run
index = pd.Index(index).map(lambda id_sample: old_to_new[id_sample])
columns = sy.io.read_object("./Data/sspn.columns.list.pkl")  # Match the columns of when it was initially run
X_edges.index = X_edges.index.map(lambda id_sample: old_to_new[id_sample])
ds_perturbations = sy.Dataset(
    data=X_edges.loc[index, columns],
    metadata_observations=df_meta_samples.loc[index],
    metadata_target_field="Nutritional_Status",
    name="perturbation_matrix",
    obsv_type="sample",
    attr_type="edge",
    metric_type="perturbation",
)
# View the dataset
ds_perturbations
# Duration
print("SSPN Duration: {}".format(sy.utils.format_duration(t)))
# Let's check the clustering based on the perturbation profiles
# Pairwise Euclidean distance of the perturbations
df_dism = sy.symmetry.pairwise(ds_perturbations.X, metric="euclidean", axis=0)
# PCoA
title = "Unsupervised clustering of SSPNs (m= {} edges)".format(ds_perturbations.X.shape[1])
sy.ordination.PrincipalCoordinatesAnalysis(dism=df_dism).plot(c=colors_status.obsv_colors, title=title, legend=colors_status.class_colors)
# Hierarchical clustering
ahc = sy.Agglomerative(df_dism)
# NOTE(review): `y` here is a module-level variable — presumably the phenotype labels
# (cf. `y_phenotype` above); confirm it is not shadowed by an earlier cell.
ahc.add_secondary_class("Nutritional_Status", y[df_dism.index], class_colors=colors_status.class_colors)
ahc.add_track("WHZ", ds_perturbations.metadata_observations["WHZ"])
ahc.plot(title=title)
# Spoiler alert, doesn't look very good but just wait...
Inferred association as `dissimilarity`
(<Figure size 1512x360 with 3 Axes>,
[<matplotlib.axes._subplots.AxesSubplot at 0x7fb08fb8dd60>,
<matplotlib.axes._axes.Axes at 0x7fb08f9dcc10>,
<matplotlib.axes._axes.Axes at 0x7fad8b428520>])
Leveraging Clairvoyance feature selection for phenotype-discriminative community detection
# Determine data-driven hierarchical structure
# ds_perturbations.X is equivalent to ds_perturbations[None] because there's only one version
topology = sy.Topology(ds_perturbations.X.copy(), ds_perturbations.y.copy(), class_type="status", obsv_type="perturbation", attr_type="edge", class_colors=colors_status.class_colors)
fig, _ = topology.plot_topology(figsize=(5, 5))
fig.axes[-1].set_xlabel("Nutritional Status", fontsize=15, fontweight="bold")

# Set up data for running Clairvoyance feature selection
submodel_data = defaultdict(dict)
for submodel, y_submodel in topology.get_target_matrix().T.iterrows():
    y_submodel = y_submodel.dropna()
    X_submodel = ds_perturbations.X.loc[y_submodel.index]
    subjects = sorted(ds_perturbations.metadata_observations.loc[X_submodel.index, "SubjectID(Alias)"].unique())
    # Get LSOCV (leave-one-subject-out cross-validation) pairs
    cv_pairs = defaultdict(dict)
    for id_subject in subjects:
        mask = X_submodel.index.map(lambda id_visit: ds_perturbations.metadata_observations.loc[id_visit, "SubjectID(Alias)"] == id_subject)
        idx_te = np.where(mask)[0]
        idx_tr = np.delete(sy.utils.range_like(mask), idx_te)
        cv_pairs[id_subject]["Training"] = idx_tr.tolist()
        cv_pairs[id_subject]["Testing"] = idx_te.tolist()
        # Assert that each test set has samples from one subject.
        # This is set up so the training data has not seen this subject before.
        assert ds_perturbations.metadata_observations.loc[y_submodel.index[idx_te], "SubjectID(Alias)"].nunique() == 1, "{} is not homogenous for subject".format(id_subject)
    # Format dataframes
    cv_submodel = pd.DataFrame(cv_pairs).T.loc[:, ["Training", "Testing"]]
    cv_submodel.index.name = "VisitID"
    submodel_data[submodel]["cv"] = cv_submodel
    submodel_data[submodel]["X"] = X_submodel
    submodel_data[submodel]["y"] = y_submodel
    # # Serialize the dataframes
    # os.makedirs("./Data/modeling/", exist_ok=True)
    # sy.io.write_dataframe(X_submodel, "./Data/feature_selection/{}/X.pbz2".format(submodel))
    # sy.io.write_dataframe(y_submodel.to_frame(), "./Data/feature_selection/{}/y.pbz2".format(submodel))
    # sy.io.write_dataframe(cv_submodel, "./Data/feature_selection/{}/cv.tsv".format(submodel))
Bash script for running Clairvoyance on compute server
# Run Clairvoyance feature selection on the compute server.
source activate soothsayer_env
VERSION="v5.0"
# BUG FIX: bash requires whitespace before `#` for a trailing comment;
# `SUBMODEL="y1"# ...` would not parse as an assignment plus comment.
SUBMODEL="y1"  # The same command was run for all sub-models
NAME="${VERSION}_${SUBMODEL}"
X="X.pbz2"
y="y.pbz2"
python ~/soothsayer/standalone/soothsayer_clairvoyance.py \
    -X ${X} \
    -y ${y} \
    --cv 10 \
    --model_type logistic,tree \
    --n_iter 500 \
    --name ${NAME} \
    --min_threshold None,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9 \
    --percentiles 0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.91,0.92,0.93,0.94,0.95,0.96,0.97,0.98,0.99 \
    --method bruteforce \
    --early_stopping 100 \
    --out_dir clairvoyance_output \
    --attr_type edge \
    --class_type status \
    --n_jobs 48 \
    --save_model False
Notes about change in LogisticRegression default solver between v0.21-v0.22:
Clairvoyance was originally run on this dataset using scikit-learn v0.21, in which the default solver for LogisticRegression was liblinear.
This means the models were optimized using the liblinear solver and we need to manually set the solver when instantiating the models.
# Load in Clairvoyance data
clairvoyance_data = defaultdict(dict)
# Defaults of sklearn's LogisticRegression for reference (see note above about the v0.21 -> v0.22 solver change)
param_defaults = dict(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
for submodel in topology.submodels:  # ["y1", "y2"]
    path = "./Data/feature_selection/{}/v5.0_{}__synopsis/v5.0_{}__synopsis.tsv".format(submodel, submodel, submodel)
    # Load in the synopsis and evaluate the hyperparameter and edge_set columns because these are actually python objects
    df_synopsis = sy.io.read_dataframe(path, evaluate_columns=["hyperparameters", "edge_set"])
    # SECURITY NOTE: `eval` on file contents — only safe because these are locally generated, trusted files
    df_synopsis["edge_set"] = df_synopsis["edge_set"].map(lambda x: list(map(eval, x)))
    clairvoyance_data[submodel]["synopsis"] = df_synopsis
    # Determine best sub-model configurations
    best_params = sy.feature_extraction.get_best_model_from_algorithm(df_synopsis, into=dict)
    clairvoyance_data[submodel].update(best_params)
    # Get edge set
    edge_set = pd.Index(sorted(clairvoyance_data[submodel]["features"]))
    clairvoyance_data[submodel]["features"] = clairvoyance_data[submodel]["edges"] = edge_set
    # Get nodes from edge set
    node_set = pd.Index(sorted(sy.utils.flatten(edge_set, set)))
    clairvoyance_data[submodel]["nodes"] = node_set
    # Performance
    clf_submodel = clairvoyance_data[submodel]["clf"]
    clf_submodel.set_params(solver="liblinear")  # See note in above cell
    # Get training, testing, and cross-validation data
    X_submodel = submodel_data[submodel]["X"].loc[:, clairvoyance_data[submodel]["edges"]]
    y_submodel = submodel_data[submodel]["y"]
    cv_submodel = submodel_data[submodel]["cv"]
    # Cross-validate using LSOCV pairs
    for id_subject, (idx_tr, idx_te) in cv_submodel.iterrows():
        # Assert that each test set has samples from one subject.
        # This is set up so the training data has not seen this subject before.
        assert ds_perturbations.metadata_observations.loc[y_submodel.index[idx_te], "SubjectID(Alias)"].nunique() == 1, "{} is not homogenous for subject".format(id_subject)
    clairvoyance_data[submodel]["cross_validation"] = pd.Series(
        model_selection.cross_val_score(
            estimator=clf_submodel,
            X=X_submodel.values,
            y=y_submodel.values,
            cv=cv_submodel.values.tolist(),
            n_jobs=1,
        ),
        index=cv_submodel.index,
        name=submodel,
    )
    print("Sub-model({}) has LSOCV accuracy of {}".format(submodel, clairvoyance_data[submodel]["cross_validation"].mean()))
    # Fit on the full sub-model data
    clairvoyance_data[submodel]["clf"] = clf_submodel.fit(X_submodel, y_submodel)
Sub-model(y1) has LSOCV accuracy of 1.0
Sub-model(y2) has LSOCV accuracy of 1.0
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/soothsayer/feature_extraction/feature_extraction.py:66: UserWarning: Multiple instances with best accuracy, lowest sem, and number of features. Choosing first option.
warnings.warn(multiple_hits_message)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/soothsayer/feature_extraction/feature_extraction.py:74: UserWarning: Multiple instances with best accuracy, lowest sem, and number of features. Choosing first option.
warnings.warn(multiple_hits_message)
# Set up training data
X_training = ds_perturbations.X
Y_training = topology.get_target_matrix().loc[X_training.index]
y_training = topology.y[X_training.index]
# Create dataset
ds_training = sy.Dataset(
    data=X_training,
    metadata_observations=Y_training,
    name="HEC",
)
# Add sub-model subsets as versions
for submodel in topology.submodels:
    X_submodel = submodel_data[submodel]["X"]
    y_submodel = submodel_data[submodel]["y"]
    cv_submodel = submodel_data[submodel]["cv"]
    # Training data
    ds_training.add_version(
        name_version=submodel,
        data=X_submodel,
        X=X_submodel,
        y=y_submodel,
        cv_labels=cv_submodel.index,
        cv=cv_submodel.values.tolist(),
        clf=clairvoyance_data[submodel]["clf"],
        features=clairvoyance_data[submodel]["features"],
        edges=clairvoyance_data[submodel]["edges"],
        nodes=clairvoyance_data[submodel]["nodes"],
        hyperparameters=clairvoyance_data[submodel]["hyperparameters"],
    )
# Create the HEC (hierarchical ensemble of classifiers) model
model = HEC(
    name="v5.0",
    attr_type="edge",
    obsv_type="sample",
    class_type="nutritional_status",
    verbose=True,
)
# Add paths in the graph (reuse `paths` rather than calling get_paths() twice)
paths = topology.get_paths()
model.add_paths(paths)
# -------------------------------------------------
# Model fitting and cross-validation [Full Dataset]
# -------------------------------------------------
# Add submodels to HEC
for submodel in topology.submodels:
    # Training data
    X_submodel = ds_training.get_dataset_field(submodel, "X")
    y_submodel = ds_training.get_dataset_field(submodel, "y")
    # Clairvoyance-optimized edge sets (kept agnostic to biological concepts at this point)
    features = ds_training.get_dataset_field(submodel, "features")
    # Logistic Regression sub-models with hyperparameters defined earlier
    clf = ds_training.get_dataset_field(submodel, "clf")
    # Custom cross-validation pairs
    cv_labels = ds_training.get_dataset_field(submodel, "cv_labels")
    cv = ds_training.get_dataset_field(submodel, "cv")
    index = X_submodel.index
    # Add sub-model
    model.add_submodel(name=submodel, clf=clf, attributes=features.tolist(), cv=cv, index=index, cv_labels=cv_labels)
    # Fit sub-model
    model.get_classifier(submodel).fit(X_submodel.loc[:, features], y_submodel)
# Fit the model using the attribute and target matrices
model.fit(X_training, Y_training)
# Evaluate the model
print("HEC model LSOCV accuracy", model.cross_validate(X_training, Y_training).mean())
# # Save model
# model.save_model(path="./Data/HEC.pkl")
# # Load model
# model = sy.io.read_object("./Data/HEC.pkl")
Cross-validating: 34it [00:00, 37.70it/s]
HEC model LSOCV accuracy 1.0
# Manually cross-validate the HEC model
root_submodel = model.root_submodel
# Using `y1` because it is the root decision that all of the data passes through in the HEC
assert root_submodel == "y1", "`root_submodel` should be 'y1'"
Y_hats = list()
for (cross_validation_label, (idx_tr, idx_te)) in tqdm(zip(ds_training.get_dataset_field(root_submodel, "cv_labels"), ds_training.get_dataset_field(root_submodel, "cv")), "Manual cross-validation"):
    # Partition training/testing sets
    X_tr = X_training.iloc[idx_tr, :]
    X_te = X_training.iloc[idx_te, :]
    Y_tr = Y_training.iloc[idx_tr, :]
    Y_te = Y_training.iloc[idx_te, :]
    # Fit model; use `ignore_submodel_index` when manually cross-validating
    model.fit(X_tr, Y_tr, ignore_submodel_index=True)
    # Get probabilities per sub-model
    Y_hat = model.predict_proba(X_te)
    Y_hats.append(Y_hat)
Y_hats = pd.concat(Y_hats)
Manual cross-validation: 34it [00:00, 36.55it/s]
model.get_classifier("y1").classes_
array(['WN', 'y2'], dtype=object)
Interpreting the HEC model
# Extract fitted edge weights per sub-model
df_AN_connectivity__edges = dict()
for submodel in model.submodels:
    edge_weights = pd.Series(clairvoyance_data[submodel]["clf"].coef_.ravel(), index=clairvoyance_data[submodel]["features"], name=submodel)
    # Note: when not manually cross-validating submodels within the model object, this is equivalent to:
    # pd.Series(model.get_classifier(submodel).coef_.flatten(), index=model.get_attributes(submodel), name=submodel)
    df_AN_connectivity__edges[(submodel, "Weight")] = edge_weights.to_dict()
    df_AN_connectivity__edges[(submodel, "|Weight|")] = edge_weights.abs().to_dict()
# View the weights
df_AN_connectivity__edges = pd.DataFrame(df_AN_connectivity__edges)
df_AN_connectivity__edges.columns = pd.MultiIndex.from_tuples(df_AN_connectivity__edges.columns, names=["Sub-model", "Signed"])
df_AN_connectivity__edges.head()
Inferred association as `dissimilarity`
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/soothsayer/ordination/ordination.py:74: UserWarning: Using `show_features` to create a `biplot` is experimental.
warnings.warn("Using `show_features` to create a `biplot` is experimental.")
Selected 4 features:
[('y1', 'WN'), ('y1', 'y2'), ('y2', 'MAM'), ('y2', 'SAM')]
(<Figure size 576x360 with 1 Axes>,
<matplotlib.axes._subplots.AxesSubplot at 0x7fb08ee33a90>,
<matplotlib.collections.PathCollection at 0x7fb08eded100>)
<ipython-input-37-0fd3101bf651>:12: FutureWarning: Index.__or__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__or__. Use index.union(other) instead
for node in sy.utils.flatten(clairvoyance_data["y1"]["edges"] | clairvoyance_data["y2"]["edges"], set):
(147, 4) (265, 4)
# Hive plots
aggregate_networks = dict()
for submodel in model.submodels:
    # Create a fully-connected network to use as input for hive plots
    sym = hx.Symmetric(df_AN_connectivity__edges[(submodel, "Weight")])
    # Fill the empty edge weights with zeros
    sym.weights = sym.weights.fillna(0)
    aggregate_networks[submodel] = sym
    # Get colors for edges based on sign
    edge_colors = sym.weights.map(lambda x: {True: sy.utils.COLOR_POSITIVE, False: sy.utils.COLOR_NEGATIVE}[x > 0])
    # Create Hive
    hive = hx.Hive(sym, name=submodel)
    # Get non-microbiome nodes to label (there are too many OTUs to label)
    show_node_labels = sy.utils.flatten(sym.weights.index[sym.weights.index.map(lambda edge: list(map(lambda node: y_modality[node], edge)).count("microbiome") < 1)], set)
    # Add each modality as an axis
    for modality, index in sorted(sy.utils.pd_series_collapse(y_modality, type_collection=pd.Index).items(), key=lambda x: {"microbiome": 0, "clinical": 1, "pathogen": 2}[x[0]]):
        # Use Index.intersection instead of the deprecated `&` set operation (see FutureWarning)
        nodes = sorted(index.intersection(sym.nodes))
        hive.add_axis(name_axis=modality, nodes=nodes, colors=colors_modality.class_colors[modality], split_axis=True)
    hive.compile(inner_radius=100)
    # Plot the hive
    hive.plot(style="white", edge_colors=edge_colors, func_edgeweight=lambda w: w * 10, show_node_labels=show_node_labels, figsize=(10, 10), pad_axis_label=-150, title="AN$_{%s}$" % (submodel))
<ipython-input-38-535c33320612>:17: FutureWarning: Index.__and__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__and__. Use index.intersection(other) instead
nodes = sorted(index & sym.nodes)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/hive_networkx/hive_networkx.py:765: FutureWarning: Index.__and__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__and__. Use index.intersection(other) instead
index_labels = pd.Index(show_node_labels) & index_labels
<ipython-input-38-535c33320612>:17: FutureWarning: Index.__and__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__and__. Use index.intersection(other) instead
nodes = sorted(index & sym.nodes)
/Users/jespinoz/anaconda3/envs/soothsayer_py3.8_env/lib/python3.8/site-packages/hive_networkx/hive_networkx.py:765: FutureWarning: Index.__and__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__and__. Use index.intersection(other) instead
index_labels = pd.Index(show_node_labels) & index_labels
# 3D surface of the recovery score function
cmap = plt.cm.cubehelix_r
with plt.style.context("seaborn-white"):
    fig = plt.figure(figsize=(8, 8))
    # `fig.gca(projection='3d')` is deprecated (removed in matplotlib >= 3.6)
    ax = fig.add_subplot(projection='3d')
    # Make data
    steps = 0.1618
    X = np.arange(-2, 2, steps)
    Y = np.arange(-2, 2, steps)
    X, Y = np.meshgrid(X, Y)
    Z = recovery_score(X, Y)
    # Plot the surface
    surf = ax.plot_surface(X, Y, Z, cmap=cmap, rstride=1, cstride=1,
                           linewidth=0, antialiased=True, shade=True, vmin=0, vmax=1)
    # Customize the axes
    ax.set_xlabel("$y$ ($t_{n+1}$ | WN)", fontsize=15)
    ax.set_ylabel("$x$ ($t_{n}$ | MAM or SAM)", fontsize=15)
    ax.zaxis.set_rotate_label(False)
    ax.set_zlabel("Recovery Score [ $r$ ]", fontsize=15, rotation=90)
    # ax.plot(xs=[-2,2], ys=[-2,2], zs=[0,0], color="black", linewidth=1, alpha=0.618)
    # ax.zaxis.set_major_locator(LinearLocator(10))
    # ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)
    ax.set_zlim(0, 1.0)
    ax.view_init(15, 60)
    # Add a color bar which maps values to colors.
    # fig.colorbar(surf, aspect=5, shrink=0.5)
# 2D contour of the recovery score function
with plt.style.context("seaborn-white"):
    fig, ax = plt.subplots(figsize=(5, 5))
    # Make data
    X = np.arange(-2, 2, 0.25)
    Y = np.arange(-2, 2, 0.25)
    X, Y = np.meshgrid(X, Y)
    Z = recovery_score(X, Y)
    # Plot the filled contour.
    # Removed `rstride`, `cstride`, `shade`, and `linewidth`: contourf does not
    # accept them (matplotlib warned "kwargs were not used by contour").
    contour = ax.contourf(X, Y, Z, cmap=cmap, levels=50, antialiased=True, vmin=0, vmax=1)
    ax.set_xlabel("$y$ ($t_{n+1}$ | WN)", fontsize=15)
    ax.set_ylabel("$x$ ($t_{n}$ | MAM or SAM)", fontsize=15)
    ax.xaxis.grid(True)
    ax.yaxis.grid(True)
    # ax.axvline(0, color="black", linewidth=1, alpha=0.618)
    # ax.axhline(0, color="black")
    # Add a color bar which maps values to colors.
    sy.utils.add_cbar_from_data(fig, cmap=cmap, vmin=0, vmax=1, cbar_pos=[0.925, 0.1, 0.05, 0.8], label="Recovery Score [ $r$ ]", cbar_kws={"fontsize": 15})
<ipython-input-44-31f9044cbf1e>:12: UserWarning: The following kwargs were not used by contour: 'rstride', 'cstride', 'shade', 'linewidth'
contour = ax.contourf(X, Y, Z, cmap=cmap,rstride=1, cstride=1, shade=True, levels=50,