In [1]:
import os
import xlrd
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from itertools import product
from collections import Counter
from scipy.spatial import distance
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from statsmodels.stats import multitest
from meta.scripts.Utilities import Utilities
from sklearn.preprocessing import StandardScaler

In [2]:
# Significance threshold: single-test p-values are compared against it, and it is also 
# passed as `alpha` to the multiple-testing correction
SINGLE_COMPARISON_ALPHA = 0.05

In [3]:
def sanitize_series(s: pd.Series):
    """
    Keep only the strictly positive entries of a series, largest first

    :param s: Numeric series
    :return: Series without the non-positive and missing entries, sorted in descending order
    """
    masked = s.where(s > 0)
    positive_only = masked.dropna()
    return positive_only.sort_values(ascending=False)

def dict_list_to_dataframe(list_: list, key_column_name: str, value_column_name: str, 
                           sort: bool = False):
    """
    Turn a list of dictionaries into a data frame indexed by one of the keys

    :param list_: List of dictionaries sharing the same keys
    :param key_column_name: Key that becomes the (sorted) index
    :param value_column_name: Column used for ordering when `sort` is set
    :param sort: If True, order the rows by `value_column_name`, largest first
    :return: Data frame whose columns axis is named "values"
    """
    frame = pd.DataFrame(list_)
    indexed = frame.set_index(key_column_name).sort_index()
    indexed.columns.name = "values"
    if not sort:
        return indexed
    return indexed.sort_values(value_column_name, ascending=False)

def get_prevalents_dict(two_dim_array: list, names: list):
    """
    Describe which of the compared groups prevails, by sum and by median

    :param two_dim_array: 2D array, one list of values per group
    :param names: Group names, in the same order as the lists
    :return: Dictionary holding the name of the prevalent group for each statistic and 
    flags telling whether the statistic is positive in every group
    """
    number_by_sum = get_prevalent_list_number(two_dim_array, func=sum)
    number_by_median = get_prevalent_list_number(two_dim_array, func=np.median)
    out = {
        "is_common_by_sum": False,
        "is_common_by_median": False,
        "prevalent_by_sum": names[number_by_sum],
        "prevalent_by_median": names[number_by_median],
    }
    for key, func in (("is_common_by_sum", sum), ("is_common_by_median", np.median)):
        out[key] = all(value > 0 for value in map(func, two_dim_array))
    return out

def get_u_test_dict(two_dim_array: list):
    """
    Compare two groups of values with the two-sided Mann-Whitney U test

    :param two_dim_array: 2D array holding exactly two lists of values, one per group
    :return: Dictionary with the p-value and a flag telling whether it is below 
    SINGLE_COMPARISON_ALPHA
    """
    # Without any positive value there is nothing to compare: report "no difference"
    p_value = 1
    if sum(map(sum, two_dim_array)) > 0:
        try:
            # The alternative is spelled out because the default of this argument is not 
            # the same in all SciPy releases, which would silently change the p-values
            p_value = stats.mannwhitneyu(*two_dim_array, alternative="two-sided").pvalue
        except ValueError:
            # Some SciPy releases refuse degenerate input (e.g. all values identical); 
            # such a comparison is reported like the all-zero one
            p_value = 1
    return dict(
        p_value=p_value, 
        is_significant_for_single_comparison=bool(p_value < SINGLE_COMPARISON_ALPHA))

def get_distances_dict(two_dim_array: list):
    """
    Compute several distances between two vectors

    :param two_dim_array: 2D array holding exactly two vectors of the same length
    :return: Dictionary {metric name: distance}
    """
    metrics = (
        ("bray_curtis_dissimilarity", distance.braycurtis),
        ("manhattan_distance", distance.cityblock),
        ("euclidean_distance", distance.euclidean),
    )
    return {metric_name: metric(*two_dim_array) for metric_name, metric in metrics}

def get_multi_test_df(pvals: list, names: list):
    """
    Correct a family of p-values for multiple testing (statsmodels method "fdr_bh")

    :param pvals: p-values of the single comparisons
    :param names: Labels of the comparisons, they become the index of the result
    :return: Data frame with one row per comparison: the rejection flag, the corrected 
    p-value and the corrected alphas reported by statsmodels
    """
    correction = multitest.multipletests(
        pvals, alpha=SINGLE_COMPARISON_ALPHA, method="fdr_bh")
    reject, pvals_corrected, alpha_sidak, alpha_bonf = correction
    row_labels = ("is_significant_for_multiple_comparison", "corrected_p_value")
    wide_df = pd.DataFrame([reject, pvals_corrected], index=row_labels, columns=names)
    out = wide_df.rename_axis(index="multi_test").transpose()
    out["corrected_alpha_for_Sidak_method"] = alpha_sidak
    out["corrected_alpha_for_Bonferroni_method"] = alpha_bonf
    return out

def pair_flat_list(list_: list):
    """
    Build all unordered pairs of distinct items

    :param list_: Flat collection of hashable, mutually comparable items; duplicates 
    are ignored
    :return: List of two-item lists, each sorted ascending, in lexicographic order
    """
    unique_items = sorted(set(list_))
    # Every item is paired only with the items that follow it, so each pair is produced 
    # exactly once and no membership scan over the result is needed
    return [[first, second] for position, first in enumerate(unique_items) 
            for second in unique_items[position + 1:]]

def get_prevalent_list_number(two_dim_array: list, func):
    """
    Find the list whose aggregated value is the largest

    :param two_dim_array: 2D array
    :param func: Aggregating function applied to every list, e.g. sum() or numpy.median()
    :return: Zero-based number of the list with the largest result (the first one on ties)
    """
    scores = list(map(func, two_dim_array))
    return max(range(len(scores)), key=scores.__getitem__)

def get_counter_df(list_: list):
    """
    Count how many times every item occurs

    :param list_: Iterable of hashable items
    :return: Data frame indexed by "words" with the integer column "occurrences", 
    the most frequent items first
    """
    ranked_counts = Counter(list_).most_common()
    frame = pd.DataFrame(ranked_counts, columns=["words", "occurrences"])
    frame["occurrences"] = frame["occurrences"].astype(int)
    return frame.set_index("words")

def zfill_list(list_: list):
    """
    Left-pad the string forms of the items with zeros to a common width

    :param list_: Items of any type, they are converted with str()
    :return: List of strings, all as long as the longest one; empty for empty input
    """
    strings = [str(i) for i in list_]
    # The target width is the same for every item, so it is computed only once
    width = max(map(len, strings), default=0)
    return [i.zfill(width) for i in strings]


In [4]:
output_dir = "/data1/bio/projects/tgrigoreva/stool_to_blood"
table_file = os.path.join(output_dir, "stool_blood_paired_only.xlsx")
# No `encoding` keyword is passed: it has no meaning for Excel workbooks and recent 
# pandas releases reject it in read_excel()
table_df = pd.read_excel(table_file)
# Keep only the columns with a non-blank header that does not contain "unnamed"
table_df = table_df.loc[:, list(filter(lambda x: len(x.strip()) > 0 and "unnamed" not in x.lower(), 
                                       table_df.columns))]
# "Диагноз" is split on underscores: the first field is used as the age group and the 
# last one as the diagnosis
table_df["patient_ages"] = table_df["Диагноз"].apply(lambda x: x.split("_")[0])
table_df["patient_diagnoses"] = table_df["Диагноз"].apply(lambda x: x.split("_")[-1])
# The abundance columns are recognized by the prefix of their header
taxonomy_columns = [i for i in table_df.columns if any(i.startswith(j) for j in ("k_", "Unassigned"))]

# Manual override of the sample source of sample "166b"
table_df.loc[table_df["ID"] == "166b", "type"] = "blood"
# Replace the Cyrillic labels with English ones and switch to the column names used below
table_df = table_df.replace({"ЯК": "colitis", "здоровый": "normal", "ожирение": "obesity", 
                             "Ребенок": "child", "Взрослый": "adult"}).rename(
    columns={"ID": "sample_names", "type": "sample_sources", "N_human": "patient_ids"}).drop(
    "Диагноз", axis=1)
# Patient identifiers become zero-padded strings of a common width
table_df["patient_ids"] = zfill_list(table_df["patient_ids"].values.tolist())
table_index_levels = ["sample_names", "sample_sources", "patient_ids", "patient_diagnoses", 
                      "patient_ages"]
# Wide table: indexed by the sample and patient descriptors, one column per taxon
table_df = table_df.set_index(table_index_levels).loc[:, taxonomy_columns].rename_axis(
    columns="taxa")
# Long table: one row per sample and taxon, restricted to the positive abundances
table_ds = pd.melt(table_df.reset_index(), id_vars=table_index_levels, value_vars=taxonomy_columns, 
                   value_name="relative_abundances")
table_ds = table_ds.loc[table_ds["relative_abundances"] > 0]

In [5]:
# Sorted unique values of every index level, in the order of `table_index_levels`
(table_sample_names, table_sample_sources, table_patient_ids, table_diagnoses, 
 table_ages) = map(lambda level: sorted(set(table_df.index.get_level_values(level))), 
                   table_index_levels)
# All unordered pairs of distinct diagnoses
diagnosis_group_pairs = pair_flat_list(table_diagnoses)

In [6]:

class DataHolder:
    """
    Holds per-group data, a group being one combination of patient age, diagnosis and 
    sample source
    """
    def __init__(self, age: str, diagnosis: str, sample_source: str):
        """
        :param age: Value of "patient_ages" selecting the group
        :param diagnosis: Value of "patient_diagnoses" selecting the group
        :param sample_source: Value of "sample_sources" selecting the group
        """
        self.age = age
        self.diagnosis = diagnosis
        self.sample_source = sample_source

        # Rows of the melted data set belonging to this group
        self.raw_ds = table_ds.query(
            "patient_ages == '{}' and patient_diagnoses == '{}' and sample_sources == '{}'".format(
                self.age, self.diagnosis, self.sample_source))
        # Number of rows of the group in which every taxon occurs
        self.common_taxonomy_counter_df = get_counter_df(self.raw_ds["taxa"])

        # Payload for PCA plot: edge color encodes the sample source, marker shape the 
        # diagnosis, and the markers of children are left unfilled
        if self.sample_source.strip() == "stool":
            self.edgecolors = "blue"
        else:
            self.edgecolors = "red"
        if self.diagnosis.strip() == "obesity":
            self.marker = "s"
        elif self.diagnosis.strip() == "colitis":
            self.marker = "^"
        else:
            self.marker = "o"
        if self.age.strip() == "child":
            self.facecolors = "none"
        else:
            self.facecolors = self.edgecolors

        # More payload: samples in rows, taxa in columns, absent taxa counted as 0
        self.taxonomy_df = self.raw_ds.pivot(
            index="sample_names", columns="taxa", values="relative_abundances").dropna(
            axis=1, how="all").fillna(0)
        self.sample_names = sorted(set(self.taxonomy_df.index.values))

        # Per-taxon statistics over the samples, only the positive ones, largest first
        self.mean_df = pd.DataFrame(sanitize_series(
            self.taxonomy_df.mean(axis=0).rename("mean"))).rename_axis(index="taxa")
        self.median_df = pd.DataFrame(sanitize_series(
            self.taxonomy_df.median(axis=0).rename("median"))).rename_axis(index="taxa")
        self.sum_df = pd.DataFrame(sanitize_series(
            self.taxonomy_df.sum(axis=0).rename("sum"))).rename_axis(index="taxa")

        self.out_dir = os.path.join(output_dir, "single", str(self))

    @staticmethod
    def series_to_df(series: pd.Series, key_column_name: str, value_column_name: str):
        """
        Convert the positive part of a series into a two-column style data frame

        :param series: Numeric series
        :param key_column_name: Name given to the index holding the series labels
        :param value_column_name: Name given to the column holding the series values
        :return: Data frame sorted by value, largest first
        """
        d = sanitize_series(series)
        # Iterating over a Series yields its values, so the labels are taken via items()
        lst_ = [{key_column_name: k, value_column_name: v} for k, v in d.items()]
        return dict_list_to_dataframe(lst_, key_column_name, value_column_name, sort=True)

    def __str__(self):
        return "[{}]".format(", ".join([self.age, self.diagnosis, self.sample_source]))

    def __repr__(self):
        return "<DataHolder({}) at {}, contains {} samples and {} taxa>".format(
            str(self), hex(id(self)), *self.taxonomy_df.shape)

    def __len__(self):
        # Number of samples in the group
        return len(self.sample_names)

    # Ordering, equality and hashing are all based on the string form of the group
    def __lt__(self, other):
        return str(self) < str(other)

    def __eq__(self, other):
        return other is not None and str(self) == str(other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(str(self))

    def finalize_df(self, df: pd.DataFrame):
        """
        :param df: Any data frame
        :return: Copy of the data frame with constant columns describing the group
        """
        out_df = df.copy()
        for col_name, value in zip(["age", "diagnosis", "sample_source"], 
                                   [self.age, self.diagnosis, self.sample_source]):
            out_df[col_name] = value
        return out_df

    def export(self):
        """
        Dump the non-empty tables of the group as TSV files into the group directory
        """
        out = {"positive medians for {}.tsv": self.median_df, 
               "positive sums for {}.tsv": self.sum_df,
               "pivot taxa for {}.tsv": self.taxonomy_df,
               "melted per-sample taxa for {}.tsv": self.raw_ds,
               "per-sample taxa counter for {}.tsv": self.common_taxonomy_counter_df}
        for k, v in out.items():
            if len(v.index.values) > 0:
                Utilities.dump_tsv(v.reset_index(), os.path.join(self.out_dir, k.format(str(self))))


# One holder per combination of age, diagnosis and sample source
data_holders = [
    DataHolder(age, diagnosis, sample_source) 
    for age, diagnosis, sample_source in product(
        table_ages, table_diagnoses, table_sample_sources)]

In [7]:
# Taxa with a positive abundance in at least one sample. The melted data set repeats a 
# taxon in every row where it occurs, so the labels are de-duplicated first: otherwise the 
# same column would be selected many times
whole_common_taxonomy_df = table_df.loc[:, table_ds["taxa"].unique()]
# Standardized abundances, indexed by the sample name only
whole_scaled_common_taxonomy_df = pd.DataFrame(
    StandardScaler().fit_transform(whole_common_taxonomy_df), 
    columns=whole_common_taxonomy_df.columns, 
    index=whole_common_taxonomy_df.index.get_level_values("sample_names"))
whole_common_taxonomy_dir = os.path.join(output_dir, "common")

for common_df, common_file_name in zip(
        [table_ds, whole_scaled_common_taxonomy_df.reset_index()], 
        ["whole_dataset.tsv", "whole_scaled_common_taxonomy.tsv"]):
    Utilities.dump_tsv(common_df, os.path.join(whole_common_taxonomy_dir, common_file_name))

# Projection of all samples onto the first two principal components
whole_common_pca_df = pd.DataFrame(PCA(n_components=2).fit_transform(
    whole_scaled_common_taxonomy_df), columns=["PCA {}".format(i) for i in range(1, 3)], 
    index=whole_scaled_common_taxonomy_df.index)
common_pca_dir = os.path.join(whole_common_taxonomy_dir, "pca")

Utilities.dump_tsv(whole_common_pca_df.reset_index(), 
                   os.path.join(common_pca_dir, "whole_common_pca.tsv"))

In [8]:
plt.rcParams["figure.figsize"] = (28, 20)
fig, ax = plt.subplots()
ax.set_xlabel("Principal Component 1", fontsize = 15)
ax.set_ylabel("Principal Component 2", fontsize = 15)
ax.set_title("2-component PCA", fontsize = 20)

# Every group is exported and then drawn with its own marker style
for data_holder in data_holders:
    data_holder.export()
    # The components are selected by column label, which gives 1-D arrays: every 
    # annotation below then receives scalar coordinates
    pca_x = whole_common_pca_df.loc[data_holder.sample_names, 
                                    whole_common_pca_df.columns[0]].values
    pca_y = whole_common_pca_df.loc[data_holder.sample_names, 
                                    whole_common_pca_df.columns[1]].values
    ax.scatter(facecolors=data_holder.facecolors, edgecolors=data_holder.edgecolors, 
               marker=data_holder.marker, label=str(data_holder), x=pca_x, y=pca_y)
    # Label every point with its sample name
    for idx, txt in enumerate(data_holder.sample_names):
        ax.annotate(txt, (pca_x[idx], pca_y[idx]), fontsize="xx-small")

ax.legend()
ax.grid()
plt.tight_layout()
plt.savefig(os.path.join(common_pca_dir, "whole_common_pca.png"), dpi=300)
plt.clf()
plt.close()

In [9]:
x_col_name = "volume"
y_col_name = "frequency"
for data_holder in data_holders:
    # Per taxon: summed abundance ("volume") against the occurrence count ("frequency")
    combined_sum_occurrences_df = pd.concat(
        [data_holder.sum_df, data_holder.common_taxonomy_counter_df], axis=1, sort=False).rename(
        columns={"sum": x_col_name, "occurrences": y_col_name}).rename_axis(index="taxa").fillna(0)
    img_title = "Regression plot between taxon occurrence {} and {} for {}".format(
        x_col_name, y_col_name, str(data_holder))
    Utilities.dump_tsv(combined_sum_occurrences_df, os.path.join(
        data_holder.out_dir, "regressions", "{}.tsv".format(img_title)))
    sns.set()
    plt.rcParams["figure.figsize"] = (28, 20)
    ax = sns.regplot(data=combined_sum_occurrences_df, x=x_col_name, y=y_col_name, fit_reg=False)
    _ = ax.set_title(img_title, fontsize = 20)
    # Label every point with its taxon. The raw values are iterated instead of indexing 
    # the label-indexed columns with integers, which relied on a positional fallback
    for taxon, x_value, y_value in zip(combined_sum_occurrences_df.index.values, 
                                       combined_sum_occurrences_df[x_col_name].values, 
                                       combined_sum_occurrences_df[y_col_name].values):
        ax.text(x_value, y_value, taxon, 
                fontweight="regular", horizontalalignment="left", color="black", fontsize=3, 
                rotation=-30, rotation_mode="anchor")
    plt.savefig(os.path.join(data_holder.out_dir, "regressions", "{}.png".format(img_title)), 
                dpi=300)
    plt.clf()
    plt.close()

In [10]:
# For every per-taxon statistic: Spearman correlation between all groups, dumped as a 
# table and drawn as a clustered heat map
for whole_group_metrics in ("mean", "median", "sum"):
    whole_group_correlation_df = pd.concat(
        [getattr(holder, "{}_df".format(whole_group_metrics)).rename(
            columns={whole_group_metrics: str(holder)}) for holder in data_holders], 
        axis=1, sort=False).fillna(0).corr(method="spearman")
    pair_correlation_title = "Whole group correlation for {}".format(whole_group_metrics)
    whole_group_correlation_dir = os.path.join(
        whole_common_taxonomy_dir, "correlation", whole_group_metrics)
    Utilities.dump_tsv(
        whole_group_correlation_df, 
        os.path.join(whole_group_correlation_dir, "{}.tsv".format(pair_correlation_title)))

    sns.set()
    plt.rcParams["figure.figsize"] = (10, 10)
    cg = sns.clustermap(whole_group_correlation_df, metric="cityblock").fig.suptitle(
        pair_correlation_title, fontsize=10, y=0.995)
    plt.tight_layout()

    plt.savefig(
        os.path.join(whole_group_correlation_dir, "{}.png".format(pair_correlation_title)), 
        dpi=300)
    plt.clf()
    plt.close()

In [11]:
class DataHolderPair:
    """
    Holds super-group data as DataHolders with the common diagnosis and age values
    """
    def __init__(self, dh1: "DataHolder", dh2: "DataHolder"):
        """
        :param dh1: First group of the pair
        :param dh2: Second group of the pair, must differ from the first one
        """
        assert dh1 != dh2
        self.data_holder_1, self.data_holder_2 = (dh1, dh2)

        # Per-taxon medians and sums of both groups side by side, one column per group
        self.median_df = pd.concat([i.median_df.rename(
            columns={"median": str(i)}) for i in list(self)], axis=1, sort=False).rename_axis(
            index="taxa", columns="medians")
        self.sum_df = pd.concat([i.sum_df.rename(
            columns={"sum": str(i)}) for i in list(self)], axis=1, sort=False).rename_axis(
            index="taxa", columns="sums")

        # Samples of both groups stacked on top of each other
        self.pair_taxonomy_ds = pd.concat([i.taxonomy_df for i in list(self)], axis=0, sort=False)
        self.pair_names = [str(i) for i in list(self)]
        # Taxa having a sum in both groups
        self.pair_common_taxa = sorted(set(self.sum_df.dropna(axis=0).index.values))
        u_test_columns = []
        for pair_common_taxon in self.pair_common_taxa:
            pair_common_2d_array = [
                self.pair_taxonomy_ds.loc[i.sample_names, pair_common_taxon].values 
                for i in list(self)]
            u_test_columns.append(pd.Series(dict(
                **get_prevalents_dict(pair_common_2d_array, self.pair_names), 
                **get_u_test_dict(pair_common_2d_array))).rename(pair_common_taxon))
        # The per-taxon results are joined once instead of growing a frame inside the loop
        if len(u_test_columns) > 0:
            self.u_test_df = pd.concat(u_test_columns, axis=1, sort=False)
        else:
            self.u_test_df = pd.DataFrame()
        self.u_test_df = self.u_test_df.transpose().rename_axis(index="taxa", columns="u-test")
        # Distances between the two groups, computed on the medians and on the sums
        self.distance_df = pd.concat([pd.Series(
            get_distances_dict(i.fillna(0).transpose().values)).rename(j) for i, j in zip(
            [self.median_df, self.sum_df], ["median", "sum"])], axis=1, sort=False)
        self.out_dir = os.path.join(output_dir, "paired", str(self))

    def export(self):
        """
        Dump the tables of the pair as TSV files into the pair directory
        """
        Utilities.dump_tsv(self.median_df.reset_index(), 
                           os.path.join(self.out_dir, "raw medians for {}.tsv".format(str(self))))
        Utilities.dump_tsv(self.sum_df.reset_index(), 
                           os.path.join(self.out_dir, "raw sums for {}.tsv".format(str(self))))
        Utilities.dump_tsv(self.u_test_df.reset_index(), 
                           os.path.join(self.out_dir, "single u-test for {}.tsv".format(str(self))))

    def has_common_property(self, prop: str, value: str):
        """
        :param prop: One of "age", "diagnosis", "sample_source"
        :param value: Expected value of the property
        :return: True if both holders have this value of the property
        """
        assert prop in ("age", "diagnosis", "sample_source")
        _values = list(set([getattr(i, prop) for i in list(self)]))
        return len(_values) == 1 and _values[0] == value

    def has_common_props(self, props: dict):
        """
        :param props: dict {prop1: val1, prop2: val2...}
        :return: boolean
        """
        return all([self.has_common_property(prop=k, value=props.get(k)) for k in props])

    def __eq__(self, other):
        # The order of the holders inside a pair does not matter
        return other is not None and sorted(list(self)) == sorted(list(other))

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Built from the sorted names: equal pairs must have equal hashes, and equality 
        # ignores the order of the holders
        return hash(tuple(sorted(str(i) for i in self)))

    def __str__(self):
        return " vs ".join(self.pair_names)

    def __repr__(self):
        return "DataHolderPair({})".format(str(self))

    def __iter__(self):
        for _dh in (self.data_holder_1, self.data_holder_2):
            yield _dh

# Every unordered pair of distinct groups
data_holder_pairs = [
    DataHolderPair(first_holder, second_holder) 
    for first_holder, second_holder in pair_flat_list(data_holders)]
# Property filters {"age": ..., "diagnosis": ...}, one per combination of age and diagnosis
data_holder_pairs_grouped_by_sample_source = [
    dict(zip(["age", "diagnosis"], combination)) 
    for combination in product(table_ages, table_diagnoses)]

In [12]:
class PatientDataHolder:
    """
    Holds per-patient data
    """
    def __init__(self, name: str):
        """
        :param name: Patient identifier, as found in the "patient_ids" column
        """
        self.name = name
        raw_ds = table_ds.loc[table_ds["patient_ids"] == self.name]
        self.sample_names, sample_sources = [sorted(raw_ds[i].unique()) 
                                             for i in ("sample_names", "sample_sources")]
        props = ["patient_diagnoses", "patient_ages"]
        # A patient must have exactly two samples from two sources and a single 
        # diagnosis and age group
        assert all(len(i) == 2 for i in (self.sample_names, sample_sources)) and all(len(
            raw_ds[j].unique()) == 1 for j in props)
        self.diagnosis, self.age = raw_ds.loc[:, props].values[0]
        # Samples in rows, taxa in columns, absent taxa counted as 0
        self.taxonomy_df = raw_ds.pivot_table(
            index=["sample_names", "sample_sources"], columns="taxa", 
            values="relative_abundances").dropna(axis=1, how="all").fillna(0)
        # Distances between the two samples of the patient
        distance_dict = get_distances_dict(self.taxonomy_df.values.tolist())
        distance_dict.update(dict(sample_names=" vs ".join(self.sample_names), 
                                  patient_diagnoses=self.diagnosis, patient_ages=self.age))
        self.distance_df = pd.DataFrame(pd.Series(distance_dict).rename(name)).rename_axis(
            index="metrics", columns="patient_ids").transpose()
        # Taxa occurring more than once among the rows of the patient
        self.common_taxa = get_counter_df(raw_ds["taxa"]).where(
            lambda x: x > 1).dropna().index.values
        # Stays None when the patient lacks a "blood" or a "stool" sample, so that the 
        # attribute always exists
        self.blood_to_stool_series = None
        try:
            self.blood_to_stool_series = self.taxonomy_df.xs(
                "blood", level="sample_sources").iloc[0].divide(self.taxonomy_df.xs(
                "stool", level="sample_sources").iloc[0]).sort_values(ascending=False).rename(name)
        except KeyError:
            # Best effort: show the offending table and go on without the ratio
            print(self.taxonomy_df)

    def __str__(self):
        return "[{}, {}, {}]".format(self.name, self.diagnosis, self.age)

    def __repr__(self):
        return "PatientDataHolder({})".format(str(self))


# The identifiers are sorted so that the holders are created in the same order on every run
patient_data_holders = [PatientDataHolder(i) for i in sorted(
    set(table_ds["patient_ids"].values.tolist()))]

In [13]:
# The metric names are the keys returned by get_distances_dict(); a fixed pair of vectors 
# is enough to obtain them
distance_metrics = list(get_distances_dict([[1.0, 0.0], [0.0, 1.0]]))
# One row per patient, holding the distances between the two samples of the patient
whole_distance_metric_df = pd.concat([i.distance_df for i in patient_data_holders], axis=0, 
                                     sort=False).sort_index()
distance_dir = os.path.join(output_dir, "distances")

Utilities.dump_tsv(whole_distance_metric_df.reset_index(), 
                   os.path.join(distance_dir, "whole_distance_metrics.tsv"))

# One box plot per metric: age groups along the x axis, diagnoses as hue
for distance_metric in distance_metrics:
    sns.set()
    plt.rcParams["figure.figsize"] = (10, 10)
    ax = sns.boxplot(data=whole_distance_metric_df, x="patient_ages", y=distance_metric, 
                     hue="patient_diagnoses", palette="Set3")
    img_title = "{} between {} samples per patient".format(
        distance_metric, " and ".join(table_sample_sources))
    _ = ax.set_title(img_title, fontsize = 20)
    plt.tight_layout()
    
    plt.savefig("{}.png".format(os.path.join(distance_dir, img_title)), dpi=300)
    plt.clf()
    plt.close()

In [14]:
# For every distance metric and age group the per-patient distances of every pair of 
# diagnoses are compared; the p-values of one (metric, age group) family are then 
# corrected together
diagnosis_pairs = pair_flat_list(table_diagnoses)
distance_metric_comparison_frames = []
for distance_metric in distance_metrics:
    for age_group in table_ages:
        diagnosis_pair_records = []
        for diagnosis_pair in diagnosis_pairs:
            diagnosis_pair_dict = dict(distance_metrics=distance_metric, patient_ages=age_group, 
                                       diagnoses=" vs ".join(diagnosis_pair))
            diagnosis_pair_2d_arr = [whole_distance_metric_df.query(
                "patient_ages == '{}' and patient_diagnoses == '{}'".format(
                    age_group, i))[distance_metric].values for i in diagnosis_pair]
            diagnosis_pair_dict.update(get_prevalents_dict(diagnosis_pair_2d_arr, diagnosis_pair))
            diagnosis_pair_dict.update(get_u_test_dict(diagnosis_pair_2d_arr))
            diagnosis_pair_records.append(diagnosis_pair_dict)
        if len(diagnosis_pair_records) == 0:
            # Fewer than two diagnoses: there is nothing to compare
            continue
        # One row per comparison. The default integer index gives every comparison its 
        # own label, so the correction results below are joined to the right rows; 
        # dtype=object keeps the values exactly as they were computed
        diagnosis_pair_comparison_ds = pd.DataFrame(diagnosis_pair_records, dtype=object)
        diagnosis_pair_multi_test_df = get_multi_test_df(
            diagnosis_pair_comparison_ds["p_value"].values, 
            diagnosis_pair_comparison_ds.index)
        diagnosis_pair_comparison_ds = pd.concat([diagnosis_pair_comparison_ds, 
                                                  diagnosis_pair_multi_test_df], axis=1, sort=False)
        distance_metric_comparison_frames.append(diagnosis_pair_comparison_ds)

# All families are stacked once instead of growing the data set inside the loops
if len(distance_metric_comparison_frames) > 0:
    distance_metric_comparison_ds = pd.concat(
        distance_metric_comparison_frames, axis=0, sort=False, ignore_index=True)
else:
    distance_metric_comparison_ds = pd.DataFrame()

Utilities.dump_tsv(distance_metric_comparison_ds, os.path.join(
    distance_dir, "distance_metric_comparison_dataset.tsv"))

In [15]:
# Per distance metric: number of comparisons flagged as significant, counted separately 
# for every "is_significant_*" column
distance_metric_qualifying_df = pd.concat(
    [pd.DataFrame()] + [
        distance_metric_comparison_ds.loc[
            distance_metric_comparison_ds["distance_metrics"] == distance_metric, 
            [i for i in distance_metric_comparison_ds.columns if i.startswith("is_significant_")]
        ].astype(int).sum().rename(distance_metric) 
        for distance_metric in distance_metrics], 
    axis=1, sort=False)

distance_metric_qualifying_df = distance_metric_qualifying_df.transpose()

Utilities.dump_tsv(distance_metric_qualifying_df.reset_index(), os.path.join(
    distance_dir, "distance_metric_qualifying_table.tsv"))