In [1]:
import os
from collections import Counter
from itertools import combinations, product

import pandas as pd
import seaborn as sns
import xlrd
from matplotlib import pyplot as plt
from scipy import stats
from scipy.spatial import distance
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.stats import multitest

from meta.scripts.Utilities import Utilities


def sanitize_series(s: pd.Series):
    """
    :param s: Series of numeric values
    :return: Dictionary {label: value} holding only the strictly positive entries of the series
    """
    positive_entries = {}
    for label in s.keys():
        value = s.get(label)
        if value > 0:
            positive_entries[label] = value
    return positive_entries


def dict_list_to_dataframe(list_: list, key_column_name: str, value_column_name: str, 
                           sort: bool = False):
    """
    Converts a list of flat dictionaries (records) into a DataFrame indexed by one of the keys

    :param list_: List of dictionaries, each holding at least the key column
    :param key_column_name: Dictionary key which becomes the (sorted) index
    :param value_column_name: Column used for the descending sort when 'sort' is True
    :param sort: If True, rows are ordered by 'value_column_name', largest first
    :return: DataFrame with the columns axis named "values"
    """
    if len(list_) == 0:
        # No records means no columns to index by: build the empty frame explicitly
        # instead of letting set_index() fail with a KeyError
        out = pd.DataFrame(columns=[key_column_name, value_column_name])
    else:
        out = pd.DataFrame(list_)
    out = out.set_index(key_column_name).sort_index()
    out.columns.name = "values"
    if sort:
        out = out.sort_values(value_column_name, ascending=False)
    return out

def get_u_test_dict(two_dim_array: list):
    """
    Runs the two-sided Mann-Whitney U test on a pair of samples

    :param two_dim_array: Two sequences of numbers, one per compared sample
    :return: Dictionary with the keys "p_values" (1 if the test cannot tell the samples apart
        by construction) and "is_significant_for_single" (p-value below the notebook-level
        SINGLE_COMPARISON_ALPHA)
    """
    out = dict(p_values=1)
    samples = [list(i) for i in two_dim_array]
    pooled_values = [value for sample in samples for value in sample]
    # The test is degenerate when a sample is empty or when every pooled value is the same
    # (e.g. all zeros): keep the neutral p-value of 1 instead of calling scipy
    is_testable = all(len(sample) > 0 for sample in samples) and len(set(pooled_values)) > 1
    if is_testable:
        # The alternative is spelled out because scipy's default changed between releases
        # (older versions reported a one-sided p-value)
        out["p_values"] = stats.mannwhitneyu(*samples, alternative="two-sided").pvalue
    out["is_significant_for_single"] = out["p_values"] < SINGLE_COMPARISON_ALPHA
    return out

def pair_flat_list(list_: list):
    """
    :param list_: Flat collection of mutually sortable items, duplicates allowed
    :return: List of all unordered pairs of distinct items; each pair is a sorted 2-item list
        and the pairs themselves come in ascending order
    """
    unique_items = sorted(set(list_))
    return [list(pair) for pair in combinations(unique_items, 2)]

def get_prevalent_list_number(two_dim_array: list, func):
    """
    Scores every inner list with the given function and reports which one scored highest

    :param two_dim_array: 2D array
    :param func: Scoring function applied to each inner list, e.g. sum() or numpy.median()
    :return: Zero-based position of the first inner list reaching the highest score
    """
    scores = list(map(func, two_dim_array))
    top_score = max(scores)
    for position, score in enumerate(scores):
        # Same matching rule as list.index(): identity first, equality second
        if score is top_score or score == top_score:
            return position

In [2]:
# Project directory: holds the input workbook and receives every output of the notebook
output_dir = "/data1/bio/projects/tgrigoreva/stool_to_blood"
table_file = os.path.join(output_dir, "stool_blood_paired_only.xlsx")
# 'encoding' is not a read_excel() option (workbooks carry their own encoding);
# recent pandas releases reject the unknown keyword, so it is not passed
table_df = pd.read_excel(table_file)
# Drop the columns with blank or auto-generated ("Unnamed: N") headers
table_df = table_df.loc[:, [i for i in table_df.columns 
                            if len(i.strip()) > 0 and "unnamed" not in i.lower()]]
table_df = table_df.set_index("ID")
# The "Диагноз" values are split on "_": the first part is stored as the age group,
# the last part as the diagnosis
table_df["age"] = table_df["Диагноз"].apply(lambda x: x.split("_")[0])
table_df["diagnosis"] = table_df["Диагноз"].apply(lambda x: x.split("_")[-1])
# Taxonomy columns are recognised by their leading "k"
taxonomy_columns = [i for i in table_df.columns if i.startswith("k")]
# Translate the Russian labels into the English ones used in the rest of the notebook
table_df = table_df.replace({"ЯК": "colitis", "здоровый": "normal", "ожирение": "obesity", 
                             "Ребенок": "child", "Взрослый": "adult"})

In [3]:
# Per sample type, keep only the taxa whose summed value over all samples is positive
stool_sum_series = table_df.loc[table_df["type"] == "stool", taxonomy_columns].sum(axis=0)
stool_taxonomy_df = table_df.loc[table_df["type"] == "stool", 
                                 list(sanitize_series(stool_sum_series))]

blood_sum_series = table_df.loc[table_df["type"] == "blood", taxonomy_columns].sum(axis=0)
blood_taxonomy_df = table_df.loc[table_df["type"] == "blood", 
                                 list(sanitize_series(blood_sum_series))]

# Taxa positive in both sample types, for all samples. The stool column order is followed 
# (instead of set iteration order) so that the column order is the same on every run
blood_taxa = set(blood_taxonomy_df.columns)
common_taxonomy_df = table_df.loc[:, [i for i in stool_taxonomy_df.columns if i in blood_taxa]]
common_taxonomy_df.columns.name = "taxa"

In [4]:
class DataHolder:
    """
    Holds per-group data, a group being one (sample type, diagnosis, age) combination

    Reads the notebook-level 'common_taxonomy_df' on creation; export() additionally uses 
    the notebook-level 'output_dir' and 'Utilities'
    """
    def __init__(self, indices: list, sample_type: str, diagnosis: str, age: str):
        """
        :param indices: Row labels of 'common_taxonomy_df' which belong to the group
        :param sample_type: Sample type label; "stool" is drawn in blue, anything else in red
        :param diagnosis: Diagnosis label; "obesity" is drawn with squares, "colitis" with 
            triangles, anything else with circles
        :param age: Age label; "child" groups are drawn with hollow markers
        """
        self.sample_type = sample_type
        self.diagnosis = diagnosis
        self.age = age  
        self.indices = sorted(indices)
        self.taxonomy_df = common_taxonomy_df.loc[self.indices, :].sort_index()
        # Scatter plot style encoding the three group labels
        if self.sample_type.strip() == "stool":
            self.edgecolors = "blue"
        else:
            self.edgecolors = "red"
        if self.diagnosis.strip() == "obesity":
            self.marker = "s"
        elif self.diagnosis.strip() == "colitis":
            self.marker = "^"
        else:
            self.marker = "o"
        if self.age.strip() == "child":
            self.facecolors = "none"
        else:
            self.facecolors = self.edgecolors
        # More payload
        # Taxa names, each listed once per sample in which its value is positive
        self.words_dump = []
        for sample_id in self.indices:
            self.words_dump.extend(sanitize_series(
                self.taxonomy_df.loc[[sample_id], :].sum(axis=0)).keys())
        # Per-taxon medians and sums over the group samples, positive values only
        self.median_df = self.series_to_df(self.taxonomy_df.median(axis=0), "taxa", "median")
        self.sum_df = self.series_to_df(self.taxonomy_df.sum(axis=0), "taxa", "sum")

    @staticmethod
    def series_to_df(series: pd.Series, key_column_name: str, value_column_name: str):
        """
        :param series: Series of per-taxon values
        :param key_column_name: Name given to the index built from the series labels
        :param value_column_name: Name given to the column holding the series values
        :return: DataFrame of the strictly positive entries, largest value first
        """
        d = sanitize_series(series)
        lst_ = [{key_column_name: k, value_column_name: d.get(k)} for k in d]
        return dict_list_to_dataframe(lst_, key_column_name, value_column_name, sort=True)

    def get_major_df(self, n: int = 50):
        """
        :param n: Number of the most frequently occurring taxa to report
        :return: DataFrame with the columns "word" (taxon) and "word_counts" (number of group 
            samples where the taxon is positive), most frequent first; empty, but with both 
            columns, if no taxon is positive in any sample
        """
        counter = Counter(self.words_dump)
        # [('a', 0), ('b', 1), ('c', 2) ...]
        # The columns are named explicitly so that sorting also works without any record
        out_df = pd.DataFrame([{"word": i[0], "word_counts": i[1]} 
                               for i in counter.most_common(n)], 
                              columns=["word", "word_counts"]).sort_values(
            "word_counts", ascending=False)
        return out_df

    def __str__(self):
        return "[{}]".format(", ".join([self.age, self.diagnosis, self.sample_type]))

    def finalize_df(self, df: pd.DataFrame):
        """
        :param df: Any DataFrame
        :return: Copy of the DataFrame with the constant columns "age", "diagnosis" and 
            "sample_type" set to the group labels
        """
        out_df = df.copy()
        for col_name, value in zip(["age", "diagnosis", "sample_type"], 
                                   [self.age, self.diagnosis, self.sample_type]):
            out_df[col_name] = value
        return out_df
    
    def export(self):
        """
        Dumps the occurrence table and, if not empty, the median and sum tables as TSV files 
        into sub-directories of 'output_dir'
        """
        Utilities.dump_tsv(self.get_major_df(999), os.path.join(
            output_dir, "majors_by_occurrence", "occurrence_only_positive_{}.tsv".format(
                str(self))))
        if len(self.median_df.index) > 0:
            Utilities.dump_tsv(self.median_df.reset_index(), os.path.join(
                output_dir, "majors by median", "positive median values for {}.tsv".format(
                    str(self))))
        if len(self.sum_df.index) > 0:
            Utilities.dump_tsv(self.sum_df.reset_index(), os.path.join(
                output_dir, "majors by sum", "positive sum values for {}.tsv".format(str(self))))


# One DataHolder per existing (diagnosis, age, sample type) combination. 
# groupby() yields the groups in sorted key order, so the list order (and with it e.g. 
# the plot legend order) is the same on every run
data_holders = []
for (diagnosis_, age_, type_), type_age_diagnosis_sub_df in table_df.groupby(
        ["diagnosis", "age", "type"]):
    data_holders.append(DataHolder(indices=type_age_diagnosis_sub_df.index.values,
                                   sample_type=type_, diagnosis=diagnosis_, age=age_))

In [5]:
# Standardize every taxon column, then project all samples onto two principal components
scaled_common_taxonomy_df = pd.DataFrame(StandardScaler().fit_transform(common_taxonomy_df), 
                                         columns=common_taxonomy_df.columns, 
                                         index=common_taxonomy_df.index)
pca = PCA(n_components=2)
pca_df = pd.DataFrame(pca.fit_transform(scaled_common_taxonomy_df), 
                      columns=["PCA {}".format(i) for i in range(1, 3)], 
                      index=scaled_common_taxonomy_df.index)

plt.rcParams["figure.figsize"] = (28, 20)
fig, ax = plt.subplots()
ax.set_xlabel("Principal Component 1", fontsize = 15)
ax.set_ylabel("Principal Component 2", fontsize = 15)
ax.set_title("2-component PCA", fontsize = 20)

for data_holder in data_holders:
    # Coordinates are taken as flat 1D arrays, so that each annotation gets plain 
    # scalar coordinates rather than 1-element arrays
    pca_x = pca_df.loc[data_holder.indices, pca_df.columns[0]].values
    pca_y = pca_df.loc[data_holder.indices, pca_df.columns[1]].values
    ax.scatter(facecolors=data_holder.facecolors, edgecolors=data_holder.edgecolors, 
               marker=data_holder.marker, label=str(data_holder), x=pca_x, y=pca_y)
    # Label every point with its sample ID
    for txt, point_x, point_y in zip(data_holder.indices, pca_x, pca_y):
        ax.annotate(txt, (point_x, point_y), fontsize="xx-small")
    # The per-group tables are dumped within the same pass
    data_holder.export()

ax.legend()
ax.grid()

fig.savefig(os.path.join(output_dir, "blood_AND_stool_ONLY_positive_pca.png"), dpi=300)
plt.close(fig)

In [6]:
# Significance level: get_u_test_dict() compares each single p-value against it, 
# and it is also passed as 'alpha' to the multiple testing correction
SINGLE_COMPARISON_ALPHA = 0.05


class DataHolderPair:
    """
    Holds super-group data as DataHolders with the common diagnosis and age values

    The two DataHolders differ in the sample type only and are kept ordered by it. 
    Reads the notebook-level 'common_taxonomy_df'; export() additionally uses the 
    notebook-level 'output_dir' and 'Utilities'
    """
    def __init__(self, dh1: "DataHolder", dh2: "DataHolder"):
        """
        :param dh1: DataHolder of the first sample type
        :param dh2: DataHolder of the other sample type, with the same diagnosis and age
        """
        self.data_holder_1, self.data_holder_2 = sorted([dh1, dh2], key=lambda x: x.sample_type)
        self.diagnosis = self.data_holder_1.diagnosis
        self.age = self.data_holder_1.age  
        assert len(set(i.diagnosis for i in (self, *self.to_list()))) == 1
        assert len(set(i.age for i in (self, *self.to_list()))) == 1
        assert self.data_holder_1.sample_type != self.data_holder_2.sample_type
        self.indices = tuple([i.indices for i in self.to_list()])
        self.sample_types = tuple([i.sample_type for i in self.to_list()])
        # Side-by-side per-taxon tables, one column per sample type; 
        # a taxon missing from one of the groups gets 0
        self.median_df = pd.concat([i.median_df.rename(columns={"median": i.sample_type}) 
                                    for i in self.to_list()], axis=1, sort=False).fillna(0)
        self.median_df.index.names = ["taxa"]
        self.median_df.columns.names = ["sample_type"]
        self.sum_df = pd.concat([i.sum_df.rename(columns={"sum": i.sample_type}) 
                                 for i in self.to_list()], axis=1, sort=False).fillna(0)
        self.sum_df.index.names = ["taxa"]
        self.sum_df.columns.names = ["sample_type"]
        # Group-level totals used to tell which sample type prevails
        medians = [i.median_df["median"].sum() for i in self.to_list()]
        sums = [i.sum_df["sum"].sum() for i in self.to_list()]
        self.u_test_df = pd.DataFrame()
        self._make_u_test()
        # NOTE(review): one group-level label is written into every taxon row here - 
        # confirm that per-taxon values were not intended
        self.u_test_df["prevalent_by_median"] = self.sample_types[medians.index(max(medians))]
        self.u_test_df["prevalent_by_sum"] = self.sample_types[sums.index(max(sums))]

    def _make_u_test(self):
        """
        Fills 'u_test_df' with one U-test result row per column of 'common_taxonomy_df', 
        comparing the values of the two groups
        """
        u_test_dicts = []
        for taxon in common_taxonomy_df.columns:
            taxon_values = [i.taxonomy_df.loc[:, taxon].values for i in self.to_list()]
            u_test_dict = get_u_test_dict(taxon_values)
            u_test_dict["taxa"] = taxon
            u_test_dicts.append(u_test_dict)
        self.u_test_df = dict_list_to_dataframe(u_test_dicts, "taxa", "p_values")

    def to_list(self):
        """
        :return: The two DataHolders as a list, ordered by sample type
        """
        return [self.data_holder_1, self.data_holder_2]
    
    def export(self):
        """
        Dumps the median, sum and U-test tables as TSV files into the pair's own 
        sub-directory of 'output_dir'
        """
        out_dir = os.path.join(output_dir, "paired_comparisons", str(self))
        Utilities.dump_tsv(self.median_df.reset_index(), 
                           os.path.join(out_dir, "raw medians for {}.tsv".format(str(self))))
        Utilities.dump_tsv(self.sum_df.reset_index(), 
                           os.path.join(out_dir, "raw sums for {}.tsv".format(str(self))))
        Utilities.dump_tsv(self.u_test_df.reset_index(), 
                           os.path.join(out_dir, "single u-test for {}.tsv".format(str(self))))

    def __eq__(self, other):
        # Foreign objects (None included) are left to Python's default comparison 
        # instead of failing on the missing to_list()
        if not isinstance(other, DataHolderPair):
            return NotImplemented
        return all(i == j for i, j in zip(self.to_list(), other.to_list()))
    
    def __ne__(self, other):
        result = self.__eq__(other)
        return result if result is NotImplemented else not result
    
    def __hash__(self):
        # Lists are unhashable, hence the tuple
        return hash(tuple(self.to_list()))
    
    def __str__(self):
        return " vs ".join([str(i) for i in self.to_list()])


# Pair up the DataHolders sharing diagnosis and age but differing in sample type; 
# every such pair is kept once, ordered by sample type
data_holder_pairs = []
known_pairs = []
for holder_a in data_holders:
    for holder_b in data_holders:
        same_patient_group = (holder_a.diagnosis == holder_b.diagnosis 
                              and holder_a.age == holder_b.age)
        if not same_patient_group or holder_a.sample_type == holder_b.sample_type:
            continue
        data_holder_pair = sorted([holder_a, holder_b], key=lambda x: x.sample_type)
        # The same two holders show up again in the reverse order
        if data_holder_pair in known_pairs:
            continue
        known_pairs.append(data_holder_pair)
        data_holder_pairs.append(DataHolderPair(*data_holder_pair))

In [7]:
# For every pair: dump its tables, then compute, dump and plot the taxon-to-taxon 
# Spearman correlations separately for the median and the sum tables
for data_holder_pair in data_holder_pairs:
    data_holder_pair.export()
    for data_type, data_df in zip(["median", "sum"], 
                                  [data_holder_pair.median_df, data_holder_pair.sum_df]):
        # The table of the current data type is correlated 
        # (the sum table used to be taken for both data types)
        pair_correlation_df = data_df.transpose().corr(method="spearman")
        # Order the rows by the last column and the columns by the last row
        pair_correlation_df = pair_correlation_df.sort_values(pair_correlation_df.columns[-1], 
                                                              axis=0).sort_values(
            pair_correlation_df.index[-1], axis=1)
        pair_correlation_title = "Correlation between {} values for {}".format(
            data_type, str(data_holder_pair))
        pair_correlation_prefix = os.path.join(output_dir, "correlations", 
                                               str(data_holder_pair), pair_correlation_title)
        Utilities.dump_tsv(pair_correlation_df.reset_index(), 
                           "{}.tsv".format(pair_correlation_prefix))
        sns.set()
        plt.rcParams["figure.figsize"] = (20, 20)
        ax = sns.heatmap(pair_correlation_df)
        _ = ax.set_title(pair_correlation_title, fontsize = 20)
        ax.tick_params(labelsize=3)
        plt.tight_layout()
        plt.savefig("{}.png".format(pair_correlation_prefix), dpi=300)
        plt.clf()
        plt.close()

In [8]:
PATIENT_ID_COL_NAME = "N_human"
PATIENT_GROUP_PROPS = ("diagnosis", "age")

# One record per patient: the distances between the taxonomy profiles of the patient's 
# two samples, plus the patient's group labels
distance_list = []
for patient_id in set(table_df[PATIENT_ID_COL_NAME].values.tolist()):
    patient_mask = table_df[PATIENT_ID_COL_NAME] == patient_id
    patient_sub_df = table_df.loc[patient_mask, common_taxonomy_df.columns]
    assert len(patient_sub_df.index.values) == 2, \
        "Patient '{}' must have exactly 2 samples, got {}".format(
            patient_id, len(patient_sub_df.index.values))
    sample_profiles = patient_sub_df.values
    # The key column is named via the constant, so the record always matches the index 
    # column requested from dict_list_to_dataframe() below
    distance_dict = {
        PATIENT_ID_COL_NAME: patient_id, 
        "samples": ",".join(patient_sub_df.index.values), 
        "bray_curtis_dissimilarity": distance.braycurtis(*sample_profiles), 
        "manhattan_distance": distance.cityblock(*sample_profiles), 
        "euclidean_distance": distance.euclidean(*sample_profiles), 
    }
    # Group labels are taken from the patient's first row
    for patient_group_property in PATIENT_GROUP_PROPS:
        distance_dict[patient_group_property] = table_df.loc[
            patient_mask, patient_group_property].values[0]
    distance_list.append(distance_dict)
distance_df = dict_list_to_dataframe(distance_list, PATIENT_ID_COL_NAME, "samples")

distances_dir = os.path.join(output_dir, "distances")
# The index is reset before dumping, as for every other table of the notebook. 
# NOTE(review): presumably dump_tsv() does not write the index - confirm
Utilities.dump_tsv(distance_df.reset_index(), os.path.join(distances_dir, "distances.tsv"))

In [9]:
DISTANCE_COLUMN_NAMES = ("bray_curtis_dissimilarity", "manhattan_distance", "euclidean_distance")

# Sample types are sorted so that the plot titles, and the file names built from them, 
# are the same on every run (set iteration order is not)
sample_types_label = " and ".join(sorted(set(table_df["type"].values)))

# One box plot per distance metric: age groups on the X axis, diagnoses as hue 
# (swap 'x' and 'hue' to group by diagnosis instead)
for distance_column_name in DISTANCE_COLUMN_NAMES:
    sns.set()
    plt.rcParams["figure.figsize"] = (10, 10)
    ax = sns.boxplot(data=distance_df, x="age", y=distance_column_name, hue="diagnosis")
    distance_boxplot_title = "{} between {} samples per patient".format(
        distance_column_name, sample_types_label)
    _ = ax.set_title(distance_boxplot_title, fontsize = 20)
    plt.tight_layout()
    plt.savefig("{}.png".format(os.path.join(distances_dir, distance_boxplot_title)), dpi=300)
    plt.clf()
    plt.close()


In [10]:
# Within every age group, compare the per-patient distances between each pair of diagnoses 
# with the U-test, correct for multiple testing and dump one table per distance and age
diagnosis_pairs = pair_flat_list(distance_df["diagnosis"].values)
for distance_column_name in DISTANCE_COLUMN_NAMES:
    for age_group in set(distance_df["age"].values):
        age_mask = distance_df["age"] == age_group
        diagnosis_by_distance_comparison_list = []
        for diagnosis_pair in diagnosis_pairs:
            # Distances of the age group patients, one array per diagnosis of the pair
            compared_distances = []
            for diagnosis_name in diagnosis_pair:
                diagnosis_mask = distance_df["diagnosis"] == diagnosis_name
                compared_distances.append(
                    distance_df.loc[age_mask & diagnosis_mask, distance_column_name].values)
            diagnosis_pair_dict = {"age": age_group, "diagnoses": " vs ".join(diagnosis_pair)}
            diagnosis_pair_dict.update(get_u_test_dict(compared_distances))
            diagnosis_by_distance_comparison_list.append(diagnosis_pair_dict)
        diagnosis_by_distance_comparison_df = dict_list_to_dataframe(
            diagnosis_by_distance_comparison_list, "diagnoses", "age", sort=True)
        # First item of the multipletests() output: the per-comparison rejection flags
        rejection_flags = multitest.multipletests(
            diagnosis_by_distance_comparison_df["p_values"].values, 
            alpha=SINGLE_COMPARISON_ALPHA, method="fdr_bh")[0]
        diagnosis_by_distance_comparison_df["is_significant_for_multi"] = rejection_flags
        comparison_file_name = "diagnosis-based comparison of {} for {}.tsv".format(
            distance_column_name, age_group)
        Utilities.dump_tsv(diagnosis_by_distance_comparison_df.reset_index(), 
                           os.path.join(distances_dir, "comparisons", comparison_file_name))