In [1]:
import os
import xlrd
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from collections import Counter
os.chdir("..")
from meta.scripts.Utilities import Utilities

In [2]:
# Load the paired stool/blood table and derive per-sample metadata columns.
output_dir = "/data1/bio/projects/tgrigoreva/stool_to_blood"
table_file = os.path.join(output_dir, "stool_blood_paired_only.xlsx")
# No `encoding` argument: recent pandas releases reject it in read_excel with a TypeError,
# and it has no meaning for the binary .xlsx format anyway.
table_df = pd.read_excel(table_file)
# Drop columns whose label is blank or contains "unnamed" (presumably the "Unnamed: N"
# placeholders of header-less spreadsheet columns). str() keeps the filter safe for
# non-string labels.
named_columns = [c for c in table_df.columns if str(c).strip() and "unnamed" not in str(c).lower()]
table_df = table_df.loc[:, named_columns].set_index("ID")
# The "Диагноз" column is split on "_": the first token becomes Age, the last one Diagnosis
# (presumably values look like "<age>_<diagnosis>" - confirm against the spreadsheet).
# The vectorised .str accessor propagates missing values instead of raising on them.
diagnosis_tokens = table_df["Диагноз"].str.split("_")
table_df["Age"] = diagnosis_tokens.str[0]
table_df["Diagnosis"] = diagnosis_tokens.str[-1]
# Columns whose name starts with "k" are treated as taxonomy abundance columns
# (presumably lineage strings beginning with the kingdom rank - TODO confirm).
taxonomy_columns = [i for i in table_df.columns if str(i).startswith("k")]
# Translate the Russian labels into the English ones the rest of the notebook compares against.
table_df = table_df.replace({"ЯК": "colitis", "здоровый": "healthy", "ожирение": "obesity",
                             "Ребенок": "child", "Взрослый": "adult"})

In [3]:
def sanitize_series(s: pd.Series):
    """
    Reduce a series to its strictly positive entries.

    :param s: series to filter
    :return: dict mapping each label of `s` whose value is greater than zero to that value
    """
    positive_entries = {}
    for label in s.keys():
        value = s.get(label)
        if value > 0:
            positive_entries[label] = value
    return positive_entries


# For each sample type keep only the taxonomy columns whose total over that type's samples is positive.
is_stool = table_df["type"] == "stool"
stool_sum_series = table_df.loc[is_stool, taxonomy_columns].sum(axis=0)
# list(...) turns the dict returned by sanitize_series into a plain list of column labels for .loc
stool_taxonomy_df = table_df.loc[is_stool, list(sanitize_series(stool_sum_series))]

is_blood = table_df["type"] == "blood"
blood_sum_series = table_df.loc[is_blood, taxonomy_columns].sum(axis=0)
blood_taxonomy_df = table_df.loc[is_blood, list(sanitize_series(blood_sum_series))]

# Taxa present in both sample types. Iterating the stool columns (rather than a set) keeps the
# column order identical between runs; the blood set is built once for O(1) membership tests.
blood_columns = set(blood_taxonomy_df.columns)
common_taxonomy_columns = [i for i in stool_taxonomy_df.columns if i in blood_columns]
common_taxonomy_df = table_df.loc[:, common_taxonomy_columns]

In [4]:
# Standardise every shared taxonomy column with StandardScaler (default settings),
# keeping the original sample index and column labels.
scaled_values = StandardScaler().fit_transform(common_taxonomy_df)
scaled_common_taxonomy_df = pd.DataFrame(
    scaled_values,
    index=common_taxonomy_df.index,
    columns=common_taxonomy_df.columns,
)

# Project the scaled samples onto two principal components, one row per sample.
pca = PCA(n_components=2)
pca_scores = pca.fit_transform(scaled_common_taxonomy_df)
pca_column_names = ["PCA {}".format(component_no) for component_no in (1, 2)]
pca_df = pd.DataFrame(
    pca_scores,
    index=scaled_common_taxonomy_df.index,
    columns=pca_column_names,
)

In [5]:
# print(set(table_df["type"].values), set(table_df["Diagnosis"].values), set(table_df["Age"].values))

class DataHolder:
    """
    Holds per-group data

    A group is one (age, diagnosis, sample type) combination. The holder stores the group's
    sample labels, the matplotlib styling derived from the group attributes, and containers
    that later cells fill with per-group results.
    """
    def __init__(self, indices: list, sample_type: str, diagnosis: str, age: str):
        """
        :param indices: sample labels of the group (any iterable of sortable values); stored sorted
        :param sample_type: "stool" selects blue edges, anything else red
        :param diagnosis: "obesity" selects square markers, "colitis" triangles, anything else circles
        :param age: "child" selects hollow markers, anything else markers filled with the edge colour
        """
        self.sample_type = sample_type
        self.diagnosis = diagnosis
        self.age = age
        self.indices = sorted(indices)
        # All comparisons ignore surrounding whitespace in the group attributes.
        # Edge colour encodes the sample type.
        self.edgecolors = "blue" if self.sample_type.strip() == "stool" else "red"
        # Marker shape encodes the diagnosis.
        self.marker = {"obesity": "s", "colitis": "^"}.get(self.diagnosis.strip(), "o")
        # Fill encodes the age group.
        self.facecolors = "none" if self.age.strip() == "child" else self.edgecolors
        # More payload
        self.words_dump = []
        # Both are later replaced with {taxon: value} dicts, so they start out as empty dicts.
        self.median_data = {}
        self.sum_data = {}

    def get_group_name(self):
        """
        :return: "<age>, <diagnosis>, <sample_type>" label of the group
        """
        return ", ".join([self.age, self.diagnosis, self.sample_type])

    def get_major_df(self, n: int = 50):
        """
        Count how often each entry occurs in `words_dump`.

        :param n: maximal number of most frequent entries to report
        :return: DataFrame with columns "taxa" and "occurrences_per_group", most frequent first;
                 the columns are present even when `words_dump` is empty
        """
        counter = Counter(self.words_dump)
        # most_common() yields (entry, count) pairs: [('a', 2), ('b', 1), ...]
        records = [{"taxa": taxon, "occurrences_per_group": occurrences}
                   for taxon, occurrences in counter.most_common(n)]
        # Explicit columns keep the schema stable for an empty group
        return pd.DataFrame(records, columns=["taxa", "occurrences_per_group"])

    def finalize_df(self, df: pd.DataFrame):
        """
        :param df: DataFrame to annotate; it is not modified
        :return: copy of `df` with constant "age", "diagnosis" and "sample_type" columns added
        """
        out_df = df.copy()
        for col_name, value in zip(["age", "diagnosis", "sample_type"], [self.age, self.diagnosis, self.sample_type]):
            out_df[col_name] = value
        return out_df


# One DataHolder per existing (diagnosis, age, sample type) combination.
# groupby yields only the combinations present in the table, in sorted key order, so the group
# order (and with it the legend and draw order of the plot) is the same on every run.
# Iterating nested set(...) objects gives an order that can change between interpreter runs.
data_holders = []
for (diagnosis_, age_, type_), group_df in table_df.groupby(["Diagnosis", "Age", "type"], sort=True):
    data_holders.append(DataHolder(indices=group_df.index.values,
                                   sample_type=type_, diagnosis=diagnosis_, age=age_))

In [6]:
# Scatter plot of the two PCA components, one styled series per group, saved as PNG.
plt.rcParams["figure.figsize"] = (28, 20)
fig, ax = plt.subplots()
ax.set_xlabel("Principal Component 1", fontsize = 15)
ax.set_ylabel("Principal Component 2", fontsize = 15)
ax.set_title("2-component PCA", fontsize = 20)

x_column, y_column = pca_df.columns[:2]
for data_holder in data_holders:
    # Select each component as a 1-D array. Selecting with a list of columns gives (n, 1)
    # arrays, which made annotate() receive 1-element arrays instead of scalar coordinates.
    group_pca_df = pca_df.loc[data_holder.indices, :]
    pca_x = group_pca_df[x_column].values
    pca_y = group_pca_df[y_column].values
    ax.scatter(facecolors=data_holder.facecolors, edgecolors=data_holder.edgecolors, marker=data_holder.marker,
               label=data_holder.get_group_name(), x=pca_x, y=pca_y)
    # Label every point with its sample index
    for txt, point_x, point_y in zip(data_holder.indices, pca_x, pca_y):
        ax.annotate(txt, (point_x, point_y), fontsize="xx-small")

ax.legend()
ax.grid()

# Save and release this specific figure rather than relying on pyplot's "current figure" state
fig.savefig(os.path.join(output_dir, "blood_AND_stool_ONLY_positive_pca.png"), dpi=300)
plt.close(fig)

In [7]:
# For every group, record which taxa have a positive value in each of its samples and
# dump the per-group occurrence counts.
for data_holder in data_holders:
    # Collect into a fresh list and assign it, so re-running the cell does not double the counts
    group_words = []
    for common_taxonomy_idx in data_holder.indices:
        # Selecting with [label] always gives a frame, so sum(axis=0) yields one value per taxon
        sample_totals = common_taxonomy_df.loc[[common_taxonomy_idx], :].sum(axis=0)
        group_words.extend(sanitize_series(sample_totals).keys())
    data_holder.words_dump = group_words
    # Dump once per group, after all of its samples are counted. Doing it inside the sample
    # loop rewrote the same file once per sample.
    # Utilities.dump_tsv is a project helper; presumably it writes the frame as a TSV file.
    Utilities.dump_tsv(data_holder.get_major_df(999),
                       os.path.join(output_dir, "majors_by_occurrence",
                                    "occurrence_only_positive_{}.tsv".format(data_holder.get_group_name())))

In [8]:
# For every group, store the taxa with a positive per-group median / sum and dump each
# statistic, sorted in descending order.
for data_holder in data_holders:
    group_taxonomy_df = common_taxonomy_df.loc[data_holder.indices, :]
    data_holder.median_data = sanitize_series(group_taxonomy_df.median(axis=0))
    data_holder.sum_data = sanitize_series(group_taxonomy_df.sum(axis=0))
    for stat_name, stat_data in (("median", data_holder.median_data), ("sum", data_holder.sum_data)):
        # Explicit columns keep sort_values working when no taxon passed the positive filter.
        # A frame built from an empty record list has no columns at all.
        stat_df = pd.DataFrame(list(stat_data.items()), columns=["taxa", stat_name])
        # Utilities.dump_tsv is a project helper; presumably it writes the frame as a TSV file.
        Utilities.dump_tsv(stat_df.sort_values(stat_name, ascending=False),
                           os.path.join(output_dir, "majors_by_{}".format(stat_name),
                                        "{}_only_positive_{}.tsv".format(stat_name, data_holder.get_group_name())))