In [1]:
import os
import re
import numpy as np
import pandas as pd
import xlrd
import seaborn as sns
from scipy import stats
from matplotlib import pyplot as plt
from meta.scripts.Utilities import Utilities

In [2]:
def process_sample_number(s: str):
    """
    Split a raw "<sample number>/<group code>" label into annotation fields.

    The text before the slash becomes "sample_number". The first character of the
    group code adds age="adult" when it is "В"; the second character adds
    group="obesity" for "О" or group="normal" for "З". Any other character simply
    leaves the corresponding field out of the result.

    :param s: label containing exactly one "/" separator
    :return: pandas Series holding the fields that could be derived
    :raises ValueError: if the label does not split into exactly two parts
    :raises IndexError: if the group code is shorter than two characters
    """
    sample_number, group_code = (part.strip() for part in s.split("/"))
    fields = {"sample_number": sample_number}
    if group_code[0] == "В":
        fields["age"] = "adult"
    group = {"О": "obesity", "З": "normal"}.get(group_code[1])
    if group is not None:
        fields["group"] = group
    return pd.Series(fields)


def fix_exponentials(s: str):
    """
    Convert a cell value to float, repairing exponent notation that float() rejects.

    Decimal commas are replaced with dots first, and anything float() accepts is
    returned directly. Otherwise the text is read as "<base><separator><power>",
    where the separator is any run of characters other than digits and dots (for
    example a Cyrillic "е" typed instead of the Latin "e"), and the result is
    base * 10 ** power. A "-" after the base marks a negative power; a leading
    "-" negates the whole value.

    :param s: value to convert; it is passed through str() first
    :return: the parsed float
    :raises ValueError: if the text is neither a plain number nor a base/power pair
    """
    text = str(s).strip().lower().replace(",", ".")
    try:
        return float(text)
    except ValueError:
        pass
    # Peel off the sign of the base first: left in place it produces an empty leading
    # token in the split below and would also be mistaken for a negative exponent.
    sign, body = 1.0, text
    if text[:1] in ("+", "-"):
        sign = -1.0 if text[0] == "-" else 1.0
        body = text[1:].lstrip()
    base, power = [float(token) for token in re.split("[^0-9.]+", body)]
    scale = 10 ** power
    # Only a minus sign that follows the base can belong to the exponent.
    magnitude = base / scale if "-" in body else base * scale
    return sign * magnitude


def get_statistical_test_series(two_dim_array: list, func=stats.mannwhitneyu, alpha: float = 0.05):
    """
    Run a statistical test on the given samples and report its p-value.

    The test is executed only when the grand total of all values is positive;
    otherwise the p-value is set to 1 without calling `func`.

    :param two_dim_array: sequence of samples, unpacked as positional arguments of `func`
    :param func: test function whose result exposes a `pvalue` attribute
    :param alpha: significance threshold for a single comparison
    :return: pandas Series with "p_value" and "is_significant_for_single_comparison"
    """
    # Use with the 'stats' package only!
    grand_total = sum(sum(sample) for sample in two_dim_array)
    p_value = func(*two_dim_array).pvalue if grand_total > 0 else 1
    return pd.Series({
        "p_value": p_value,
        "is_significant_for_single_comparison": p_value < alpha,
    })

In [3]:
# Project root: raw tables are read from "<root>/raw", correlation outputs go to "<root>/correlation".
PROJECT_DIR = "/data1/bio/projects/ashestopalov/nutrition/obesity_elisa"
# Lower-case Russian abbreviations used as clinical column headers -> English column names.
# NOTE(review): "тг" sits among lipid-panel columns, where it conventionally abbreviates
# triglycerides rather than thyroglobulin -- confirm with the data owner before renaming,
# because the label ends up in every exported table and plot.
COMMON_CLINICAL_DATA_COL_NAMES = {
    "имт": "body_mass_index", "от": "waist_circumference", "глюкоза": "glucose",
    "лпвп": "high-density_lipoproteins", "тг": "thyroglobulin",
    "ох": "total_blood_cholesterol", "лпнп": "low-density_lipoproteins",
    "сад": "systolic_blood_pressure", "дад": "diastolic_blood_pressure"
}
raw_data_dir = os.path.join(PROJECT_DIR, "raw")
correlation_dir = os.path.join(PROJECT_DIR, "correlation")

# Per-file tables are collected here and concatenated once after the loop instead of
# growing a DataFrame on every iteration.
raw_table_dfs = []
# Utilities.scan_whole_dir presumably yields file paths as strings (each item is used for
# substring checks and handed to pd.read_excel) -- verify against the helper.
for raw_table_file in Utilities.scan_whole_dir(raw_data_dir):
    # Files with "obschaya" in the path are skipped unless the path also contains "kontrol'".
    if "obschaya" in raw_table_file and "kontrol'" not in raw_table_file:
        continue
    # No `encoding` keyword here: pandas.read_excel stopped accepting it in 0.25
    # (TypeError), and it never influenced how Excel files were decoded.
    raw_table_df = pd.read_excel(raw_table_file).dropna(how="all")
    # "охс" is an alternative header for "ох"; unify it before translating the headers.
    raw_table_df = raw_table_df.rename(columns={"охс": "ох"}).rename(
        columns=COMMON_CLINICAL_DATA_COL_NAMES)
    # Append the fields parsed from the sample label (sample_number, age, group) as columns.
    raw_table_df = pd.concat(
        [raw_table_df, raw_table_df["№ образца"].apply(process_sample_number)],
        axis=1, sort=False)
    # The file name either assigns an obesity subgroup or overrides the parsed group.
    if "MZO" in raw_table_file:
        raw_table_df["subgroup"] = "metabolically-healthy_obesity"
    elif "MNZ" in raw_table_file:
        raw_table_df["subgroup"] = "metabolically-pathological_obesity"
    elif "wzroslye s ozhireniem'" in raw_table_file:
        raw_table_df["group"] = "obesity"
    elif "kontrol'" in raw_table_file:
        raw_table_df["group"] = "normal"
    # Every value other than "ж" (after trimming and lower-casing) is labelled "male".
    raw_table_df["gender"] = raw_table_df["пол"].apply(
        lambda x: "female" if x.strip().lower() == "ж" else "male")
    raw_table_df = raw_table_df.set_index("sample_number")
    raw_table_dfs.append(raw_table_df)

if not raw_table_dfs:
    raise ValueError("No raw tables were found in '{}'".format(raw_data_dir))

# sort_index keeps the columns sorted even when every input table has identical columns.
whole_raw_df = pd.concat(raw_table_dfs, axis=0, sort=True).sort_index(axis=1).rename_axis(
    index="sample_number", columns="multiplex_value")

raw_clinical_data_col_names = sorted(COMMON_CLINICAL_DATA_COL_NAMES.values())
# Columns whose name starts with an upper-case character are treated as multiplex columns.
raw_multiplex_data_col_names = [i for i in whole_raw_df.columns if i[0].isupper()]
raw_value_col_names = raw_multiplex_data_col_names + raw_clinical_data_col_names

# Plain column assignment replaces the columns, so they end up with float dtype;
# `.loc[:, cols] = ...` writes in place on pandas >= 2.0 and would keep object dtype.
# Series.map is used per column because DataFrame.applymap is deprecated.
whole_raw_df[raw_value_col_names] = (
    whole_raw_df[raw_value_col_names]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .apply(lambda column: column.map(fix_exponentials))
    .astype(float)
)

# Missing labels are dropped before sorting: NaN cannot be ordered against strings.
group_names = sorted(set(whole_raw_df["group"].dropna().values))
obesity_subgroup_names = sorted(set(whole_raw_df["subgroup"].dropna().values))

In [4]:
def prepare_df(data: pd.DataFrame, value_col_names: list):
    """
    Select the value columns of a table and drop those holding no non-zero data.

    Zeros are treated as missing only to find the columns that are empty throughout;
    in the surviving columns the gaps are filled with zeros again.

    :param data: source table
    :param value_col_names: candidate columns; names absent from `data` are ignored
    :return: table of the shared columns (in sorted order) that hold at least one non-zero value
    """
    shared_col_names = np.intersect1d(data.columns, value_col_names)
    selected = data.loc[:, shared_col_names]
    zeros_as_missing = selected.replace(0, np.nan, regex=True)
    informative = zeros_as_missing.dropna(axis=1, how="all")
    return informative.fillna(0)

def export_plot(basename: str):
    """
    Save the current matplotlib figure as "<basename>.pdf", then clear and close it.

    :param basename: output path without extension; missing parent directories are created
    """
    plt.tight_layout()
    parent_dir = os.path.dirname(basename)
    # A bare file name has no directory part, and os.makedirs("") raises FileNotFoundError.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    plt.savefig("{}.pdf".format(basename), dpi=600)
    plt.clf()
    plt.close()

def build_clustermap(data: pd.DataFrame, value_col_names: list, group: str, out_dir: str):
    """
    Export the Spearman correlation matrix of a group as a TSV table and a clustered heatmap.

    :param data: table restricted to the samples of interest
    :param value_col_names: columns eligible for the correlation analysis
    :param group: label inserted into the plot title, which also serves as the file name
    :param out_dir: directory receiving "<title>.tsv" and "<title>.pdf"
    """
    heading = "Correlation between clinical and ELISA data for {}".format(group)
    out_prefix = os.path.join(out_dir, heading)
    spearman_df = prepare_df(data, value_col_names).corr(method="spearman")
    Utilities.dump_tsv(spearman_df, out_prefix + ".tsv")
    sns.set()
    plt.rcParams["figure.figsize"] = (10, 10)
    cluster_grid = sns.clustermap(spearman_df, metric="cityblock", cmap="plasma")
    cluster_grid.fig.suptitle(heading, fontsize=10, y=0.995)
    export_plot(out_prefix)

def build_pairplot(data: pd.DataFrame, value_col_names: list, group: str, out_dir: str):
    """
    Export the prepared values of a group as a TSV table and a regression pair plot.

    :param data: table restricted to the samples of interest
    :param value_col_names: columns eligible for plotting
    :param group: label inserted into the plot title, which also serves as the file name
    :param out_dir: directory receiving "<title>.tsv" and "<title>.pdf"
    """
    heading = "Pair plot for clinical and ELISA data for {}".format(group)
    out_prefix = os.path.join(out_dir, heading)
    values_df = prepare_df(data, value_col_names)
    Utilities.dump_tsv(values_df, out_prefix + ".tsv")
    sns.set()
    plt.rcParams["figure.figsize"] = (20, 20)
    pair_grid = sns.pairplot(data=values_df, kind="reg", vars=values_df.columns)
    pair_grid.fig.suptitle(heading, fontsize=10, y=0.997)
    export_plot(out_prefix)

In [5]:
# For every top-level group: export the correlation clustermap first, then the pair plot.
for group_name in group_names:
    group_df = whole_raw_df.query("group == '{}'".format(group_name))
    for build_plot in (build_clustermap, build_pairplot):
        build_plot(data=group_df, value_col_names=raw_value_col_names,
                   group=group_name, out_dir=correlation_dir)

In [6]:
# Same two exports (clustermap, then pair plot) for every obesity subgroup.
for obesity_subgroup_name in obesity_subgroup_names:
    obesity_subgroup_df = whole_raw_df.query("subgroup == '{}'".format(obesity_subgroup_name))
    plot_kwargs = dict(data=obesity_subgroup_df, value_col_names=raw_value_col_names,
                       out_dir=correlation_dir, group=obesity_subgroup_name)
    build_clustermap(**plot_kwargs)
    build_pairplot(**plot_kwargs)