# Pancreatogenic (pancreoprivic, type 3c) diabetes mellitus vs type 1 diabetes

### Libraries

In [None]:
# pd.options.display.max_columns= 999

In [None]:
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

In [None]:
from skbio.stats.ordination import pcoa

### Data loading

In [None]:
# Load the merged patient table. Its "Probennummer" column is renamed to
# "sample_information" below and used as the key for joining sample info.
df = pd.read_csv(
    "/data/projects/2024/Effenberger-Diabetes/data/PDM merged 3.0_modified.csv"
)

In [None]:
# Load the per-sample sheet; the columns used later are "SampleInformation",
# "IMGM ID" and "Type".
sample_info = pd.read_csv(
    "/data/projects/2024/Effenberger-Diabetes/data/20011/20011_SampleInfo.csv"
)

In [None]:
# Harmonise the sample-label column name so both tables share one merge key.
df.rename(columns={"Probennummer": "sample_information"}, inplace=True)

In [None]:
# Same key name as in `df` ("sample_information") for the merge that follows.
sample_info.rename(columns={"SampleInformation": "sample_information"}, inplace=True)

In [None]:
# Left join: keep every row of the patient table and attach the sequencing ID
# ("IMGM ID") and group label ("Type") wherever the sample label matches.
df = pd.merge(
    df,
    sample_info.loc[:, ["sample_information", "IMGM ID", "Type"]],
    how="left",
    on="sample_information",
)

In [None]:
# Load the QIIME 2 bar-plot export at taxonomic level 6.
# NOTE(review): presumably one row per sample and one column per taxon - confirm.
file_path = "/data/projects/2024/Effenberger-Diabetes/out/nf_core_ampliseq_003/qiime2/barplot/level-6.csv"
df_tax = pd.read_csv(file_path)

In [None]:
# Use the "index" column as row labels; these labels are later copied into
# an "id" column and used to join the taxonomy table with the metadata.
df_tax.set_index("index", inplace=True)

In [None]:
# Clinical / derived columns that must never be treated as taxa. Names that
# are absent from df_tax are ignored (errors='ignore').
exclude_cols = [
    'sample_information', 'age', 'KHK1', 'KHK2', 'CA1', 'CA2',
    'HbA1C (DCCT/NGSP)1', 'HbA1C (DCCT/NGSP)2', 'Glukose1', 'Glukose2',
    'BMI1', 'BMI2', 'Pankreatektomie', 'HbA1C_diff', 'Glukose_diff',
    'BMI_diff', 'KHK_diff', 'CA_diff'
]

df_tax_bacteria = df_tax.drop(columns=exclude_cols, errors='ignore')

# Mean-abundance cut-off separating "low" from "high" abundance taxa.
threshold = 0.1

# numeric_only=True: any non-numeric column still present (e.g. a metadata
# column exported together with the counts) would make mean() raise a
# TypeError on pandas >= 2.0. Older pandas dropped such columns silently,
# which is the behaviour made explicit here.
mean_abundance = df_tax_bacteria.mean(axis=0, numeric_only=True)

low_abundance_taxa = mean_abundance[mean_abundance < threshold].index

df_low_abundance = df_tax_bacteria[low_abundance_taxa]

high_abundance_taxa = mean_abundance[mean_abundance >= threshold].index

df_high_abundance = df_tax_bacteria[high_abundance_taxa]

In [None]:
file_path = "/data/projects/2024/Effenberger-Diabetes/out/nf_core_ampliseq_003/qiime2/diversity/alpha_diversity/shannon_vector/metadata.tsv"
# Read the tab-separated Shannon vector and discard its first data row.
# NOTE(review): presumably the QIIME 2 "#q2:types" directive row - confirm.
shannon_entropy = pd.read_csv(file_path, sep="\t").iloc[1:]

In [None]:
file_path = "/data/projects/2024/Effenberger-Diabetes/out/nf_core_ampliseq_003/qiime2/diversity/beta_diversity/bray_curtis_distance_matrix-condition/raw_data.tsv"
# Read the Bray-Curtis export and drop its first column.
# NOTE(review): presumably a row-number / label column - confirm.
bray_curtis = pd.read_csv(file_path, sep="\t").iloc[:, 1:]

In [None]:
file_path = "/data/projects/2024/Effenberger-Diabetes/out/nf_core_ampliseq_003/qiime2/diversity/beta_diversity/jaccard_distance_matrix-condition/raw_data.tsv"
# Read the Jaccard export and drop its first column.
# NOTE(review): presumably a row-number / label column - confirm.
jaccard = pd.read_csv(file_path, sep="\t").iloc[:, 1:]

In [None]:
# Quick visual check of the first rows of the Jaccard table.
jaccard.head()

In [None]:
# Quick visual check of the first rows of the Bray-Curtis table.
bray_curtis.head()

### Data cleaning

#### Metadata contains clinical information

In [None]:
# Despite its name this is a DataFrame: the clinical / identifier columns
# selected from the merged patient table.
metadata_cols = df.loc[
    :,
    [
        # identifiers and study group
        "IMGM ID",
        "sample_information",
        "Type",
        # clinical measurements; the suffixes 1 / 2 mark the two time points
        "age",
        "KHK1",
        "KHK2",
        "CA1",
        "CA2",
        "HbA1C (DCCT/NGSP)1",
        "HbA1C (DCCT/NGSP)2",
        "Glukose1",
        "Glukose2",
        "BMI1",
        "BMI2",
        "Pankreatektomie",
        "sex",
        "Insulin1",
        "Insulin2",
        "MASLD1",
        "MASLD2",
        "nikotin",
    ],
]

In [None]:
# Keep only rows that have a sequencing ID. The explicit copy makes `metadata`
# an independent frame, so the column assignments in the following cells
# cannot trigger SettingWithCopyWarning or write into a view.
metadata = metadata_cols.dropna(subset=["IMGM ID"]).copy()

In [None]:
def _direction_of_change(before, after):
    """Label the change between two measurement columns row by row.

    Returns a Series aligned with the inputs holding "increase" where
    ``after - before > 0`` and "decrease" otherwise. Rows where either
    measurement is missing stay missing instead of being reported as
    "decrease" (``NaN > 0`` is False, which silently mislabelled them before).
    """
    delta = after - before
    labels = pd.Series(
        np.where(delta > 0, "increase", "decrease"),
        index=delta.index,
        dtype=object,
    )
    # NOTE(review): an unchanged value (delta == 0) is still labelled
    # "decrease", as in the original logic - confirm this is intended.
    return labels.where(delta.notna())


metadata["HbA1C_diff"] = _direction_of_change(
    metadata["HbA1C (DCCT/NGSP)1"], metadata["HbA1C (DCCT/NGSP)2"]
)
metadata["Glukose_diff"] = _direction_of_change(
    metadata["Glukose1"], metadata["Glukose2"]
)
metadata["BMI_diff"] = _direction_of_change(metadata["BMI1"], metadata["BMI2"])

In [None]:
def categorize_diff(before, after):
    """Classify a "ja"/"nein" finding across two time points.

    Args:
        before: value at the first time point.
        after: value at the second time point.

    Returns:
        "onset" (nein -> ja), "absent" (nein -> nein), "resolved" (ja -> nein),
        "persistent" (ja -> ja), or "unknown" for any other combination.
    """
    transitions = {
        ("nein", "ja"): "onset",
        ("nein", "nein"): "absent",
        ("ja", "nein"): "resolved",
        ("ja", "ja"): "persistent",
    }
    return transitions.get((before, after), "unknown")


# Row-wise transition label (onset / absent / resolved / persistent / unknown)
# for the KHK and CA findings, from their first and second time points.
metadata["KHK_diff"] = [
    categorize_diff(first, second)
    for first, second in zip(metadata["KHK1"], metadata["KHK2"])
]
metadata["CA_diff"] = [
    categorize_diff(first, second)
    for first, second in zip(metadata["CA1"], metadata["CA2"])
]

In [None]:
# "id" is the key used by every later merge (taxonomy, diversity tables).
metadata.rename(columns={"IMGM ID": "id"}, inplace=True)

In [None]:

# Split the metadata by sample label. Missing labels never match (na=False).
# NOTE(review): "K" is matched anywhere in the label, so any label that
# merely contains a "K" also lands in metadata_k - confirm this is intended.
metadata_k = metadata.loc[
    metadata["sample_information"].str.contains("K", na=False)
]
# str.match anchors at the start of the label, so "PDM..." is not selected here.
metadata_dm = metadata.loc[
    metadata["sample_information"].str.match("DM", na=False)
]
metadata_pdm = metadata.loc[
    metadata["sample_information"].str.contains("PDM", na=False)
]

In [None]:
# One row per sequencing ID in each subset; the first occurrence is kept
# (keep="first" is the pandas default).
metadata_k = metadata_k.drop_duplicates(subset="id")
metadata_dm = metadata_dm.drop_duplicates(subset="id")
metadata_pdm = metadata_pdm.drop_duplicates(subset="id")

In [None]:
# One row per sequencing ID in the full metadata table; first occurrence wins.
metadata = metadata.drop_duplicates(subset=["id"], keep="first")

#### Microbial data contains taxonomic information from QIIME 2

In [None]:
# `metadata_cols` is a DataFrame, not a list of names. Passing it directly as
# `columns=` only worked because iterating a DataFrame yields its column
# labels; pass the labels explicitly. Labels that are not present are ignored.
microbial_data = df_high_abundance.drop(
    columns=metadata_cols.columns, errors="ignore"
)

In [None]:
# Expose the row labels as a regular "id" column so the table can be merged
# with the metadata on "id".
microbial_data["id"] = microbial_data.index

In [None]:
def extract_species_name(taxonomy):
    """Return the deepest informative rank of a ';'-separated taxonomy string.

    Normally this is the last field (e.g. ``"...;g__Blautia"`` gives
    ``"g__Blautia"``). When the trailing fields are empty or bare rank
    placeholders such as ``"__"`` or ``"g__"`` (unclassified at that level),
    the nearest classified parent rank is returned instead, so different
    unclassified taxa do not collapse onto the same column name.

    Args:
        taxonomy: taxonomy string with ranks separated by ';'.

    Returns:
        The stripped name of the deepest informative rank, or the stripped
        last field if no rank is informative (e.g. a plain label like "id").
    """
    ranks = [rank.strip() for rank in taxonomy.split(";")]
    for rank in reversed(ranks):
        # Skip empty fields and bare rank prefixes ("__", "g__", "s__", ...).
        if rank and not (rank.endswith("__") and len(rank) <= 3):
            return rank
    return ranks[-1]

In [None]:
# Shorten every column label to the name returned by extract_species_name;
# rename() applies the function to each label in turn.
microbial_data.rename(columns=extract_species_name, inplace=True)

In [None]:
# NOTE(review): this looks like a no-op - "index" was turned into the row
# index of df_tax earlier and "id" was already added as a column. Confirm
# before removing.
microbial_data.rename(columns={"index": "id"}, inplace=True)

In [None]:
# Normalise column labels on both sides before joining.
microbial_data.columns = microbial_data.columns.str.strip()
metadata.columns = metadata.columns.str.strip()

print("microbial_data columns:", microbial_data.columns)
print("metadata columns:", metadata.columns)

# Only join when both tables expose the "id" key; otherwise just report it.
if not ("id" in microbial_data.columns and "id" in metadata.columns):
    print("'id' column not found in one or both DataFrames.")
else:
    # Left join: every taxonomy row is kept, clinical columns are attached.
    microbial_data = pd.merge(
        microbial_data,
        metadata[
            [
                "id",
                "Type",
                "sample_information",
                "age",
                "KHK1",
                "KHK2",
                "CA1",
                "CA2",
                "HbA1C (DCCT/NGSP)1",
                "HbA1C (DCCT/NGSP)2",
                "Glukose1",
                "Glukose2",
                "Pankreatektomie",
                "BMI1",
                "BMI2",
            ]
        ],
        how="left",
        on="id",
    )

In [None]:
# Integer code per "Pankreatektomie" category, numbered in listing order
# (0 = "nein" ... 3 = "Resektion").
ordinal_map = {
    label: code
    for code, label in enumerate(
        ["nein", "Teilresektion links", "Teilresektion rechts", "Resektion"]
    )
}

# Values that are not in the map become NaN.
microbial_data["Pankreatektomie_encoded"] = microbial_data["Pankreatektomie"].map(
    ordinal_map
)

In [None]:
# Snapshot taken before the textual "Pankreatektomie" column is dropped.
microbial_data_original = microbial_data.copy()

In [None]:
# Only the encoded variant ("Pankreatektomie_encoded") is kept from here on.
microbial_data = microbial_data.drop(columns=['Pankreatektomie'])

#### Boxplot - Alpha diversity - Shannon entropy

In [None]:
# Attach the study group and sample label to every Shannon value (left join
# on "id", so all diversity rows are kept).
shannon_entropy = pd.merge(
    shannon_entropy,
    metadata.loc[:, ["id", "Type", "sample_information"]],
    how="left",
    on="id",
)

In [None]:
# Quick visual check of the merged Shannon table.
shannon_entropy.head()

In [None]:
# Shorten the group labels. Assign the result back instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place
# form acts on a temporary object and does not update the DataFrame under
# pandas Copy-on-Write (it already emits a FutureWarning in pandas 2.2).
shannon_entropy["Type"] = shannon_entropy["Type"].replace(
    {"Diabetes mellitus Typ1": "DM", "pankreopriver Diabetes": "PDM"}
)

In [None]:
# Shorten the control-group label as well.
shannon_entropy["Type"] = shannon_entropy["Type"].replace("Kontrolle", "K")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import pandas as pd

# Group order on the x-axis and one fixed colour per group.
order = ["K", "DM", "PDM"]
custom_palette = {
    "K": "#1f77b4",
    "DM": "#ff7f0e",
    "PDM": "#2ca02c",
}

# Keep only rows whose Shannon value parses as a number, then cast to float.
df_clean = shannon_entropy[
    pd.to_numeric(shannon_entropy["shannon_entropy"], errors="coerce").notnull()
].copy()
df_clean["shannon_entropy"] = df_clean["shannon_entropy"].astype(float)

plt.figure(figsize=(2.6, 4.5))
ax = sns.boxplot(
    data=df_clean, x="Type", y="shannon_entropy", palette=custom_palette, order=order
)
plt.xticks(rotation=90)
plt.xlabel("")
plt.ylabel("Shannon Entropy")

# Pairwise two-sided Mann-Whitney U tests between the groups.
comparisons = [("K", "DM"), ("K", "PDM"), ("PDM", "DM")]
p_values = []

for group1, group2 in comparisons:
    y1 = df_clean[df_clean["Type"] == group1]["shannon_entropy"]
    y2 = df_clean[df_clean["Type"] == group2]["shannon_entropy"]
    stat, p = mannwhitneyu(y1, y2, alternative="two-sided")
    p_values.append(p)

# Benjamini-Hochberg correction across the three comparisons.
padj = multipletests(p_values, method='fdr_bh')[1]


y_max = df_clean["shannon_entropy"].max()
h = 0.2  # vertical spacing between stacked significance brackets
# Annotate with the ADJUSTED p-values. Previously `padj` was computed but the
# stars were drawn from the raw `p_values`, overstating significance.
for i, ((group1, group2), p) in enumerate(zip(comparisons, padj)):
    x1, x2 = order.index(group1), order.index(group2)

    if p < 0.001:
        label = "***"
    elif p < 0.01:
        label = "**"
    elif p < 0.05:
        label = "*"
    else:
        label = ""

    # Each bracket sits `h` above the previous one, starting at the data max.
    y_line = y_max + h * i
    ax.plot(
        [x1, x1, x2, x2],
        [y_line, y_line + 0.1, y_line + 0.1, y_line],
        lw=0.7,
        c="black",
    )
    ax.text((x1 + x2) / 2, y_line + 0.1, label, ha="center", va="bottom", fontsize=8)

plt.tight_layout()

#plt.savefig("/data/scratch/kvalem/projects/2024/diabetes_microbe/05-results/figures/alpha_shannon_pvalue.svg")
#plt.savefig("/data/scratch/kvalem/projects/2024/diabetes_microbe/05-results/figures/alpha_shannon_pvalue.png")
plt.show()


#### Boxplot - Alpha diversity - Simpson diversity

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the exported Simpson alpha-diversity table; the first column becomes
# the row index.
simpson = pd.read_csv('/data/projects/2024/Effenberger-Diabetes/out/nf_core_ampliseq_003/qiime2/abundance_tables/exported_simpson/alpha-diversity.tsv', sep='\t', index_col=0)


In [None]:
# Name the row index.
# NOTE(review): the same labels are copied into "id" next and merged against
# metadata["id"], so they appear to be sequencing IDs rather than sample
# labels - confirm the intended name.
simpson.index.name = 'sample_information'

In [None]:
# Expose the row labels as an "id" column for merging with the metadata.
simpson["id"] = simpson.index

In [None]:
# Inner join (the pandas default): only samples present in both the Simpson
# table and the metadata are kept.
merged = simpson.merge(metadata.loc[:, ["id", "Type"]], on="id")

In [None]:
# Shorten the group labels in a single pass and assign the result back.
# The previous chained replace(..., inplace=True) calls acted on a temporary
# column object (ineffective under pandas Copy-on-Write, FutureWarning in
# pandas 2.2) and every call was duplicated.
merged["Type"] = merged["Type"].replace(
    {
        "Diabetes mellitus Typ1": "DM",
        "pankreopriver Diabetes": "PDM",
        "Kontrolle": "K",
    }
)

In [None]:
# Group order on the x-axis and one fixed colour per group.
order = ["H", "T1DM", "T3cDM"]
custom_palette = {
    "H": "#1f77b4",
    "T1DM": "#ff7f0e",
    "T3cDM": "#2ca02c",
}

# Switch from the short internal labels to the labels shown in the figure.
merged["Type"] = merged["Type"].replace({
    "K": "H",
    "DM": "T1DM",
    "PDM": "T3cDM"
})

plt.figure(figsize=(2.6, 4.5))
ax = sns.boxplot(
    data=merged, x="Type", y="simpsons", palette=custom_palette, order=order
)
plt.xticks(rotation=90)
plt.xlabel("")
plt.ylabel("Simpson diversity")

comparisons = [("H", "T1DM"), ("H", "T3cDM"), ("T3cDM", "T1DM")]
y_max = merged["simpsons"].max()
# NOTE(review): the bracket spacing (0.2) and height (0.1) match the Shannon
# plot; confirm they suit the value range of this index.
h = 0.2

# Run all pairwise two-sided Mann-Whitney U tests first so that the p-values
# can be corrected together.
p_values = []
for group1, group2 in comparisons:
    y1 = merged[merged["Type"] == group1]["simpsons"]
    y2 = merged[merged["Type"] == group2]["simpsons"]

    stat, p = mannwhitneyu(y1, y2, alternative="two-sided")
    p_values.append(p)

# Benjamini-Hochberg correction across the three comparisons, matching the
# correction computed for the Shannon plot. Without it the stars overstate
# significance.
padj = multipletests(p_values, method="fdr_bh")[1]

for i, ((group1, group2), p) in enumerate(zip(comparisons, padj)):
    x1, x2 = order.index(group1), order.index(group2)

    if p < 0.001:
        star = "***"
    elif p < 0.01:
        star = "**"
    elif p < 0.05:
        star = "*"
    else:
        star = ""

    # Each bracket sits `h` above the previous one, starting at the data max.
    y_line = y_max + h * i
    ax.plot(
        [x1, x1, x2, x2],
        [y_line, y_line + 0.1, y_line + 0.1, y_line],
        lw=0.7,
        c="black",
    )
    ax.text((x1 + x2) / 2, y_line + 0.1, star, ha="center", va="bottom", fontsize=8)


#plt.savefig("/data/scratch/kvalem/projects/2024/diabetes_microbe/05-results/figures/alpha_simpson_pvalue.svg")
#plt.savefig("/data/scratch/kvalem/projects/2024/diabetes_microbe/05-results/figures/alpha_simpson_pvalue.png")

plt.tight_layout()
plt.show()


#### Boxplot - Alpha diversity - Chao1 index

In [None]:
import pandas as pd
from skbio.diversity.alpha import chao1

In [None]:
# Load the DADA2 ASV count table (tab-separated); it carries an "ASV_ID" column.
otu_table = pd.read_csv("/data/projects/2024/Effenberger-Diabetes/out/nf_core_ampliseq_003/dada2/ASV_table.tsv", sep = "\t")

In [None]:
# Use the ASV identifier as the row label.
otu_table.set_index("ASV_ID", inplace = True)

In [None]:
# Transpose so that ASVs become columns.
# NOTE(review): assumes every remaining column was a sample, giving one row
# per sample - confirm.
otu_table = otu_table.T

In [None]:
# Apply the Chao1 estimator to each row of the transposed table (axis=1).
chao1_df = otu_table.apply(chao1, axis=1)

In [None]:
# Turn the per-row Chao1 Series into a two-column frame: the former row
# labels become "id", the estimates become "chao1".
chao1_df = chao1_df.reset_index()
chao1_df.columns = ["id", "chao1"]

In [None]:
# Keep only rows whose Chao1 value parses as a number. The copy lets later
# column assignments work on an independent frame.
df_clean = chao1_df.loc[
    pd.to_numeric(chao1_df["chao1"], errors="coerce").notna()
].copy()

In [None]:
# Ensure the estimates are floats for plotting and testing.
df_clean["chao1"] = df_clean["chao1"].astype(float)


In [None]:
# NOTE(review): both names are reassigned with the H / T1DM / T3cDM labels in
# the plotting cell below before they are used, so this cell appears to have
# no effect on the Chao1 figure - confirm before removing.
order = ["K", "DM", "PDM"]
custom_palette = {
    "K": "#1f77b4",
    "DM": "#ff7f0e",
    "PDM": "#2ca02c",
}

In [None]:
# Attach the study group to every Chao1 value (left join on "id", so all
# diversity rows are kept).
df_clean = pd.merge(
    df_clean,
    microbial_data.loc[:, ["id", "Type"]],
    how="left",
    on="id",
)


In [None]:
# Shorten the group labels; the nested mapping restricts the replacement to
# the "Type" column.
df_clean = df_clean.replace(
    {
        "Type": {
            "pankreopriver Diabetes": "PDM",
            "Diabetes mellitus Typ1": "DM",
            "Kontrolle": "K",
        }
    }
)


In [None]:
# Group order on the x-axis and one fixed colour per group.
order = ["H", "T1DM", "T3cDM"]
custom_palette = {
    "H": "#1f77b4",
    "T1DM": "#ff7f0e",
    "T3cDM": "#2ca02c",
}


# Switch from the short internal labels to the labels shown in the figure.
df_clean["Type"] = df_clean["Type"].replace({
    "K": "H",
    "DM": "T1DM",
    "PDM": "T3cDM"
})

plt.figure(figsize=(2.6, 4.5))

ax = sns.boxplot(
    data=df_clean, x="Type", y="chao1", palette=custom_palette, order=order
)
plt.xticks(rotation=90)
plt.xlabel("")
plt.ylabel("Chao1 Diversity")


comparisons = [("H", "T1DM"), ("H", "T3cDM"), ("T3cDM", "T1DM")]
y_max = df_clean["chao1"].max()
h = 10  # vertical spacing between stacked significance brackets

# Run all pairwise two-sided Mann-Whitney U tests first so that the p-values
# can be corrected together.
p_values = []
for group1, group2 in comparisons:
    y1 = df_clean[df_clean["Type"] == group1]["chao1"]
    y2 = df_clean[df_clean["Type"] == group2]["chao1"]

    stat, p = mannwhitneyu(y1, y2, alternative="two-sided")
    print(p)  # raw (unadjusted) p-value
    p_values.append(p)

# Benjamini-Hochberg correction across the three comparisons, matching the
# correction computed for the Shannon plot. Without it the stars overstate
# significance.
padj = multipletests(p_values, method="fdr_bh")[1]

for i, ((group1, group2), p) in enumerate(zip(comparisons, padj)):
    x1, x2 = order.index(group1), order.index(group2)

    if p < 0.001:
        star = "***"
    elif p < 0.01:
        star = "**"
    elif p < 0.05:
        star = "*"
    else:
        star = ""

    # Each bracket sits `h` above the previous one, starting at the data max.
    y_line = y_max + h * i
    ax.plot([x1, x1, x2, x2], [y_line, y_line + 2, y_line + 2, y_line], lw=0.7, c="black")
    ax.text((x1 + x2) / 2, y_line + 2.5, star, ha="center", va="bottom", fontsize=8)

plt.tight_layout()

#plt.savefig("/data/scratch/kvalem/projects/2024/diabetes_microbe/05-results/figures/alpha_chao1_pvalue.svg")
#plt.savefig("/data/scratch/kvalem/projects/2024/diabetes_microbe/05-results/figures/alpha_chao1_pvalue.png")
plt.show()
