# Boxplots

(c) 2023-2024 Nick Falk, Rob Edwards

These plots are based on Nick's boxplot data, but they use the "new" taxonomy data format, which is the same for MGI and MinION.


In [6]:
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.stats.multitest import multipletests

In [7]:
from cf_analysis_lib import read_taxonomy, read_metadata

In [8]:
# Notebook-level settings handed to the cf_analysis_lib loaders.
datadir = ".."  # relative path passed to the loaders as the data directory
sequence_type = "MGI"
taxa = "genus"
# Load the taxonomy table for the settings above and preview the first rows.
# NOTE(review): the layout of the returned table is not visible here; a later
# cell transposes it before merging on the index — confirm taxa are rows.
genus_otu = read_taxonomy(datadir, sequence_type, taxa)
genus_otu.head(3)

In [9]:
# Transformation step: replace every value in the genus table by its square root.
sqrt_genus = np.sqrt(genus_otu)

# Swap rows and columns so that the table's index can be matched against the
# metadata index when the two tables are merged further down.
genus_otu2 = sqrt_genus.transpose()
genus_otu2.head()

In [12]:
# Same sequence type value as the one used when loading the taxonomy table.
sequence_type = 'MGI'
# Load the metadata table; its index is the join key used by the merge below.
metadata = read_metadata(datadir, sequence_type)
metadata.head(3)

In [14]:
# Join the metadata with the transposed taxa table on their shared index labels
# (default inner join, so only labels present in both tables are kept).
# The metadata column 'Pseudomonas' is renamed first, presumably so that it
# stays distinct from the genus column of the same name in the taxa table.
merged1 = (
    metadata
    .rename(columns={'Pseudomonas': 'Pseudomonas culturing'})
    .merge(genus_otu2, left_index=True, right_index=True)
)
merged1.head()

In [15]:
# Culture-result metadata column -> genus column whose abundance it is compared with.
# NOTE(review): 'Stenophotomonas' and the double space in 'Staphylococcus  aureus'
# look like typos but presumably mirror the metadata column headers exactly —
# verify against the metadata before "fixing" them.
culture_pairs = {
    'CS_Pseudomonas aeruginosa': 'Pseudomonas',
    'CS_Stenophotomonas maltophilia': 'Stenotrophomonas',
    'NTM': 'Mycobacterium',
    'CS_Mycobacteroides abscessus': 'Mycobacteroides',
    'CS_Mycobacterium intracellulare': 'Mycobacterium',
    'CS_Staphylococcus  aureus': 'Staphylococcus',
    'CS_Achromobacter xylosoxidans': 'Achromobacter',
    'CS_Burkholderia cepacia': 'Burkholderia',
    'CS_Haemophilus influenzae': 'Haemophilus',
}

# The culture columns, in plotting order; dicts preserve insertion order, so
# this is exactly the key sequence written above.
culture_columns = list(culture_pairs)



In [16]:
# Step 2: Create Boxplots
def create_boxplot(data, culture_taxa, taxa, ax=None):
    """Plot the abundance of one taxon split by a culture-result column.

    Despite the name, the figure is a violin plot with the individual
    observations overlaid as a black strip plot.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both ``culture_taxa`` and ``taxa`` as columns.
    culture_taxa : str
        Column holding the culture result; rows where it is missing are
        dropped before plotting.
    taxa : str
        Column holding the abundance values drawn on the y axis.
    ax : matplotlib.axes.Axes, optional
        Axes to draw into. When omitted (the default) a new 10x6 figure is
        created, laid out with ``tight_layout`` and shown, as before. When
        given, the plot is drawn into ``ax`` and the caller keeps control of
        layout, title and display, so the function can fill one panel of a
        multi-panel figure.

    Returns
    -------
    None
    """
    subset_data = data[data[culture_taxa].notna()]

    standalone = ax is None
    if standalone:
        fig, ax = plt.subplots(figsize=(10, 6))

    sns.violinplot(x=culture_taxa, y=taxa, data=subset_data, hue=culture_taxa,
                   palette='rocket_r', alpha=0.5, legend=False, ax=ax)
    sns.stripplot(x=culture_taxa, y=taxa, data=subset_data, color='black',
                  jitter=True, alpha=0.5, legend=False, ax=ax)

    # The title is deliberately left alone: empty on a fresh figure, and the
    # caller's business when an existing Axes was passed in.
    ax.set_xlabel("")
    ax.set_ylabel("Abundance in Patient")
    # Assumes the culture column has two levels that sort as negative, then
    # positive (e.g. 0 and 1), landing on tick positions 0 and 1 — TODO confirm.
    ax.set_xticks(ticks=[0, 1], labels=["Negative Culture", "Positive Culture"])
    ax.grid(False)

    if standalone:
        fig.tight_layout()
        plt.show()

In [17]:
# Example: Pseudomonas abundance split by the P. aeruginosa culture result.
create_boxplot(
    data=merged1,
    culture_taxa="CS_Pseudomonas aeruginosa",
    taxa="Pseudomonas",
)

In [18]:
# One panel per cultured organism: violin plot of the matching genus abundance
# split by culture result, with the individual samples overlaid.
fig, ax = plt.subplots(figsize=(18, 12), nrows=3, ncols=3, sharex=False, sharey=False)

# ax.T.flat walks the grid column by column (ax[0][0], ax[1][0], ax[2][0],
# ax[0][1], ...), which keeps the panel order of the earlier nested loops.
panels = list(ax.T.flat)

# Loop names are chosen so they do not overwrite the notebook-level `taxa`
# setting, and zip() stops cleanly if there are fewer cultures than panels.
for panel, culture_col in zip(panels, culture_columns):
    genus_col = culture_pairs[culture_col]
    # Samples without a recorded culture result are left out of the panel.
    cultured = merged1[merged1[culture_col].notna()]
    sns.violinplot(x=culture_col, y=genus_col, data=cultured, hue=culture_col,
                   palette='rocket_r', alpha=0.5, ax=panel, legend=False)
    sns.stripplot(x=culture_col, y=genus_col, data=cultured, color='black',
                  jitter=True, alpha=0.5, ax=panel, legend=False)
    panel.set_xlabel("")
    panel.set_ylabel("Normalised Abundance in Patient")
    # Fixed limits keep both tick positions visible in every panel.
    panel.set_xlim(-0.5, 1.5)
    panel.set_xticks(ticks=[0, 1], labels=["Negative Culture", "Positive Culture"])
    panel.set_title(f"{culture_col.replace('CS_', '')} culture status")

# Hide any panels that did not receive a culture (none with the 9 current entries).
for panel in panels[len(culture_columns):]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()

In [19]:
# Step 3: Additional statistical tests
# Kruskal-Wallis test comparing Pseudomonas abundance between the samples whose
# P. aeruginosa culture result equals 0.0 and those where it equals 1.0.
kruskal_stat, p_value = stats.kruskal(*(
    merged1.loc[merged1['CS_Pseudomonas aeruginosa'] == status, 'Pseudomonas']
    for status in (0.0, 1.0)
))
print(f'Kruskal-Wallis Test statistic: {kruskal_stat}, p-value: {p_value}')

In [20]:
# Median Pseudomonas abundance within each P. aeruginosa culture-result group.
g__CS_Pseudomonas_aeruginosa_median = (
    merged1
    .groupby('CS_Pseudomonas aeruginosa', observed=True)['Pseudomonas']
    .median()
)
print(g__CS_Pseudomonas_aeruginosa_median)

In [21]:
# Threshold = median Pseudomonas abundance of the group labelled 1 in the
# grouped medians (the "Positive Culture" group in the plots). The following
# cell applies it to the samples labelled 0 to pick out culture-negative
# samples whose abundance exceeds it.
threshold_CS_Pseudomonas_aeruginosa = g__CS_Pseudomonas_aeruginosa_median.loc[1]
print(threshold_CS_Pseudomonas_aeruginosa)

In [22]:
# Index labels of the samples with culture result 0 whose Pseudomonas
# abundance is strictly greater than the threshold.
above_threshold_rownames_CS_Pseudomonas_aeruginosa = merged1.loc[
    merged1['CS_Pseudomonas aeruginosa'].eq(0)
    & merged1['Pseudomonas'].gt(threshold_CS_Pseudomonas_aeruginosa)
].index.to_list()
# Show the selected labels
print(above_threshold_rownames_CS_Pseudomonas_aeruginosa)

In [23]:
# Save results to CSV: one row per flagged sample label, written without the
# DataFrame index. The relative filename means the file is created in the
# notebook's current working directory.
# NOTE(review): the single column is unnamed, so pandas presumably writes its
# default label ("0") as the header line — confirm downstream readers expect it.
pd.DataFrame(above_threshold_rownames_CS_Pseudomonas_aeruginosa).to_csv("CS_Pseudomonas.aeruginosa.csv", index=False)