# Read the Metadata and explore it!

In [1]:
import os
import sys
from socket import gethostname

# this is for ffmpeg
os.environ['PATH'] += os.pathsep + os.path.expanduser('~/miniconda3/envs/bioinformatics/bin')

hostname = gethostname()

if hostname.startswith('hpc-node'):
    IN_DEEPTHOUGHT = True
    sys.path.append('..')
else:
    IN_DEEPTHOUGHT = False
from cf_analysis_lib.load_libraries import *
import cf_analysis_lib

sequence_type = "MGI"
datadir = '..'
sslevel = 'subsystems_norm_ss.tsv.gz'
taxa = "family"

df, metadata = cf_analysis_lib.read_the_data(sequence_type, datadir, sslevel='subsystems_norm_ss.tsv.gz', taxa="family")

Using ../MGI/FunctionalAnalysis/subsystems/MGI_subsystems_norm_ss.tsv.gz for the subsystems


In [2]:
for c in metadata.columns:
    if 'pseudo' in c.lower():
        print(c)

Pseudomonas Culture
CS_Pseudomonas aeruginosa
CS_MDR_Pseudomonas aeruginosa
P12M_2_Pseudomonas aeruginosa
N12M_Pseudomonas aeruginosa
N12M_Pseudomonas aeruginosa (multi-resistant)
O_Pseudomonas aeruginosa (multi-resistant)
O_Pseudomonas aeruginosa (non-mucoid)
O_Pseudomonas aeruginosa (mucoid)


In [30]:
for c in metadata.columns:
    if 'n12m' in c.lower():
        print(c)

N12M_Pseudomonas aeruginosa
N12M_Pseudomonas aeruginosa (multi-resistant)
N12M_mucoid
N12M_non_mucoid
N12M_MAC
N12M_M intracellulare
N12M_Candida albicans
N12M_Achromobacter xylosoxidans
N12M_Aspergillus fumigatus
N12M_Stenotrophomonas maltophilia
N12M_Staphylococcus aureus
N12M_M gordonae
N12M_M chelonae


In [16]:
# for this purpose we are going to consider NaN as 0. There are two samples with Nan in Pseudomonas culture: '676138_20180405_S', '770560_20181218_S'
metadata[['Pseudomonas Culture', 'CS_Pseudomonas aeruginosa']] = metadata[['Pseudomonas Culture', 'CS_Pseudomonas aeruginosa']].fillna(0)

In [18]:
pseudosamps = metadata[(metadata['Pseudomonas Culture'] == 1) | (metadata['CS_Pseudomonas aeruginosa'] == 1)].shape[0]
npseudo = len(metadata[(metadata['Pseudomonas Culture'] == 1) | (metadata['CS_Pseudomonas aeruginosa'] == 1)]['pwCF_ID'].unique())
samps = metadata.shape[0]
n = len(metadata['pwCF_ID'].unique())
print(f"In our cohort, Pseudomonas was cultured from {pseudosamps} samples from {npseudo} pwCF, whereas it was not cultured from {samps-pseudosamps} samples from", end=" ")
print(f"{n-npseudo} pwCF. Eight pwCF had some samples where Pseudomonas was not cultured, and others in which it was.")

In our cohort, Pseudomonas was cultured from 36 samples from 28 pwCF, whereas it was not cultured from 91 samples from 36 pwCF. Eight pwCF had some samples where Pseudomonas was not cultured, and others in which it was.


In [25]:
pseudo_positive_idx = set(metadata[(metadata['Pseudomonas Culture'] == 1) | (metadata['CS_Pseudomonas aeruginosa'] == 1)].index)
pseudo_negative_idx = set(metadata[(metadata['Pseudomonas Culture'] == 0) & (metadata['CS_Pseudomonas aeruginosa'] == 0)].index)
print(f"There are {len(pseudo_positive_idx)} Pseudomonas-positive samples, and {len(pseudo_negative_idx)} Pseudomonas negative samples ({len(pseudo_positive_idx)+len(pseudo_negative_idx)} total)")

There are 36 Pseudomonas-positive samples, and 91 Pseudomonas negative samples (127 total)


In [24]:
pseudo_positive = set(metadata[(metadata['Pseudomonas Culture'] == 1) | (metadata['CS_Pseudomonas aeruginosa'] == 1)]['pwCF_ID'])
pseudo_negative = set(metadata[(metadata['Pseudomonas Culture'] == 0) & (metadata['CS_Pseudomonas aeruginosa'] == 0)]['pwCF_ID'])
print(f"There are {len(pseudo_positive)} Pseudomonas-positive pwCF, and {len(pseudo_negative)} Pseudomonas negative pwCF ({len(pseudo_positive)+len(pseudo_negative)} total)")
print(f"There are {len(metadata['pwCF_ID'].unique())} people in total")
print(f"The following people are positive sometimes and negative sometimes: {pseudo_positive.intersection(pseudo_negative)}")

There are 28 Pseudomonas-positive pwCF, and 44 Pseudomonas negative pwCF (72 total)
There are 64 people in total
The following people are positive sometimes and negative sometimes: {748160, 788707, 698564, 676138, 650003, 658355, 715927, 748699}


# who was baseline negative but converted in N12M

In [35]:
ps_neg_n12m = metadata[metadata.index.isin(pseudo_negative_idx) & (metadata['N12M_Pseudomonas aeruginosa'] == 1)]
people_pnn = metadata[metadata.index.isin(pseudo_negative_idx) & (metadata['N12M_Pseudomonas aeruginosa'] == 1)]['pwCF_ID'].unique()
print(f"There are {ps_neg_n12m.shape[0]} samples that are Pseudomonas negative and become positive in the next 12 months from {len(people_pnn)} unique pwCF")

There are 36 samples that are Pseudomonas negative and become positive in the next 12 months from 18 unique pwCF


In [36]:
ps_neg_always = metadata[metadata.index.isin(pseudo_negative_idx) & (metadata['N12M_Pseudomonas aeruginosa'] == 0)]
people_pna = metadata[metadata.index.isin(pseudo_negative_idx) & (metadata['N12M_Pseudomonas aeruginosa'] == 0)]['pwCF_ID'].unique()
print(f"There are {ps_neg_always.shape[0]} samples from {len(people_pna)} unique pwCF") 

There are 38 samples from 20 unique pwCF
