# Metadata Explorer

A notebook to explore the metadata so we know where to look for things!

### Import some libraries

In [1]:
import os
import sys
from socket import gethostname

hostname = gethostname()

import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
import matplotlib.colors as mcolors
import matplotlib.dates as mdates
from matplotlib.colors import ListedColormap
import pandas as pd
import seaborn as sns
import json

from itertools import cycle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error

from scipy.stats import linregress


# sklearn's StandardScaler emits a FutureWarning that clutters the output. The filter below silences all FutureWarnings.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

try:
  import google.colab
  IN_COLAB = True
  !pip install adjustText
  from google.colab import drive
  drive.mount('/content/drive')
  datadir = '/content/drive/MyDrive/Projects/CF/Adelaide/CF_Data_Analysis'
except ImportError:
  IN_COLAB = False
  datadir = '..'

from adjustText import adjust_text

# Flag whether we are running on an 'hpc-node*' host; on those hosts the parent
# directory is added to the module search path before the local import below.
IN_DEEPTHOUGHT = hostname.startswith('hpc-node')
if IN_DEEPTHOUGHT:
    sys.path.append('..')
import cf_analysis_lib

### Read the data

In [2]:
# Read the metadata table for the 'MGI_minion' sequence type and preview
# its first five rows.
sequence_type = 'MGI_minion'
metadata = cf_analysis_lib.read_metadata(datadir, sequence_type)
metadata.head()

Unnamed: 0_level_0,minion,MGI,pwCF_ID,Sample date,IP vs OP,Hospital,Room,Age,Age groups,Paediatric vs Adult,...,Sum of meds,Sum of antifungals,Sum of steroid + mabs,DNA_extraction_ conc,SAGC ULN,DNA Conc. (ng/ul),Index I7,Index I5,Mean_Size_BP,Total Clusters Passing Filter (Million)
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
634207_20180510_S,,634207_20180510_S,634207,5/10/2018,IP,WCH,Adol Rm9,17,3,Paediatric,...,1,0,0,0.0,SAGCFN_22_01856,7.82,CGGACGATTC,CCACCACCTA,651,2.9
634207_20180517_S,,634207_20180517_S,634207,5/17/2018,IP,WCH,Adol Rm9,17,3,Paediatric,...,1,0,0,0.134,SAGCFN_22_01827,22.8,AGCGATAG,CCTATCCT,633,2.4
715927_20180205_S,715927_20180205_S,715927_20180205_S,715927,2/05/2018,OP,WCH,Level 6 DK Office,13,3,Paediatric,...,1,0,0,0.326,SAGCFN_22_01797,16.5,TAATGCGC,AGGCGAAG,516,3.4
715927_20180213_S,,715927_20180213_S,715927,2/13/2018,IP,WCH,Adol Room 11,13,3,Paediatric,...,3,0,0,0.234,SAGCFN_22_01811,31.0,TCCGCGAA,CCTATCCT,443,2.7
715927_20180226_S,,715927_20180226_S,715927,2/26/2018,OP,WCH,OPD 8,13,3,Paediatric,...,2,0,0,0.108,SAGCFN_22_01833,15.1,TAACTTGGTC,GATTCACGAC,510,2.6


In [17]:
# Print every metadata column name, sorted, each wrapped in double quotes and
# separated by "; ".
t = '"; "'.join(sorted(metadata.columns))
print('"' + t + '"')

"1 Cephalexin_PO"; "1 Flucloaxcillin_PO"; "1 Itraconazole (Lozenoc)_PO"; "1 Sulfamethoxazole_trimethoprim (Bactrim)_PO"; "2 Amikacin_INH"; "2 Amoxicillin & Potassium clavulanate (Aug Duo)_PO"; "2 Amphotericin B (Ambisome)_INH"; "2 Azithromycin_PO"; "2 Ceftazidime_INH"; "2 Ciprofloxacin_PO"; "2 Clarithromycin_PO"; "2 Clofazimine PO"; "2 Colistin_IHN"; "2 prednisolone_PO"; "2 tobramycin_INH"; "3 Azithromycin_IV"; "3 Aztreonam_IV"; "3 Cefopime_IV"; "3 Ceftazidime_IV"; "3 Imipenem"; "3 Ivacaftor (Kalydeco)"; "3 Meropenem_IV"; "3 Methylpredinosolone_IV"; "3 Omalizumab_SC"; "3 piperacillin sodium, tazobactam sodium (Tazocin)_IV"; "3 tobramycin_IV"; "4 Amikacin_IV"; "4 Cefoxitin_IV"; "4 Colistin_IV"; "Age"; "Age groups"; "Antibiotics (duration)"; "Antibiotics_YN"; "Best FEV1"; "CF gene 1"; "CF gene 2"; "CFLD"; "CH4/H2 ratio_corrected"; "CH4_Corrected"; "CH4_Uncorrected"; "CO2"; "CS_Achromobacter xylosoxidans"; "CS_Acremonium species"; "CS_Aspergillus flavus"; "CS_Aspergillus fumigatus"; "CS_A

In [4]:
# Medication columns of the metadata table. The names are kept exactly as they
# appear in the metadata column listing (including spellings such as
# 'Colistin_IHN'), because they are used verbatim as column labels.
antibiotics = [
    '1 Cephalexin_PO',
    '1 Flucloaxcillin_PO',
    '1 Itraconazole (Lozenoc)_PO',
    '1 Sulfamethoxazole_trimethoprim (Bactrim)_PO',
    '2 Amikacin_INH',
    '2 Amoxicillin & Potassium clavulanate (Aug Duo)_PO',
    '2 Amphotericin B (Ambisome)_INH',
    '2 Azithromycin_PO',
    '2 Ceftazidime_INH',
    '2 Ciprofloxacin_PO',
    '2 Clarithromycin_PO',
    '2 Clofazimine PO',
    '2 Colistin_IHN',
    '2 prednisolone_PO',
    '2 tobramycin_INH',
    '3 Azithromycin_IV',
    '3 Aztreonam_IV',
    '3 Cefopime_IV',
    '3 Ceftazidime_IV',
    '3 Imipenem',
    '3 Ivacaftor (Kalydeco)',
    '3 Meropenem_IV',
    '3 Methylpredinosolone_IV',
    '3 Omalizumab_SC',
    '3 piperacillin sodium, tazobactam sodium (Tazocin)_IV',
    '3 tobramycin_IV',
    '4 Amikacin_IV',
    '4 Cefoxitin_IV',
    '4 Colistin_IV',
]

In [18]:
# Print the antibiotic column names, sorted, each wrapped in double quotes and
# separated by "; ".
t = '"; "'.join(sorted(antibiotics))
print('"' + t + '"')

"1 Cephalexin_PO"; "1 Flucloaxcillin_PO"; "1 Itraconazole (Lozenoc)_PO"; "1 Sulfamethoxazole_trimethoprim (Bactrim)_PO"; "2 Amikacin_INH"; "2 Amoxicillin & Potassium clavulanate (Aug Duo)_PO"; "2 Amphotericin B (Ambisome)_INH"; "2 Azithromycin_PO"; "2 Ceftazidime_INH"; "2 Ciprofloxacin_PO"; "2 Clarithromycin_PO"; "2 Clofazimine PO"; "2 Colistin_IHN"; "2 prednisolone_PO"; "2 tobramycin_INH"; "3 Azithromycin_IV"; "3 Aztreonam_IV"; "3 Cefopime_IV"; "3 Ceftazidime_IV"; "3 Imipenem"; "3 Ivacaftor (Kalydeco)"; "3 Meropenem_IV"; "3 Methylpredinosolone_IV"; "3 Omalizumab_SC"; "3 piperacillin sodium, tazobactam sodium (Tazocin)_IV"; "3 tobramycin_IV"; "4 Amikacin_IV"; "4 Cefoxitin_IV"; "4 Colistin_IV"


In [19]:
# Map each combined drug name to the metadata columns that are merged under it
# (the same drug name recorded under different route/prefix columns).
combinations = {
    drug: list(columns)
    for drug, columns in (
        ('Amikacin', ('2 Amikacin_INH', '4 Amikacin_IV')),
        ('Azithromycin', ('2 Azithromycin_PO', '3 Azithromycin_IV')),
        ('Tobramycin', ('2 tobramycin_INH', '3 tobramycin_IV')),
        ('Colistin', ('2 Colistin_IHN', '4 Colistin_IV')),
        ('Ceftazidime', ('2 Ceftazidime_INH', '3 Ceftazidime_IV')),
        ('Prednisolone', ('2 prednisolone_PO', '3 Methylpredinosolone_IV')),
    )
}

In [20]:
# Show the samples where either amikacin column is flagged with 1.
amikacin_inh_mask = metadata['2 Amikacin_INH'].eq(1)
amikacin_iv_mask = metadata['4 Amikacin_IV'].eq(1)
metadata[amikacin_inh_mask | amikacin_iv_mask]

Unnamed: 0_level_0,minion,MGI,pwCF_ID,Sample date,IP vs OP,Hospital,Room,Age,Age groups,Paediatric vs Adult,...,Sum of meds,Sum of antifungals,Sum of steroid + mabs,DNA_extraction_ conc,SAGC ULN,DNA Conc. (ng/ul),Index I7,Index I5,Mean_Size_BP,Total Clusters Passing Filter (Million)
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
770560_20190109_S,,770560_20190109_S,770560,1/09/2019,IP,WCH,Adolescent 4,12,3,Paediatric,...,4,0,0,0.071,SAGCFN_22_01842,12.0,CTTGCTATTC,CGTGACTCTA,395,4.2
770560_20181218_S,,770560_20181218_S,770560,12/18/2018,IP,WCH,Adolescent 4,12,3,Paediatric,...,3,0,0,0.161,SAGCFN_22_01821,14.9,TCTCGCGC,AGGCGAAG,630,3.2
670829_20171001_S,,670829_20171001_S,670829,10/01/2017,IP,WCH,Adolescent 6,15,3,Paediatric,...,1,0,0,0.28,SAGCFN_22_01803,41.8,CGGCTATG,CCTATCCT,375,2.2
670829_20171005_S,,670829_20171005_S,670829,10/05/2017,IP,WCH,Adolescent 6,15,3,Paediatric,...,1,0,0,0.102,SAGCFN_22_01836,5.16,ATGTTGCCAC,ATGGCTTCTA,668,4.0
650003_20180207_S,650003_20180207_S,650003_20180207_S,650003,2/07/2018,IP,WCH,Adol Room 1,17,3,Paediatric,...,5,0,0,0.552,SAGCFN_22_01778,8.28,GAATTCGT,ATAGAGGC,476,5.7
650003_20180213_S,,650003_20180213_S,650003,2/13/2018,IP,WCH,Level 6 Treatment,17,3,Paediatric,...,5,0,0,0.289,SAGCFN_22_01800,27.6,TAATGCGC,GTACTGAC,572,3.5
670829_20180201_S,,670829_20180201_S,670829,2/01/2018,IP,WCH,Day Surgery 17,16,3,Paediatric,...,2,1,0,0.081,SAGCFN_22_01840,8.58,AGGTAACAAC,AGCATCGTGC,469,3.1
698917_20180119_S,698917_20180119_S,698917_20180119_S,698917,1/19/2018,IP,WCH,Adol Room 1,14,3,Paediatric,...,3,0,0,1.76,SAGCFN_22_01747,23.2,TCCGGAGA,CCTATCCT,383,6.7
698917_20180128_S,698917_20180128_S,698917_20180128_S,698917,1/28/2018,IP,WCH,Adol Room 1,14,3,Paediatric,...,3,0,0,0.337,SAGCFN_22_01793,21.0,TAATGCGC,TATAGCCT,495,2.2
698917_20180329_S,,698917_20180329_S,698917,3/29/2018,OP,WCH,Day Surgery 19,14,3,Paediatric,...,6,0,0,0.0554,SAGCFN_22_01847,5.6,GAACCGCATT,AGGCATAATG,345,2.6


In [21]:
# Collapse each group of related columns into a single 0/1 column: 1 when any
# of the group's source columns is truthy for that sample.
tmpdf = pd.DataFrame({
    combined_name: metadata[source_columns].any(axis=1).astype(int)
    for combined_name, source_columns in combinations.items()
})
tmpdf

Unnamed: 0_level_0,Amikacin,Azithromycin,Tobramycin,Colistin,Ceftazidime,Prednisolone
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
634207_20180510_S,0,0,1,0,0,0
634207_20180517_S,0,0,1,0,0,0
715927_20180205_S,0,0,0,0,0,0
715927_20180213_S,0,0,1,0,0,0
715927_20180226_S,0,0,1,0,0,0
...,...,...,...,...,...,...
1651490_20180206_S,0,0,1,0,0,0
1651490_20171215_S,0,0,1,0,0,0
1658447_20171006_S,0,0,0,0,1,0
1664053_20180406_S,0,1,0,1,1,0


In [9]:
# Column totals: the number of samples flagged for each combined drug.
tmpdf.sum(axis=0)

Amikacin        10
Azithromycin     8
Tobramycin      47
Colistin         4
Ceftazidime     22
Prednisolone    10
dtype: int64

In [26]:
# Count the distinct pwCF_ID values (a missing value, if present, counts once).
metadata['pwCF_ID'].nunique(dropna=False)

64

### Generate some statistics about our samples

In [41]:
sequence_type='MGI'

# Report the shape of the (transposed) taxonomy table at each rank. 'genus' is
# the last rank read, so genus_otu holds the genus-level table after the loop.
for taxa in ['kingdom', 'order', 'phylum', 'class', 'family', 'genus']:
    genus_otu = cf_analysis_lib.read_taxonomy(datadir, sequence_type, taxa)
    genus_otu = genus_otu.T
    print(f"The taxonomy df for {taxa} has shape: {genus_otu.shape}")

metadata = cf_analysis_lib.read_metadata(datadir, sequence_type, categorise=True)
print(f"The metadata df has shape: {metadata.shape}")

# Load the subsystems table in this cell so the merge below does not depend on
# ss_df leaking in from a cell further down (which raises NameError on a fresh
# Restart & Run All).
# NOTE(review): the recorded output of this cell shows bare subsystem names as
# columns, which appears to match the 'subsystems' level rather than the fully
# qualified 'all' level -- confirm this is the intended level.
sslevel = 'subsystems_norm_ss.tsv.gz'
ss_df = cf_analysis_lib.read_subsystems(os.path.join(datadir, sequence_type, "FunctionalAnalysis", "subsystems", sslevel), sequence_type)
ss_df = ss_df.T

# Inner join on the index keeps only the rows present in both tables.
df = ss_df.merge(genus_otu, left_index=True, right_index=True, how='inner')
print(df.shape)
df.head(5)

The taxonomy df for kingdom has shape: (127, 1)
The taxonomy df for order has shape: (127, 302)
The taxonomy df for phylum has shape: (127, 164)
The taxonomy df for class has shape: (127, 140)
The taxonomy df for family has shape: (127, 743)
The taxonomy df for genus has shape: (127, 3581)
The metadata df has shape: (127, 166)
(127, 4350)


Unnamed: 0,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",2-O-alpha-mannosyl-D-glycerate utilization,2-aminophenol Metabolism,2-ketoacid oxidoreductases disambiguation,2-oxoglutarate dehydrogenase,2-phosphoglycolate salvage,3-amino-5-hydroxybenzoic Acid Synthesis,4-hydroxybenzoyl-CoA reductase,5-methylaminomethyl-2-thiouridine,A Hypothetical Protein Related to Proline Metabolism,...,Oceaniferula,Persicirhabdus,Phragmitibacter,Prosthecobacter,Roseibacillus,Roseimicrobium,Sulfuriroseicoccus,Verrucomicrobium,Eremiobacter,Methylomirabilis
1068841_20180306_S,10.085904,2784.895948,516.160945,231.778018,122.573392,342.327431,783.041862,0.0,4136.703664,827.439614,...,1.347606,0.0,0.0,0.0,0.539042,0.0,0.0,0.539042,0.0,0.0
1447437_20171212_S,59.260325,1065.438272,543.947408,554.136026,428.171446,301.915763,679.310468,0.0,3913.676651,732.956657,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128691_20171206_S,0.0,426.619709,912.76775,49.606943,213.640568,277.79888,423.31258,0.0,2093.412992,236.459761,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128691_20171218_S,0.0,659.087578,864.137047,139.140711,235.318676,355.174973,538.254855,0.0,2050.494687,341.749115,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128691_20180116_S,14.478968,159.268644,593.637673,48.263225,358.354449,202.705547,348.701804,0.0,1570.967988,205.118708,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Build a one-line summary of how many taxonomic groups (columns of the
# transposed table) exist at each rank, then print it in one go.
taxonomy_summary = []
for taxa in ('kingdom', 'order', 'phylum', 'class', 'family', 'genus'):
    genus_otu = cf_analysis_lib.read_taxonomy(datadir, sequence_type, taxa).T
    taxonomy_summary.append(f"we have {genus_otu.shape[1]} taxonomic group for {taxa}, and ")
print("".join(taxonomy_summary))

we have 1 taxonomic group for kingdom, and we have 302 taxonomic group for order, and we have 164 taxonomic group for phylum, and we have 140 taxonomic group for class, and we have 743 taxonomic group for family, and we have 3581 taxonomic group for genus, and 


In [43]:
# Build a one-line summary of how many features (columns of the transposed
# table) each subsystems file contains, then print it in one go.
subsystems_dir = os.path.join(datadir, sequence_type, "FunctionalAnalysis", "subsystems")
subsystems_summary = []
for sslevel in (
    'all_norm_ss.tsv.gz',
    'class_norm_ss.tsv.gz',
    'level1_norm_ss.tsv.gz',
    'level2_norm_ss.tsv.gz',
    'subsystems_norm_ss.tsv.gz',
):
    ss_df = cf_analysis_lib.read_subsystems(os.path.join(subsystems_dir, sslevel), sequence_type).T
    subsystems_summary.append(f"we have {ss_df.shape[1]} features for {sslevel.replace('_norm_ss.tsv.gz', '')} subsystems, and ")
print("".join(subsystems_summary))

we have 769 features for all subsystems, and we have 11 features for class subsystems, and we have 29 features for level1 subsystems, and we have 139 features for level2 subsystems, and we have 769 features for subsystems subsystems, and 


In [35]:
# Read the 'all' level subsystems table, transpose it, and report its shape.
sslevel = 'all_norm_ss.tsv.gz'
subsystems_path = os.path.join(datadir, sequence_type, "FunctionalAnalysis", "subsystems", sslevel)
ss_df = cf_analysis_lib.read_subsystems(subsystems_path, sequence_type).T
print(f"The subsystems df for {sslevel} has shape: {ss_df.shape}")

The subsystems df for all_norm_ss.tsv.gz has shape: (127, 769)


In [36]:
# Display the path built from the current datadir, sequence_type and sslevel.
os.path.join(
    os.path.join(datadir, sequence_type),
    "FunctionalAnalysis",
    "subsystems",
    sslevel,
)

'../MGI/FunctionalAnalysis/subsystems/all_norm_ss.tsv.gz'

In [37]:
# Display the transposed subsystems table currently held in ss_df (the notebook
# renders a truncated view of its rows and columns).
ss_df

Unnamed: 0,"Cell Envelope; Cell Envelope, Capsule and Slime layer; Capsule and Slime layer; Alginate biosynthesis","Cell Envelope; Cell Envelope, Capsule and Slime layer; Capsule and Slime layer; Colanic acid synthesis","Cell Envelope; Cell Envelope, Capsule and Slime layer; Capsule and Slime layer; Extracellular matrix proteins (PEL) involved in glucose-rich biofilm formation in Pseudomonas","Cell Envelope; Cell Envelope, Capsule and Slime layer; Capsule and Slime layer; Extracellular matrix proteins (PSL) involved in mannose-rich biofilm formation in Pseudomonas","Cell Envelope; Cell Envelope, Capsule and Slime layer; Capsule and Slime layer; Hyaluronic Acid-containing Cell Walls","Cell Envelope; Cell Envelope, Capsule and Slime layer; Capsule and Slime layer; Lipid-linked oligosaccharide synthesis related cluster","Cell Envelope; Cell Envelope, Capsule and Slime layer; Capsule and Slime layer; O-methyl phosphoramidate modification in capsular polysaccharide","Cell Envelope; Cell Envelope, Capsule and Slime layer; Capsule and Slime layer; Protein-O-mannosyltransferase and 16S rRNA (cytidine(1402)-2'-O)-methyltransferase cluster","Cell Envelope; Cell Envelope, Capsule and Slime layer; Capsule and Slime layer; Rcs two-component regulator of capsule synthesis","Cell Envelope; Cell Envelope, Capsule and Slime layer; Capsule and Slime layer; Streptococcal Hyaluronic Acid Capsule",...,"Stress Response, Defense, Virulence; Stress Response, Defense and Virulence; Stress Response; Rcn nickel and cobalt homeostasis system","Stress Response, Defense, Virulence; Stress Response, Defense and Virulence; Stress Response; Repair of Iron Centers","Stress Response, Defense, Virulence; Stress Response, Defense and Virulence; Stress Response; Stress proteins YciF, YciE","Stress Response, Defense, Virulence; Stress Response, Defense and Virulence; Stress Response; Stress response and cell wall lysis cluster","Stress Response, Defense, Virulence; Stress Response, Defense and 
Virulence; Stress Response; Sugar-phosphate stress regulation","Stress Response, Defense, Virulence; Stress Response, Defense and Virulence; Stress Response; Universal stress protein family","Stress Response, Defense, Virulence; Stress Response, Defense and Virulence; Toxins and superantigens; Cholera toxin","Stress Response, Defense, Virulence; Stress Response, Defense and Virulence; Toxins and superantigens; Diphtheria toxin","Stress Response, Defense, Virulence; Stress Response, Defense and Virulence; Toxins and superantigens; Pore-forming cytolytic toxins","Stress Response, Defense, Virulence; Stress Response, Defense and Virulence; Toxins and superantigens; Toxins"
1068841_20180306_S,1808.639816,131.710034,1.186577,0.000000,0.000000,177.393244,0.000000,1308.794304,0.0,689.104526,...,1585.860006,2125.752489,0.000000,623.249509,0.000000,495.395849,0.000000,137.049630,3.559731,0.000000
1447437_20171212_S,1762.214941,194.623385,89.826178,54.893775,1.247586,165.928911,2.495172,731.085278,0.0,491.548805,...,764.770095,1028.634491,0.000000,326.867479,0.000000,617.554970,22.456544,39.922746,2435.287480,1397.296095
1128691_20171206_S,4866.441102,213.309855,5317.864282,2847.438524,0.000000,29.764166,0.000000,163.702912,0.0,148.820829,...,1091.352744,1398.915791,783.789698,357.169989,0.000000,1785.849946,932.610527,34.724860,0.000000,29.764166
1128691_20171218_S,4020.434226,205.049469,4086.342984,2372.715281,0.000000,87.878344,0.000000,344.190180,0.0,161.110297,...,1816.152437,1684.334922,893.429828,263.635031,0.000000,1259.589593,395.452547,0.000000,190.403078,0.000000
1128691_20180116_S,4626.030157,180.987095,4879.412090,3764.531583,0.000000,0.000000,0.000000,152.029160,0.0,101.352773,...,1346.543989,1592.686439,897.695993,209.945031,0.000000,1722.997148,752.906317,36.197419,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895293_20180502_S,2342.455455,146.521629,135.178019,96.420685,0.000000,552.055684,0.000000,1360.287893,0.0,683.452500,...,1135.306296,1868.387089,0.000000,630.043003,0.000000,617.281441,49.155643,177.716556,0.000000,2.835902
896213_20180427_S,2084.983042,105.836703,105.836703,84.669362,0.000000,740.856918,0.000000,2074.399372,0.0,635.020216,...,1693.387242,1375.877134,0.000000,635.020216,0.000000,550.350854,0.000000,232.840746,0.000000,0.000000
913873_20180417_S,1596.018291,145.092572,0.000000,48.364191,0.000000,677.098669,0.000000,1523.472005,0.0,701.280764,...,2224.752770,2128.024388,0.000000,290.185144,96.728381,532.006097,0.000000,72.546286,0.000000,145.092572
980574_20180403_S,2994.036655,155.445409,50.585625,43.208554,0.000000,73.770703,0.000000,1541.807683,0.0,679.217397,...,1137.122686,2756.389606,11.592539,482.144234,0.000000,368.853513,6.323203,231.850779,0.000000,2.107734
