# Metadata Explorer

A notebook to explore the metadata so we know where to look for things!

### Import some libraries

In [1]:
import os
import sys
from socket import gethostname

# Name of the machine running this kernel; checked further down in this cell
# (hostname.startswith('hpc-node')) to decide whether sys.path needs extending.
hostname = gethostname()

import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
import matplotlib.colors as mcolors
import matplotlib.dates as mdates
from matplotlib.colors import ListedColormap
import pandas as pd
import seaborn as sns
import json

from itertools import cycle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error

from scipy.stats import linregress


# sklearn's StandardScaler raises a FutureWarning which is really annoying, so we ignore it.
# Note: this filter silences every FutureWarning, not only the one coming from sklearn.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Work out where we are running. If google.colab can be imported we set IN_COLAB,
# install adjustText, mount Google Drive and read the data from there; if the import
# raises ImportError we fall back to the parent directory as the data directory.
try:
  import google.colab
  IN_COLAB = True
  # adjustText is imported just after this try/except, so install it first
  !pip install adjustText
  from google.colab import drive
  drive.mount('/content/drive')
  datadir = '/content/drive/MyDrive/Projects/CF/Adelaide/CF_Data_Analysis'
except ImportError:
  IN_COLAB = False
  datadir = '..'

# Deliberately placed after the try/except so that the pip install above has
# already run by the time this import executes in the Colab branch.
from adjustText import adjust_text

# Hosts whose name starts with 'hpc-node' are flagged as IN_DEEPTHOUGHT and get the
# parent directory appended to the import path.
# NOTE(review): presumably '..' is where cf_analysis_lib lives on the cluster - confirm.
# On every other host sys.path is left untouched, so cf_analysis_lib must already be
# importable there.
if hostname.startswith('hpc-node'):
    IN_DEEPTHOUGHT = True
    sys.path.append('..')
else:
    IN_DEEPTHOUGHT = False
import cf_analysis_lib

### Read the data

In [2]:
# Load the metadata table for the chosen sequence type from datadir.
sequence_type = "MGI_minion"
metadata = cf_analysis_lib.read_metadata(
    datadir,
    sequence_type,
)

# Preview the first five rows (five is the default for head()).
metadata.head()

Unnamed: 0_level_0,minion,MGI,pwCF_ID,Sample date,IP vs OP,Hospital,Room,Age,Age groups,Paediatric vs Adult,...,Sum of meds,Sum of antifungals,Sum of steroid + mabs,DNA_extraction_ conc,SAGC ULN,DNA Conc. (ng/ul),Index I7,Index I5,Mean_Size_BP,Total Clusters Passing Filter (Million)
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
634207_20180510_S,,634207_20180510_S,634207,5/10/2018,IP,WCH,Adol Rm9,17,3,Paediatric,...,1,0,0,0.0,SAGCFN_22_01856,7.82,CGGACGATTC,CCACCACCTA,651,2.9
634207_20180517_S,,634207_20180517_S,634207,5/17/2018,IP,WCH,Adol Rm9,17,3,Paediatric,...,1,0,0,0.134,SAGCFN_22_01827,22.8,AGCGATAG,CCTATCCT,633,2.4
715927_20180205_S,715927_20180205_S,715927_20180205_S,715927,2/05/2018,OP,WCH,Level 6 DK Office,13,3,Paediatric,...,1,0,0,0.326,SAGCFN_22_01797,16.5,TAATGCGC,AGGCGAAG,516,3.4
715927_20180213_S,,715927_20180213_S,715927,2/13/2018,IP,WCH,Adol Room 11,13,3,Paediatric,...,3,0,0,0.234,SAGCFN_22_01811,31.0,TCCGCGAA,CCTATCCT,443,2.7
715927_20180226_S,,715927_20180226_S,715927,2/26/2018,OP,WCH,OPD 8,13,3,Paediatric,...,2,0,0,0.108,SAGCFN_22_01833,15.1,TAACTTGGTC,GATTCACGAC,510,2.6


In [3]:
# Alphabetical list of every metadata column name; sorted() accepts the
# column Index directly and always returns a plain list.
sorted(metadata.columns)

['1 Cephalexin_PO',
 '1 Flucloaxcillin_PO',
 '1 Itraconazole (Lozenoc)_PO',
 '1 Sulfamethoxazole_trimethoprim (Bactrim)_PO',
 '2 Amikacin_INH',
 '2 Amoxicillin & Potassium clavulanate (Aug Duo)_PO',
 '2 Amphotericin B (Ambisome)_INH',
 '2 Azithromycin_PO',
 '2 Ceftazidime_INH',
 '2 Ciprofloxacin_PO',
 '2 Clarithromycin_PO',
 '2 Clofazimine PO',
 '2 Colistin_IHN',
 '2 prednisolone_PO',
 '2 tobramycin_INH',
 '3 Azithromycin_IV',
 '3 Aztreonam_IV',
 '3 Cefopime_IV',
 '3 Ceftazidime_IV',
 '3 Imipenem',
 '3 Ivacaftor (Kalydeco)',
 '3 Meropenem_IV',
 '3 Methylpredinosolone_IV',
 '3 Omalizumab_SC',
 '3 piperacillin sodium, tazobactam sodium (Tazocin)_IV',
 '3 tobramycin_IV',
 '4 Amikacin_IV',
 '4 Cefoxitin_IV',
 '4 Colistin_IV',
 'Age',
 'Age groups',
 'Antibiotics (duration)',
 'Antibiotics_YN',
 'Best FEV1',
 'CF gene 1',
 'CF gene 2',
 'CFLD',
 'CH4/H2 ratio_corrected',
 'CH4_Corrected',
 'CH4_Uncorrected',
 'CO2',
 'CS_Achromobacter xylosoxidans',
 'CS_Acremonium species',
 'CS_Aspergillu

In [4]:
# Medication columns of the metadata table, copied verbatim from the sorted
# column listing (spellings are kept exactly as they appear in the data).
# Entries are grouped by the leading number in the column name.
antibiotics = [
    # prefix 1
    '1 Cephalexin_PO',
    '1 Flucloaxcillin_PO',
    '1 Itraconazole (Lozenoc)_PO',
    '1 Sulfamethoxazole_trimethoprim (Bactrim)_PO',
    # prefix 2
    '2 Amikacin_INH',
    '2 Amoxicillin & Potassium clavulanate (Aug Duo)_PO',
    '2 Amphotericin B (Ambisome)_INH',
    '2 Azithromycin_PO',
    '2 Ceftazidime_INH',
    '2 Ciprofloxacin_PO',
    '2 Clarithromycin_PO',
    '2 Clofazimine PO',
    '2 Colistin_IHN',
    '2 prednisolone_PO',
    '2 tobramycin_INH',
    # prefix 3
    '3 Azithromycin_IV',
    '3 Aztreonam_IV',
    '3 Cefopime_IV',
    '3 Ceftazidime_IV',
    '3 Imipenem',
    '3 Ivacaftor (Kalydeco)',
    '3 Meropenem_IV',
    '3 Methylpredinosolone_IV',
    '3 Omalizumab_SC',
    '3 piperacillin sodium, tazobactam sodium (Tazocin)_IV',
    '3 tobramycin_IV',
    # prefix 4
    '4 Amikacin_IV',
    '4 Cefoxitin_IV',
    '4 Colistin_IV',
]

In [5]:
# Echo the hand-curated medication column list so it can be checked by eye.
antibiotics

['1 Cephalexin_PO',
 '1 Flucloaxcillin_PO',
 '1 Itraconazole (Lozenoc)_PO',
 '1 Sulfamethoxazole_trimethoprim (Bactrim)_PO',
 '2 Amikacin_INH',
 '2 Amoxicillin & Potassium clavulanate (Aug Duo)_PO',
 '2 Amphotericin B (Ambisome)_INH',
 '2 Azithromycin_PO',
 '2 Ceftazidime_INH',
 '2 Ciprofloxacin_PO',
 '2 Clarithromycin_PO',
 '2 Clofazimine PO',
 '2 Colistin_IHN',
 '2 prednisolone_PO',
 '2 tobramycin_INH',
 '3 Azithromycin_IV',
 '3 Aztreonam_IV',
 '3 Cefopime_IV',
 '3 Ceftazidime_IV',
 '3 Imipenem',
 '3 Ivacaftor (Kalydeco)',
 '3 Meropenem_IV',
 '3 Methylpredinosolone_IV',
 '3 Omalizumab_SC',
 '3 piperacillin sodium, tazobactam sodium (Tazocin)_IV',
 '3 tobramycin_IV',
 '4 Amikacin_IV',
 '4 Cefoxitin_IV',
 '4 Colistin_IV']

In [7]:
# Drug name -> the metadata columns that record that drug, so that the separate
# columns can later be collapsed into a single per-drug flag.
# Column names are copied verbatim from the metadata (e.g. 'Colistin_IHN',
# 'Methylpredinosolone_IV'), so do not "correct" their spelling here.
combinations = dict(
    Amikacin=['2 Amikacin_INH', '4 Amikacin_IV'],
    Azithromycin=['2 Azithromycin_PO', '3 Azithromycin_IV'],
    Tobramycin=['2 tobramycin_INH', '3 tobramycin_IV'],
    Colistin=['2 Colistin_IHN', '4 Colistin_IV'],
    Ceftazidime=['2 Ceftazidime_INH', '3 Ceftazidime_IV'],
    Prednisolone=['2 prednisolone_PO', '3 Methylpredinosolone_IV'],
)

In [12]:
# Samples where at least one of the two Amikacin columns equals 1.
metadata.loc[
    metadata[['2 Amikacin_INH', '4 Amikacin_IV']].eq(1).any(axis=1)
]

In [13]:
# One 0/1 column per combined drug: 1 when any of that drug's source columns
# is truthy for the sample, 0 otherwise. Rows follow the metadata index.
tmpdf = pd.DataFrame({
    drug: metadata[source_cols].any(axis=1).astype(int)
    for drug, source_cols in combinations.items()
})
tmpdf

In [14]:
# Column totals of the 0/1 flags, i.e. the number of samples flagged for each combined drug.
tmpdf.sum()

In [7]:
# dtype of every metadata column, in column order, as a plain list.
metadata.dtypes.tolist()

[dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('O'),
 dtype('float64'),
 dtype('float64'),
 dtype('O'),
 dtype('O'),
 dtype('float64'),
 dtype('float64'),
 dtype('O'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtyp

In [8]:
# Re-read the same metadata with categorise=True and list the resulting dtypes,
# for comparison with the dtypes of the uncategorised table.
mdc = cf_analysis_lib.read_metadata(
    datadir,
    sequence_type,
    categorise=True,
)
mdc.dtypes.tolist()

[dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('O'),
 CategoricalDtype(categories=['IP', 'OP'], ordered=False),
 CategoricalDtype(categories=['RAH', 'WCH'], ordered=False),
 dtype('O'),
 dtype('int64'),
 CategoricalDtype(categories=[1, 2, 3, 4, 5, 6, 7], ordered=False),
 CategoricalDtype(categories=['Adult', 'Paediatric'], ordered=False),
 CategoricalDtype(categories=['F', 'M'], ordered=False),
 CategoricalDtype(categories=['BAL', 'S'], ordered=False),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('O'),
 CategoricalDtype(categories=[0.0, 1.0], ordered=False),
 dtype('float64'),
 dtype('O'),
 dtype('O'),
 dtype('float64'),
 dtype('float64'),
 dtype('O'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('float64'),
 CategoricalDtype(categories=[1.0, 2.0, 3.0], ordered=False),
 CategoricalDtype(categories=[1.0, 2.0], ordered=False),
 CategoricalDtype(categories=

In [15]:
# Integer category codes for one categorical column, as a plain list.
# pandas uses the code -1 for missing values.
mdc['N12M_Pseudomonas aeruginosa'].cat.codes.tolist()

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]