# Correcting the metadata

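Some sample IDs are mis-labeled in the metadata and in the sequence data. This notebook applies the known corrections and then checks that the metadata and the taxonomy tables refer to the same set of samples.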


In [16]:
import os
import sys

import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.colors import ListedColormap
import matplotlib.dates as mdates
import pandas as pd
import seaborn as sns
import json

from itertools import cycle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from adjustText import adjust_text

# sklearn's StandardScaler emits a FutureWarning that is really annoying. This ignores it.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [17]:
def read_taxonomy(tax_file, firstchar):
    """
    Read a gzipped, tab-separated taxonomy table and return it as a data frame.

    The file must have a ``taxonomy`` column holding ``;``-separated lineage
    strings; every other column is kept as-is.

    Rows are kept only when the lineage
      * contains the literal text ``k__Bacteria``, and
      * does not end with ``<firstchar>__`` (i.e. the last rank is not empty).
    Rows with a missing ``taxonomy`` value are dropped instead of raising.

    :param tax_file: path to the gzip-compressed TSV file
    :param firstchar: prefix letter of the rank being read (e.g. ``'g'`` for genus)
    :return: data frame indexed by the last element of each lineage, with the
             columns sorted by name
    """

    df = pd.read_csv(tax_file, sep='\t', compression='gzip')
    lineage = df['taxonomy']

    # regex=False: the kingdom tag is a literal, not a pattern.
    # na=False: a missing lineage counts as "no match" rather than producing a
    # NaN in the mask, which pandas refuses to index with.
    is_bacteria = lineage.str.contains('k__Bacteria', regex=False, na=False)
    unresolved = lineage.str.endswith(f'{firstchar}__', na=False)

    df = df[is_bacteria & ~unresolved]
    df = df.set_index('taxonomy')
    # keep only the most specific rank as the row label
    df.index = df.index.str.split(';').str[-1]
    return df.sort_index(axis=1)

def sorted_presence_absence(df1, df2, minrowsum=0, asc_sort=False):
    """
    Combine two tables into one presence/absence matrix, sorted by row total.

    Only the columns that occur in both tables are used. Each cell of the
    result encodes where a value > 0 was seen:
    0 = neither table, 1 = df1 only, 2 = df2 only, 3 = both.
    A row that exists in only one table is treated as absent (0) in the other.

    :param df1: first table (scored as 1 where > 0)
    :param df2: second table (scored as 2 where > 0)
    :param minrowsum: if > 0, keep in each table only the rows that are present
                      in strictly more than ``minrowsum`` of the shared columns.
                      The filter is applied to each table separately, so a row
                      removed from one table can still appear via the other.
    :param asc_sort: sort rows by ascending (True) or descending (False) total
    :return: the combined matrix with rows ordered by their row sum and
             columns ordered by name
    """
    # restrict both tables to the columns they have in common
    shared = df1.columns.intersection(df2.columns)

    # presence/absence, with df2 weighted by 2 so the sum identifies the source
    in_first = (df1[shared] > 0).astype(int)
    in_second = (df2[shared] > 0).astype(int) * 2

    if minrowsum > 0:
        # Select with a positional boolean mask. Going through the row labels
        # (df.loc[df[mask].index]) multiplies rows whenever a label is repeated.
        keep_first = (in_first.sum(axis=1) > minrowsum).to_numpy()
        keep_second = (in_second.sum(axis=1) > (2 * minrowsum)).to_numpy()
        in_first = in_first.loc[keep_first]
        in_second = in_second.loc[keep_second]

    both = in_first.add(in_second, fill_value=0)

    # Sort on a positional copy of the totals and reorder with iloc, again so
    # that repeated row labels are not expanded by a label-based lookup.
    totals = both.sum(axis=1).reset_index(drop=True)
    order = totals.sort_values(ascending=asc_sort).index.to_numpy()
    sboth = both.iloc[order]

    return sboth.sort_index(axis=1)  # sort by column names

In [18]:
# Known sample-ID corrections.
# Outer keys are the names of the metadata ID columns ("MGI_ID", "minion_ID");
# each inner dict maps a mis-labelled (old) sample ID to its corrected ID.
# The same mappings are used to rename the columns of the taxonomy tables and
# to replace values in the matching metadata column.
corrections = {
    "MGI_ID" : { 
        '1112926_20171212_S' : '1447437_20171212_S',
        '1128691_20170206_S' : '1128691_20171206_S',
        '1255498_20171212_S' : '1590009_20171212_S',
        '1316979_20171215_S' : '1651490_20171215_S',
        '1598281_20180508_S' : '1588281_20180508_S',
        '1723809_20180227_S' : '1085876_20180227_S',
        '649354_20170206_S' : '639354_20171206_S',
        '652927_20180226_S' : '715927_20180226_S',
        '658355_20180301_S' : '658355_20180327_S',
        '777851_20170918_S' : '778851_20170918_S',
        '788707_20181126_S' : '788707_20181129_S'
    },
    "minion_ID" : {
        '1112926_20171212_S' : '1447437_20171212_S',
        '1255498_20171212_S' : '1590009_20171212_S',
        '1316979_20171215_S' : '1651490_20171215_S',
        '1598281_20180508_S' : '1588281_20180508_S',
        '698917_20190119_S' : '698917_20180119_S'
        }
}

In [20]:
tax = 'genus'
# read the normalised, read-based taxonomy tables for both sequencing platforms
mgi_df = read_taxonomy(f"../MGI/Taxonomy/MGI_reads_{tax}.normalised.tsv.gz", tax[0])
min_df = read_taxonomy(f"../MinION/Taxonomy/Minion_read_based_annotations_{tax}.normalised.tsv.gz", tax[0])

# Report which of the known mis-labelled (old) sample names are still present
# as column names, before they are renamed below.
for id_column, frame in (('MGI_ID', mgi_df), ('minion_ID', min_df)):
    print("Checking corrections keys", file=sys.stderr)
    for old_name in corrections[id_column]:
        if old_name in frame.columns:
            print(f"Old name:{old_name} in {id_column}", file=sys.stderr)

# apply the corrections to the sample (column) names
mgi_df = mgi_df.rename(columns=corrections['MGI_ID'])
min_df = min_df.rename(columns=corrections['minion_ID'])

# Choose the platform to compare against the metadata. `sequence_type` names the
# metadata column that holds this platform's sample IDs, so it must always be
# switched together with the frame assigned to `df`.
#sequence_type, df = 'MGI_ID', mgi_df.T
sequence_type, df = 'minion_ID', min_df.T
df.head()


Checking corrections keys
Old name:1112926_20171212_S in MGI_ID
Old name:1255498_20171212_S in MGI_ID
Old name:1316979_20171215_S in MGI_ID
Old name:1598281_20180508_S in MGI_ID
Old name:1723809_20180227_S in MGI_ID
Old name:652927_20180226_S in MGI_ID
Old name:658355_20180301_S in MGI_ID
Old name:777851_20170918_S in MGI_ID
Old name:788707_20181126_S in MGI_ID
Checking corrections keys
Old name:1112926_20171212_S in minion_ID
Old name:1255498_20171212_S in minion_ID
Old name:1316979_20171215_S in minion_ID
Old name:1598281_20180508_S in minion_ID
Old name:698917_20190119_S in minion_ID


taxonomy,g__Candidatus Stahlbacteria,g__Candidatus Chromulinivorax,g__Bradymonas,g__Microvenator,g__Candidatus Acididesulfobacter,g__Candidatus Acidulodesulfobacterium,g__Acanthopleuribacter,g__Candidatus Sulfopaludibacter,g__Edaphobacter,g__Granulicella,...,g__Coraliomargarita,g__Pelagicoccus,g__Puniceicoccus,g__Chthoniobacter,g__Pedosphaera,g__Akkermansia,g__Brevifollis,g__Luteolibacter,g__Verrucomicrobium,g__Candidatus Eremiobacter
1068841_20180306_S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1447437_20171212_S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128691_20171218_S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128691_20180116_S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1590009_20171212_S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,18.676403,0.0,0.0,0.0,0.0


In [21]:
# list every sample ID (row label) in the selected taxonomy table
data_sample_ids = df.index.tolist()
print(", ".join(data_sample_ids))

1068841_20180306_S, 1447437_20171212_S, 1128691_20171218_S, 1128691_20180116_S, 1590009_20171212_S, 1282052_20180206_S, 1316935_20180417_S, 1651490_20171215_S, 1447437_20171006_S, 1470026_20180502_S, 1565754_20171128_S, 1565754_20180403_S, 1586713_20180309_S, 1593967_20180424_S, 1593973_20180427_S, 1593973_20180504_S, 1588281_20180508_S, 1651490_20171010_S, 1651490_20180206_S, 1834617_20180501_S, 1845116_20180403_S, 623361_20180123_S, 639354_20171206_S, 642660_20180601_S, 650003_20180207_S, 658355_20171204_S, 658355_20180122_S, 658355_20180321_S, 673895_20180122_S, 673895_20180205_S, 676138_20180130_S, 698917_20171207_S, 698917_20180128_S, 698917_20180119_S, 715927_20180205_S, 748160_20180321_S, 748160_20180329_S, 748699_20180329_S, 748699_20180410_S, 752797_20170927_S, 753522_20180606_S, 756934_20181218_S, 763742_20180129_S, 768745_20171123_S, 770590_20180115_S, 778851_20171204_S, 785991_20171129_S, 785991_20171206_S, 785991_20180321_S, 788707_20171213_S, 788707_20180301_S, 788707_201

In [22]:
# Read the sample metadata and keep only the rows that have an ID in the column
# named by `sequence_type`.
# This is generic so we can copy/paste. Yes, it should be a function, but it's not.
# Alternative input: "../Metadata/Metadata20241026.txt" read with encoding='utf-8'.
metadata = pd.read_csv(
    "../Metadata/Metadata.txt",
    encoding='windows-1252',
    sep="\t",
    index_col=0,
).dropna(subset=[sequence_type])


In [23]:
# list every row label of the filtered metadata table
metadata_row_labels = metadata.index.tolist()
print(", ".join(metadata_row_labels))

623361_20180123_S, 639354_20171206_S, 642660_20180601_S, 650003_20180207_S, 658355_20171204_S, 658355_20180122_S, 658355_20180321_S, 673895_20180205_S, 673895_20180122_S, 676138_20180130_S, 698917_20171207_S, 698917_20180119_S, 698917_20180128_S, 715927_20180205_S, 748160_20180321_S, 748160_20180329_S, 748699_20180410_S, 748699_20180329_S, 752797_20170927_S, 753522_20180606_S, 756934_20181218_S, 763742_20180129_S, 768745_20171123_S, 770590_20180115_S, 770590_20170925_S, 778851_20171204_S, 785991_20171206_S, 785991_20171129_S, 785991_20180321_S, 788707_20180301_S, 788707_20181116_S, 788707_20171213_S, 788707_20180313_S, 802971_20180605_S, 825012_20181120_S, 825012_20181126_S, 875028_20180115_S, 892355_20180123_S, 895293_20180502_S, 983493_20180123_S, 1068841_20180306_S, 1128691_20180116_S, 1128691_20171218_S, 1282052_20180206_S, 1316935_20180417_S, 1447437_20171006_S, 1447437_20171212_S, 1470026_20180502_S, 1565754_20180403_S, 1565754_20171128_S, 1586713_20180309_S, 1588281_20180508_S, 

In [25]:
# spot-check the metadata record of a single sample
s = '623361_20180123_S'
metadata.loc[s]

minion_ID       623361_20180123_S
MGI_ID          623361_20180123_S
pwCF_ID                    623361
Sample date             1/23/2018
IP vs OP                       OP
                      ...        
Unnamed: 167                  NaN
Unnamed: 168                  NaN
Unnamed: 169                  NaN
Unnamed: 170                  NaN
Unnamed: 171                  NaN
Name: 623361_20180123_S, Length: 171, dtype: object

In [26]:

# Reconcile the metadata with the taxonomy table `df` for the platform named by
# `sequence_type`: apply the ID corrections, then drop whatever is missing from
# either side so that both describe the same samples.
id_fixes = corrections[sequence_type]

# check the names before we replace them (use the corrections that belong to
# the selected platform, not a hard-coded one)
ids_before = set(metadata[sequence_type])
for old_id in id_fixes:
    if old_id in ids_before:
        print(f"{old_id} in {sequence_type} metadata", file=sys.stderr)

# replace mis-labelled IDs with their corrected value (exact matches only);
# assign() returns a new frame, so re-running this cell is safe
corrected_ids = metadata[sequence_type].map(lambda value: id_fixes.get(value, value))
metadata = metadata.assign(**{sequence_type: corrected_ids})

# Drop metadata rows whose sample has no sequence data. Rows are selected with
# a boolean mask because the metadata is indexed by NAME, which need not equal
# the ID stored in the `sequence_type` column.
has_data = metadata[sequence_type].isin(df.index)
for missing_id in metadata.loc[~has_data, sequence_type]:
    print(f"ERROR: {missing_id} not found in data frame, dropped from metadata", file=sys.stderr)
metadata = metadata[has_data]

# Drop samples that have no metadata. IDs are compared for equality; a
# substring/regex match would treat an ID embedded in a longer ID as present.
has_metadata = df.index.isin(metadata[sequence_type])
for missing_id in df.index[~has_metadata]:
    print(f"ERROR: {missing_id} not found in metadata, dropped from dataframe", file=sys.stderr)
df = df.loc[has_metadata]

if metadata.shape[0] != df.shape[0]:
    print(f"ERROR: we have {metadata.shape[0]} rows in metadata and {df.shape[0]} data rows", file=sys.stderr)
metadata.head()

ERROR: 770590_20170925_S not found in data frame, dropped from metadata


Unnamed: 0_level_0,minion_ID,MGI_ID,pwCF_ID,Sample date,IP vs OP,Hospital,Room,Age,Age groups,Paediatric vs Adult,...,DNA Conc. (ng/ul),Index I7,Index I5,Mean_Size_BP,Total Clusters Passing Filter (Million),Unnamed: 167,Unnamed: 168,Unnamed: 169,Unnamed: 170,Unnamed: 171
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
623361_20180123_S,623361_20180123_S,623361_20180123_S,623361,1/23/2018,OP,RAH,Chest Clinic 9,18,4,Adult,...,33.6,GAGATTCC,GGCTCTGA,468,2.4,,,,,
639354_20171206_S,639354_20171206_S,639354_20171206_S,639354,12/06/2017,IP,WCH,Adolescent 10,17,3,Paediatric,...,20.8,ATTACTCG,TAATCTTA,337,4.9,,,,,
642660_20180601_S,642660_20180601_S,642660_20180601_S,642660,6/01/2018,IP,WCH,Adol Rm 11,17,3,Paediatric,...,21.8,GAATTCGT,GGCTCTGA,491,4.8,,,,,
650003_20180207_S,650003_20180207_S,650003_20180207_S,650003,2/07/2018,IP,WCH,Adol Room 1,17,3,Paediatric,...,8.28,GAATTCGT,ATAGAGGC,476,5.7,,,,,
658355_20171204_S,658355_20171204_S,658355_20171204_S,658355,12/04/2017,OP,WCH,Gilbert S Meeting,16,3,Paediatric,...,16.6,CTGAAGCT,AGGCGAAG,507,4.5,,,,,
