# Correcting the metadata

A few sample IDs are mislabeled in the metadata. This notebook checks those IDs against the MGI and MinION taxonomy tables and applies the known corrections.


In [16]:
import os
import sys

import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.colors import ListedColormap
import matplotlib.dates as mdates
import pandas as pd
import seaborn as sns
import json

from itertools import cycle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from adjustText import adjust_text

# sklearn's StandardScaler raises a noisy FutureWarning; this silences it.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [17]:
def read_taxonomy(tax_file, firstchar):
    """
    Load a gzipped, tab-separated taxonomy table and reduce it to bacterial taxa.

    Parameters
    ----------
    tax_file : path to a gzip-compressed TSV file that has a 'taxonomy' column
        holding ';'-separated lineages.
    firstchar : single-letter prefix of the rank of interest (e.g. 'g').
        Lineages that end with '<firstchar>__' are removed.

    Returns
    -------
    A data frame restricted to lineages containing 'k__Bacteria', indexed by
    the last ';'-separated field of each lineage, with the columns sorted by
    name.
    """
    unresolved_suffix = f'{firstchar}__'

    table = (
        pd.read_csv(tax_file, sep='\t', compression='gzip')
        # bacteria only
        .loc[lambda t: t['taxonomy'].str.contains('k__Bacteria')]
        # discard lineages whose final rank is just the bare prefix
        .loc[lambda t: ~t['taxonomy'].str.endswith(unresolved_suffix)]
        .set_index('taxonomy')
    )

    # keep only the last field of the lineage as the row label
    table.index = table.index.str.split(';').str[-1]
    return table.sort_index(axis=1)

def sorted_presence_absence(df1, df2, minrowsum=0, asc_sort=False):
    """
    Combine two abundance tables into one sorted presence/absence matrix.

    Only the columns present in both tables are used. Presence (value > 0) is
    coded as 1 for `df1` and 2 for `df2`, so each cell of the result is
    0 (absent in both), 1 (only in df1), 2 (only in df2) or 3 (in both).

    Parameters
    ----------
    df1, df2 : data frames with taxa as rows and samples as columns.
    minrowsum : if > 0, each table is filtered independently to the rows that
        are present in strictly more than `minrowsum` of the shared columns.
        A row that passes in only one table still appears in the result, with
        the other table contributing 0.
    asc_sort : sort rows by their row sum in ascending order (default is
        descending).

    Returns
    -------
    The combined matrix with rows ordered by row sum and columns ordered by
    name.
    """
    # only consider the samples (columns) that both tables have
    shared = df1.columns.intersection(df2.columns)

    # presence/absence, coded 1 for df1 and 2 for df2
    first = df1[shared].gt(0).astype(int)
    second = df2[shared].gt(0).astype(int) * 2

    # optionally require each taxon to be present in more than minrowsum columns
    if minrowsum > 0:
        # Filter with the boolean mask directly. Re-selecting by label
        # (df.loc[df[mask].index]) multiplies rows when labels repeat.
        first = first[first.sum(axis=1) > minrowsum]
        second = second[second.sum(axis=1) > (2 * minrowsum)]

    # combine the two matrices; a row missing from one table counts as absent
    both = first.add(second, fill_value=0)

    # Order rows by their total. The order is computed on positions rather than
    # labels so that repeated row labels are not duplicated by the reordering.
    totals = both.sum(axis=1).reset_index(drop=True)
    order = totals.sort_values(ascending=asc_sort).index.to_numpy()
    sboth = both.iloc[order]

    return sboth.sort_index(axis=1)  # sort by column names

In [18]:
# Known sample-ID corrections, grouped by the metadata column that holds the
# IDs for each sequencing platform ("MGI_ID" and "minion_ID").
# Each inner dict maps a mislabeled ("old") ID to its corrected replacement.
# These maps are used further down to rename the data-frame columns and to fix
# the values in the corresponding metadata column.
corrections = {
    "MGI_ID" : { 
        '1112926_20171212_S' : '1447437_20171212_S',
        '1128691_20170206_S' : '1128691_20171206_S',
        '1255498_20171212_S' : '1590009_20171212_S',
        '1316979_20171215_S' : '1651490_20171215_S',
        '1598281_20180508_S' : '1588281_20180508_S',
        '1723809_20180227_S' : '1085876_20180227_S',
        '649354_20170206_S' : '639354_20171206_S',
        '652927_20180226_S' : '715927_20180226_S',
        '658355_20180301_S' : '658355_20180327_S',
        '777851_20170918_S' : '778851_20170918_S',
        '788707_20181126_S' : '788707_20181129_S'
    },
    "minion_ID" : {
        '1112926_20171212_S' : '1447437_20171212_S',
        '1255498_20171212_S' : '1590009_20171212_S',
        '1316979_20171215_S' : '1651490_20171215_S',
        '1598281_20180508_S' : '1588281_20180508_S',
        '698917_20190119_S' : '698917_20180119_S'
        }
}

In [20]:
tax = 'genus'

# Read the normalised read-based taxonomy tables (taxa as rows, samples as columns).
mgi_df = read_taxonomy(f"../MGI/Taxonomy/MGI_reads_{tax}.normalised.tsv.gz", tax[0])
min_df = read_taxonomy(f"../MinION/Taxonomy/Minion_read_based_annotations_{tax}.normalised.tsv.gz", tax[0])

# Before renaming, report every mislabeled ("old") sample name that is still
# present as a column, one platform at a time.
uncorrected_tables = {'MGI_ID': mgi_df, 'minion_ID': min_df}
for id_column, table in uncorrected_tables.items():
    print("Checking corrections keys", file=sys.stderr)
    for s in corrections[id_column]:
        if s in table.columns:
            print(f"Old name:{s} in {id_column}", file=sys.stderr)

# Apply the corrections to the sample (column) names of both tables.
mgi_df = mgi_df.rename(columns=corrections['MGI_ID'])
min_df = min_df.rename(columns=corrections['minion_ID'])

# Choose the platform that the rest of the notebook reconciles against the
# metadata. `sequence_type` and `df` are derived from this single switch so the
# metadata column and the data frame can never refer to different platforms.
sequence_type = 'minion_ID'  # set to 'MGI_ID' to check the MGI data instead
corrected_tables = {'MGI_ID': mgi_df, 'minion_ID': min_df}

# samples as rows, taxa as columns
df = corrected_tables[sequence_type].T
df.head()


In [21]:
# Show every row label (sample ID) of the selected data frame on a single line.
print(", ".join(df.index))

In [22]:
# Load the metadata table.
# This cell is deliberately generic so it can be copied between notebooks
# (it should really be a function, but it is not).

# metadata = pd.read_csv("../Metadata/Metadata20241026.txt", encoding='utf-8', sep="\t", index_col=0)
metadata = pd.read_csv(
    "../Metadata/Metadata.txt",
    sep="\t",
    index_col=0,
    encoding='windows-1252',
)

# keep only the records that have an ID for the selected sequencing platform
metadata = metadata[metadata[sequence_type].notna()]


In [23]:
# Show every metadata index label on a single line.
print(", ".join(metadata.index))

In [25]:
# Look at the complete metadata record of one sample.
s = '623361_20180123_S'
metadata.loc[s]

In [26]:

# Reconcile the metadata with the data frame for the selected `sequence_type`:
# apply the known ID corrections, then drop every metadata record without data
# and every data row without a metadata record.

fixes = corrections[sequence_type]

# Check the names before we replace them: report each mislabeled ID that is
# still present in the metadata column of the selected platform.
ids_before = set(metadata[sequence_type])
for s in fixes:
    if s in ids_before:
        print(f"{s} in {sequence_type} metadata", file=sys.stderr)

# Apply the corrections. A new frame is built instead of assigning cell by cell,
# because `metadata` is itself a filtered selection of the table that was read.
metadata = metadata.assign(
    **{sequence_type: metadata[sequence_type].map(lambda v: fixes.get(v, v))}
)

# Drop the metadata records that have no row in the data frame. The rows are
# selected with a boolean mask because the metadata is indexed by the first
# column of the file, which need not equal the value in `sequence_type`;
# dropping by the ID value would address the wrong label.
has_data = metadata[sequence_type].isin(df.index)
for s in metadata.loc[~has_data, sequence_type]:
    print(f"ERROR: {s} not found in data frame, dropped from metadata", file=sys.stderr)
metadata = metadata.loc[has_data].copy()

# Drop the data rows that have no metadata record. IDs are compared exactly:
# a substring match would let e.g. '649354_...' match '1649354_...'.
has_metadata = df.index.isin(metadata[sequence_type])
for s in df.index[~has_metadata]:
    print(f"ERROR: {s} not found in metadata, dropped from dataframe", file=sys.stderr)
df = df.loc[has_metadata].copy()

# The two tables should now describe exactly the same set of samples.
if metadata.shape[0] != df.shape[0]:
    print(f"ERROR: we have {metadata.shape[0]} rows in metadata and {df.shape[0]} data rows", file=sys.stderr)
metadata.head()