## Load in modules

In [1]:
import math
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime, timedelta
import json
import matplotlib.pyplot as plt
import geopandas as gpd
import shapely
import infomap
import networkx as nx
from ete3 import Tree


## Read in extracted TLs from MCC tree for all subtrees

In [3]:
## Load the table of TLs extracted from the MCC tree; the first two columns are dropped
mcc_tls = pd.read_csv('./data/mcc_tls/mcc_tls.tsv', sep='\t').iloc[:, 2:]

## Keep rows with at least one taxon and source 'nonENG'.
## .copy() makes the filtered frame independent, so the column assignments below
## do not write into a slice of the original (avoids SettingWithCopyWarning).
mcc_tls = mcc_tls.loc[(mcc_tls['ntaxa'] > 0) & (mcc_tls['source'] == 'nonENG')].copy()

## taxa is stored as a single '; '-delimited string -> turn it into a list of taxon names
mcc_tls['taxa'] = mcc_tls['taxa'].apply(lambda names: names.split('; '))

## COG id = second '/'-delimited field of the part of the taxon name before the first '|'
mcc_tls['cog_ids'] = mcc_tls['taxa'].apply(
    lambda names: [name.split('|')[0].split('/')[1] for name in names])

## Vectorised column arithmetic instead of row-wise apply(axis=1): same values,
## faster, and well-defined on an empty frame.
## NOTE(review): presumably last_seen / tmrca / ptmrca are numeric (decimal) dates -- confirm
mcc_tls['duration'] = mcc_tls['last_seen'] - mcc_tls['tmrca']
mcc_tls['import_est'] = (mcc_tls['tmrca'] + mcc_tls['ptmrca']) / 2

## Largest TLs first
mcc_tls = mcc_tls.sort_values('ntaxa', ascending=False)


## Read in COG-UK geography metadata

In [5]:
## Geography metadata of the genomes present in the MCC tree
mcc_geo_dat = pd.read_csv('./data/mcc.geography.csv')
## COG id -> outer postcode (get_ltla_distr later resolves the postcode to an LTLA)
mcc_cog_pc = {
    cog_id: outer_pc
    for cog_id, outer_pc in zip(mcc_geo_dat['cog_id'].values,
                                mcc_geo_dat['outer_postcode'].values)
}


### LTLAs to ignore

In [8]:
## LTLAs left out of the analysis: Isles of Scilly, City of London,
## South Tyneside and Isle of Wight
## (names listed in the order of the codes below -- verify against the ONS code list)
ignore_ltlas = [
    'E06000053',
    'E09000001',
    'E08000023',
    'E06000046',
]


In [None]:
## function to count the number of genomes present in each LTLA
def get_ltla_distr(cog_ids, cog_pc_map, keep_omitted=False, pc_ltla_map=None, omit_ltlas=None):
    """Count how many of the given genomes fall in each LTLA.

    Parameters
    ----------
    cog_ids : iterable
        Genome identifiers; every one must be a key of ``cog_pc_map``
        (a missing id raises ``KeyError``).
    cog_pc_map : mapping
        Genome identifier -> postcode.
    keep_omitted : bool, default False
        If True, count every LTLA. If False, LTLAs listed in ``omit_ltlas``
        are left out of the result.
    pc_ltla_map : mapping, optional
        Postcode -> LTLA code. Defaults to the notebook-level ``pc_ltla``.
    omit_ltlas : iterable, optional
        LTLA codes to leave out when ``keep_omitted`` is False.
        Defaults to the notebook-level ``ignore_ltlas``.

    Returns
    -------
    dict
        LTLA code -> number of genomes. Genomes whose postcode has no entry
        in ``pc_ltla_map`` are counted under the key ``'NA'``.
    """
    ## fall back to the notebook-level lookups so existing calls keep working
    if pc_ltla_map is None:
        pc_ltla_map = pc_ltla
    if keep_omitted:
        skip = frozenset()  ## nothing is filtered, so the omit list is never consulted
    else:
        skip = frozenset(ignore_ltlas if omit_ltlas is None else omit_ltlas)

    ltla_distr = {}
    for cog_id in cog_ids:
        pc = cog_pc_map[cog_id]
        ltla = pc_ltla_map[pc] if pc in pc_ltla_map else 'NA'
        if ltla in skip:
            continue
        ltla_distr[ltla] = ltla_distr.get(ltla, 0) + 1

    return ltla_distr
      
## LTLA distribution for every row of mcc_tls, without dropping the ignored LTLAs
mcc_tls['ltla_distr_keep_omitted'] = mcc_tls['cog_ids'].map(
    lambda ids: get_ltla_distr(ids, mcc_cog_pc, keep_omitted=True)
)


## Read in LTLA neighbour list

In [16]:
ltla_nbs_tbl = pd.read_csv('./data/ltla_neighbours_v4_ACTIVE.tsv', sep='\t')
## LTLA code -> list of neighbouring LTLA codes (stored comma-separated in the file)
ltla_nbs = {
    code: nb_str.split(',')
    for code, nb_str in zip(ltla_nbs_tbl.ltla_code.values, ltla_nbs_tbl.neighbours.values)
}


## Aggregate neighbouring LTLAs starting from LTLAs with fewest sequence samples
1. look for the LTLA with the fewest sequence samples
2. look for neighbouring LTLAs with the most sequence samples and merge
3. update new merged LTLAs
4. go back to step 1 and iterate

Operate on dictionaries, keep track of mergers

In [22]:
## function to perform LTLAs aggregation
## nlim is the maximum number of geographical units to keep
## (default 250; the call in this notebook passes 253)
def aggr_ltlas(ltla_distr, nlim=250, nbs_map=None, omit_ltlas=None):
    """Greedily merge sparsely sampled LTLAs into neighbours until at most ``nlim`` remain.

    On every pass the non-omitted LTLA with the fewest sequences that has at
    least one neighbour holding sequences is merged into its best-sampled
    neighbour. The input dictionary is never modified.

    Parameters
    ----------
    ltla_distr : dict
        LTLA code -> number of sequences.
    nlim : int, default 250
        Stop once the number of remaining units is <= ``nlim``.
    nbs_map : mapping, optional
        LTLA code -> list of neighbouring LTLA codes.
        Defaults to the notebook-level ``ltla_nbs``.
    omit_ltlas : iterable, optional
        LTLA codes that are never merged away (they still count towards the
        number of units). Defaults to the notebook-level ``ignore_ltlas``.

    Returns
    -------
    dict with keys
        'results'    : summary string ending in '(success)' or '(failure)';
                       failure means no further merger was possible before
                       reaching ``nlim``.
        'seqs_moved' : total number of sequences reassigned to another unit.
        'mergers'    : surviving LTLA -> list of LTLAs merged into it
                       (nested mergers are flattened).
        'ltla_distr' : sequence counts per surviving unit.
    """
    ## fall back to the notebook-level lookups so existing calls keep working
    if nbs_map is None:
        nbs_map = ltla_nbs
    omit = frozenset(ignore_ltlas if omit_ltlas is None else omit_ltlas)

    mergers = {}
    ltla_distr_cpy = dict(ltla_distr)
    seqs_moved = 0
    status = 'success'

    while len(ltla_distr_cpy) > nlim:
        ## candidates to merge away, fewest sequences first (stable sort keeps
        ## dictionary order among ties), ignoring omitted LTLAs
        candidates = sorted(
            ((code, n_seqs) for code, n_seqs in ltla_distr_cpy.items() if code not in omit),
            key=lambda item: item[1])

        for code, n_seqs in candidates:
            ## neighbours that still exist and hold at least one sequence.
            ## Codes with no neighbour entry (e.g. the 'NA' bucket) simply have
            ## no merge target instead of raising KeyError; a unit is never
            ## merged into itself.
            populated = [(nb, ltla_distr_cpy[nb]) for nb in nbs_map.get(code, [])
                         if nb != code and ltla_distr_cpy.get(nb, -1) > 0]
            if not populated:
                continue

            ## best-sampled neighbour; stable sort + last element means that
            ## among ties the neighbour listed last wins
            target = sorted(populated, key=lambda item: item[1])[-1][0]

            ltla_distr_cpy[target] += n_seqs
            del ltla_distr_cpy[code]

            ## record the merger; anything previously merged into `code`
            ## follows it into `target`
            absorbed = mergers.setdefault(target, [])
            absorbed.append(code)
            absorbed.extend(mergers.pop(code, []))

            ## original count only: sequences absorbed earlier were already
            ## counted when they were first moved
            seqs_moved += ltla_distr[code]
            break
        else:
            ## no candidate has a usable neighbour -> cannot shrink any further
            status = 'failure'
            break

    return {'results': 'initial LTLA(s): %d, final LTLA(s): %d (%s)' %
            (len(ltla_distr), len(ltla_distr_cpy), status),
            'seqs_moved': seqs_moved,
            'mergers': mergers,
            'ltla_distr': ltla_distr_cpy}
    
## Aggregate the LTLA distribution of the first row of mcc_tls
## (mcc_tls is sorted by ntaxa, descending, when it is loaded).
## NOTE(review): this cell originally read `mcc_tl`, which no visible cell defines
## (NameError on Restart & Run All); `mcc_tls` is assumed to be the intended frame -- confirm.
mergers = aggr_ltlas(mcc_tls['ltla_distr_keep_omitted'].iloc[0], nlim=253)
mergers


{'results': 'initial LTLA(s): 313, final LTLA(s): 253 (success)',
 'seqs_moved': 566,
 'mergers': {'E07000141': ['E06000017'],
  'E06000054': ['E07000187', 'E07000079'],
  'E07000130': ['E07000133', 'E07000132', 'E07000134'],
  'E07000244': ['E07000203', 'E07000145'],
  'E06000025': ['E07000080', 'E07000082'],
  'E06000052': ['E07000046', 'E07000047'],
  'E07000144': ['E07000147', 'E07000143'],
  'E06000016': ['E07000135', 'E07000131'],
  'E06000047': ['E07000166'],
  'E07000064': ['E07000062'],
  'E07000011': ['E07000009'],
  'E07000139': ['E07000136'],
  'E08000019': ['E07000035'],
  'E07000070': ['E07000075', 'E07000074', 'E07000077', 'E07000068'],
  'E07000246': ['E07000042', 'E07000043'],
  'E06000031': ['E07000152', 'E07000140'],
  'E07000044': ['E07000045', 'E06000027'],
  'E07000154': ['E07000151'],
  'E07000065': ['E07000061'],
  'E07000242': ['E07000073'],
  'E07000071': ['E07000200'],
  'E08000025': ['E07000218'],
  'E06000051': ['E07000235'],
  'E08000032': ['E07000163'],
 

In [24]:
## invert the merger table: each absorbed (child) LTLA points to the
## (parent) LTLA it was merged into
mergers_rev = {
    child: parent
    for parent, children in mergers['mergers'].items()
    for child in children
}
    

In [21]:
## write the child -> parent LTLA table as a two-column TSV
## (header row, one merger per line, no trailing newline)
merger_rows = [f'{child}\t{parent}' for child, parent in mergers_rev.items()]
with open('./output/mergers_rev.tsv', 'w+') as outfile:
    outfile.write('child_ltla\tparent_ltla\n' + '\n'.join(merger_rows))
    