# 200625 Compare old/new genome signatures

In [1]:
from gzip import GzipFile

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from IPython.display import JSON

In [2]:
from midas.backports.signaturefile import SignatureFile

## Config

In [3]:
# Paths of the two gzipped signature files being compared, keyed by a short
# label: 'oldsigs' is the earlier reference database, 'newsigs' the newer one.
files = {
    'oldsigs': "/Users/student/projects/midas/data/reference-db-v2/refseq_assemblies_ATGAC11_2_0.midas-signatures.gz",
    'newsigs': "/Users/student/projects/midas/data/2019_20/refseq_curated_1.1beta_200604.midas-signatures.gz",
}

## Load signatures

In [4]:
# Open the old signature file through a gzip stream. The stream is handed to
# SignatureFile and deliberately left open, since later cells read from it.
oldfile = SignatureFile(GzipFile(filename=files['oldsigs']))

# Show the file-level metadata as a collapsible JSON tree (cell output).
old_metadata = oldfile.get_metadata()
JSON(old_metadata)

<IPython.core.display.JSON object>

In [5]:
# Open the new signature file through a gzip stream. The stream is handed to
# SignatureFile and deliberately left open, since later cells read from it.
newfile = SignatureFile(GzipFile(filename=files['newsigs']))

# Show the file-level metadata as a collapsible JSON tree (cell output).
new_metadata = newfile.get_metadata()
JSON(new_metadata)

<IPython.core.display.JSON object>

## Find common genomes by accession number

In [6]:
# IDs in the old file are used directly as accession numbers.
oldaccs = [*oldfile.ids]

# IDs in the new file are reduced to the text after the last '/'
# (presumably a "<prefix>/<accession>" layout -- TODO confirm against the file).
newaccs = [genome_id.rsplit('/', 1)[-1] for genome_id in newfile.ids]

In [7]:
# Accessions present in both files, sorted so that positions in `common_accs`
# give a stable ordering for the index lists built from it.
common_accs = sorted(set(oldaccs).intersection(newaccs))

# Summary: (old-only, total old, common, total new, new-only).
# The "-only" figures are plain length differences.
n_old, n_common, n_new = len(oldaccs), len(common_accs), len(newaccs)
(n_old - n_common, n_old, n_common, n_new, n_new - n_common)

(23596, 74160, 50564, 50752, 188)

In [8]:
# Position of each common accession within the old file.
#
# `oldaccs.index(a)` rescans the list for every accession, which is quadratic
# for tens of thousands of genomes. A one-pass lookup table gives the same
# answer: `setdefault` keeps the FIRST position seen, matching `list.index`.
old_index_of = {}
for old_pos, old_acc in enumerate(oldaccs):
    old_index_of.setdefault(old_acc, old_pos)

common_idxs_old = [old_index_of[a] for a in common_accs]

# Load the old signatures for the common genomes, in `common_accs` order.
oldsigs = oldfile.get_coords_collection(common_idxs_old)

In [9]:
# Position of each common accession within the new file.
#
# `newaccs.index(a)` rescans the list for every accession, which is quadratic
# for tens of thousands of genomes. A one-pass lookup table gives the same
# answer: `setdefault` keeps the FIRST position seen, matching `list.index`.
new_index_of = {}
for new_pos, new_acc in enumerate(newaccs):
    new_index_of.setdefault(new_acc, new_pos)

common_idxs_new = [new_index_of[a] for a in common_accs]

# Load the new signatures for the common genomes, in `common_accs` order.
newsigs = newfile.get_coords_collection(common_idxs_new)

## Compare signatures for common genomes

In [10]:
# Collect the accessions whose signature is not element-for-element identical
# between the two files. Entry i of `oldsigs` and `newsigs` belongs to
# common_accs[i], because both were loaded in `common_accs` order.
diff_accs = set()

progress = tqdm(common_accs)
for i, acc in enumerate(progress):
    signatures_match = np.array_equal(oldsigs[i], newsigs[i])
    if signatures_match:
        continue
    diff_accs.add(acc)

# Number of common genomes with differing signatures (cell output).
len(diff_accs)

100%|██████████| 50564/50564 [01:08<00:00, 733.79it/s] 


1

Exactly one common genome has signatures that differ between the two files. Quantify how much they differ:

In [11]:
# Unpack the single differing accession; this raises ValueError if
# `diff_accs` does not contain exactly one element.
[acc] = diff_accs

# Its position in `common_accs`, which is also its index in `oldsigs`/`newsigs`.
i = common_accs.index(acc)
acc

'GCF_000230875.1'

In [12]:
from midas.cython.metrics import jaccard_coords

In [13]:
# One minus the Jaccard score of the old and new signatures of the differing
# genome, i.e. their Jaccard distance (assumes `jaccard_coords` returns a
# similarity in [0, 1] -- consistent with the overlap counts computed below).
jaccard_score = jaccard_coords(oldsigs[i], newsigs[i])
1 - jaccard_score

0.00037616491317749023

In [14]:
# Count the elements shared by the old and new signature of the differing genome.
old_coords = set(oldsigs[i])
new_coords = set(newsigs[i])
ni = len(old_coords & new_coords)

# (elements only in old, shared elements, elements only in new)
(len(oldsigs[i]) - ni, ni, len(newsigs[i]) - ni)

(0, 10629, 4)