In [55]:
import os
import sqlite3
import random
import pandas as pd
import copy

# Fixed seed so the random per-branch draws below are repeatable on a fresh kernel run.
SEED = 1234
random.seed(SEED)

In [2]:
# Project root is taken as two directory levels above the notebook's working directory.
project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
database = os.path.join(project_dir, "results", "sqlite_db", "yersinia_pestis_db.sqlite")

# sqlite3.connect() creates a new, empty database when the file does not exist,
# which would only surface later as a confusing "no such table" error.
# Fail early with a clear message instead.
if not os.path.isfile(database):
    raise FileNotFoundError("SQLite database not found: {}".format(database))

conn = sqlite3.connect(database)
cur = conn.cursor()

## 1 Sample Per Branch

In [3]:
# Select every BioSample whose comment is flagged KEEP and is not a "Local" record.
# String literals use single quotes: double-quoted strings are only accepted by
# SQLite as a legacy quirk and are otherwise parsed as identifiers.
# NOTE(review): no column from Assembly is selected; if a BioSample can match several
# Assembly rows, the LEFT JOIN repeats it and over-weights it in the draw - confirm.
query = """
    SELECT
      BioSampleAccession,
      BioSampleBranch,
      BioSampleComment
    FROM
      BioSample
    LEFT Join
      Assembly ON BioSampleAccession==AssemblyBioSampleAccession
    WHERE
      BioSampleComment LIKE '%KEEP%' AND BioSampleComment NOT LIKE '%Local%'
    """

result = cur.execute(query).fetchall()

# Create a mapping of branches to samples: branch -> [(accession, origin), ...]
branch_dict = {}
for accession, branch, comment in result:
    origin = "ancient" if "Ancient" in comment else "modern"
    # Strip sub branches that are letter designations (trailing letters).
    # Guard against empty / NULL / all-letter labels, which previously raised
    # IndexError or TypeError; such labels are kept unchanged.
    stripped = branch
    while stripped and stripped[-1].isalpha():
        stripped = stripped[:-1]
    branch = stripped or branch
    branch_dict.setdefault(branch, []).append((accession, origin))

# Randomly sample one (accession, origin) pair per branch.
# The draw relies on the global `random` state seeded in the import cell, so
# re-running this cell alone continues the RNG stream and gives a different pick.
random_samples = {}
for branch, candidates in branch_dict.items():
    random_samples[branch] = random.choice(candidates)

# print out
for branch, sample in random_samples.items():
    print(branch, sample)

2.MED1 ('SAMN15063847', 'modern')
0.ANT1 ('SAMN07722911', 'modern')
4.ANT1 ('SAMN12991207', 'modern')
0.PE2 ('SAMN05521424', 'modern')
2.MED0 ('SAMN07176224', 'modern')
1.PRE1 ('SAMEA5818828', 'ancient')
1.PRE0 ('SAMEA5818806', 'ancient')
0.PE5 ('SAMN12138635', 'modern')
0.ANT4 ('SAMEA5661367', 'ancient')
1.ORI1 ('SAMN02403946', 'modern')
0.ANT3 ('SAMN02777961', 'modern')
2.ANT3 ('SAMN02769655', 'modern')
1.ANT1 ('SAMN02404404', 'modern')
1.ORI3 ('SAMN02470617', 'modern')
2.MED2 ('SAMN02403012', 'modern')
0.ANT2 ('SAMN02402994', 'modern')
1.ORI2 ('SAMN02403056', 'modern')
1.IN3 ('SAMN02403073', 'modern')
1.PRE2 ('SAMEA5054090', 'ancient')
0.PRE1 ('SAMEA104233048', 'ancient')
0.PE8 ('SAMEA104488961', 'ancient')
1.PRE3 ('SAMEA3937654', 'ancient')
3.ANT2 ('SAMN05521438', 'modern')
0.PE4 ('SAMN02403037', 'modern')
0.ANT5 ('SAMN05150065', 'modern')
2.ANT1 ('SAMN02403941', 'modern')
1.IN2 ('SAMN02403071', 'modern')
2.MED3 ('SAMN13907443', 'modern')
1.IN1 ('SAMN02403014', 'modern')
2.ANT2 ('S

## Explore Diversity

In [9]:
# Parse the metadata
# Load the tab-separated sample metadata, replace missing values with the
# string "NA", and key the table by the "sample" column.
metadata_path = os.path.join(project_dir, "results", "metadata", "all", "metadata.tsv")

metadata_df = (
    pd.read_csv(metadata_path, sep="\t")
    .fillna("NA")
    .set_index("sample")
)
metadata_df

Unnamed: 0_level_0,strain,date,date_bp,country,province,country_lat,country_lon,province_lat,province_lon,biovar,branch_major,branch_minor,biosample_accession,biosample_comment,branch_number,continent
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
GCA_009670865.1_ASM967086v1_genomic,42012,1971,-50,China,Xinjiang,35.000074,104.999927,42.480495,85.463346,Antiqua,0.ANT,0.ANT1,SAMN07722853,KEEP: Assembly Modern Placement,0,Asia
GCA_008630435.1_ASM863043v1_genomic,C-719,1996,-25,Russia,Karachay-Cherkessia,64.686314,97.745306,43.736833,41.726799,Medievalis,2.MED,2.MED0,SAMN12721146,KEEP: Assembly Modern Placement,2,Europe
GCA_000170275.1_ASM17027v1_genomic,F1991016,1991,-30,China,Yunnan,35.000074,104.999927,25.0,102.0,Orientalis,1.ORI,1.ORI2,SAMN02404399,KEEP: Assembly Modern Morelli 2010 Cui 2013 Ke...,1,Asia
GCA_001294975.1_ASM129497v1_genomic,A-1804,1980,-41,Kyrgyzstan,Talas Region,41.508932,74.724091,42.445187,72.143105,Talassica,0.PE,0.PE4t,SAMN03861481,KEEP: Assembly Modern Kislichkina 2015 Keller ...,0,Asia
GCA_002981895.1_ASM298189v1_genomic,I-2231,1972,-49,Mongolia,Bayan-Ölgii,46.825039,103.849974,48.547008,89.854936,Ulegeica,0.PE,0.PE5,SAMN08625964,KEEP: Assembly Modern Kislichkina 2018b Keller...,0,Asia
GCA_003074535.1_ASM307453v1_genomic,5M,1952,-69,Kyrgyzstan,Issyk-Kul Region,41.508932,74.724091,42.061034,78.170732,Antiqua,0.ANT,0.ANT5,SAMN08866745,KEEP: Assembly Modern Kutyrev 2018 Keller 2019...,0,Asia
GCA_002005285.1_ASM200528v1_genomic,195/P,[1898:1950],[-123:-71],India,Maharashtra,22.351115,78.667743,18.906836,75.674158,Antiqua,2.ANT,2.ANT1,SAMN06161235,KEEP: Assembly Modern Placement,2,Asia
GCA_003074235.1_ASM307423v1_genomic,I-3244,1988,-33,Mongolia,Bayan-Ölgii,46.825039,103.849974,48.547008,89.854936,Antiqua,4.ANT,4.ANT1,SAMN05225370,KEEP: Assembly Modern Placement,4,Asia
GCA_000835005.1_ASM83500v1_genomic,Nairobi,1985,-36,Kenya,Nairobi,1.441968,38.431397,-1.303169,36.826061,Antiqua,1.ANT,1.ANT1,SAMN03256395,KEEP: Assembly Modern Placement,1,Africa
GCA_001188795.1_ASM118879v1_genomic,3067,[1966:1997],[-55:-24],Georgia,Samtskhe-Javakheti,41.680971,44.028738,41.526535,43.246957,Caucasica,0.PE,0.PE2,SAMN03943655,KEEP: Assembly Modern Zhgenti 2015 Keller 2019...,0,Asia


In [29]:
# Parse the distance matrix
dist_mat_path = os.path.join(project_dir, "results", "snippy_multi", "all", "chromosome", "filter100", "snippy-multi.snps.dist")
# Read the distance matrix itself (the original read `metadata_path` here by mistake)
# and use its first column, which holds the row sample names, as the index.
dist_mat_df = pd.read_csv(dist_mat_path, sep='\t', index_col=0)

Unnamed: 0_level_0,GCA_000170275.1_ASM17027v1_genomic,GCA_000269145.1_ASM26914v1_genomic,GCA_000320465.1_ASM32046v1_genomic,GCA_000323485.1_ASM32348v1_genomic,GCA_000323505.1_ASM32350v1_genomic,GCA_000323565.1_ASM32356v1_genomic,GCA_000323625.1_ASM32362v1_genomic,GCA_000323845.1_ASM32384v1_genomic,GCA_000323945.1_ASM32394v1_genomic,GCA_000323965.1_ASM32396v1_genomic,...,GCA_001294975.1_ASM129497v1_genomic,GCA_002005285.1_ASM200528v1_genomic,GCA_002981895.1_ASM298189v1_genomic,GCA_003074235.1_ASM307423v1_genomic,GCA_003074535.1_ASM307453v1_genomic,GCA_008630435.1_ASM863043v1_genomic,GCA_009670865.1_ASM967086v1_genomic,GCA_015158755.1_ASM1515875v1_genomic,GCA_016102905.1_ASM1610290v1_genomic,Reference
snp-dists 0.7.0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_000170275.1_ASM17027v1_genomic,0,0,77,504,82,126,183,498,115,32,...,204,116,192,75,85,123,128,124,138,1
GCA_000269145.1_ASM26914v1_genomic,0,0,77,501,81,126,180,496,115,32,...,200,116,188,72,82,120,123,121,135,1
GCA_000320465.1_ASM32046v1_genomic,77,77,0,413,22,68,105,409,59,40,...,115,60,102,11,1,60,39,58,70,76
GCA_000323485.1_ASM32348v1_genomic,504,501,413,0,445,472,311,0,441,444,...,327,486,337,426,417,470,390,472,496,505
GCA_000323505.1_ASM32350v1_genomic,82,81,22,445,0,70,125,439,60,44,...,136,64,124,11,23,64,62,60,73,81
GCA_000323565.1_ASM32356v1_genomic,126,126,68,472,70,0,168,468,2,85,...,181,62,171,61,71,11,108,58,1,125
GCA_000323625.1_ASM32362v1_genomic,183,180,105,311,125,168,0,310,148,140,...,6,166,19,118,110,166,71,163,176,182
GCA_000323845.1_ASM32384v1_genomic,498,496,409,0,439,468,310,0,434,441,...,322,479,333,418,410,461,384,463,486,499
GCA_000323945.1_ASM32394v1_genomic,115,115,59,441,60,2,148,434,0,72,...,165,57,158,54,64,5,99,49,2,115
GCA_000323965.1_ASM32396v1_genomic,32,32,40,444,44,85,140,441,72,0,...,150,75,138,31,41,77,79,75,89,31


In [76]:
# Group samples by branch (sub-clade letter removed) and, for every branch with
# more than one sample, print the country, date and nearest-neighbour distance.
branch_dict = {}
samples_df = metadata_df.copy()

for sample, row in metadata_df.iterrows():
    branch = row["branch_minor"]

    # Remove the subclade letter (trailing letters). Labels made only of letters,
    # e.g. the "NA" placeholder written by fillna, previously raised IndexError
    # once the string was emptied; they are now kept unchanged.
    stripped = branch
    while stripped and stripped[-1].isalpha():
        stripped = stripped[:-1]
    branch = stripped or branch

    branch_dict.setdefault(branch, []).append(sample)

for branch, samples in branch_dict.items():
    # there's only one sample, don't edit dataframe
    if len(samples) == 1:
        continue
    # Get metadata for branch
    print(branch)
    for sample in samples:
        country = samples_df.loc[sample, "country"]
        date = samples_df.loc[sample, "date"]
        # Exclude the sample's own column: the self-distance is always 0 and
        # would otherwise make the minimum uninformative.
        min_dist = dist_mat_df.loc[sample].drop(labels=sample, errors="ignore").min()
        print(sample, country, date, min_dist)
        # NOTE(review): this unconditional break means only the first sample of each
        # branch is reported - looks like a debugging leftover; confirm before removing.
        break

#display(samples_df)

1.ORI3
GCA_000590535.1_EV_NIIEG_genomic Russia 1941 0
0.PE7
GCA_000323845.1_ASM32384v1_genomic China 1961 0
