In [1]:
import os
import sqlite3
import random
import pandas as pd
import copy

# Seed Python's global random number generator so that the random.choice
# calls later in the notebook give a repeatable sequence when the cells are
# run in order from a fresh kernel.
random.seed(1234)

In [2]:
# Project root: two directory levels above the current working directory.
parent_dir = os.path.dirname(os.getcwd())
project_dir = os.path.dirname(parent_dir)

# Path of the SQLite database, relative to the project root.
database = os.path.join(
    project_dir,
    "results",
    "sqlite_db",
    "yersinia_pestis_db.sqlite",
)

# Open the database and keep a cursor around for the queries below.
conn = sqlite3.connect(database)
cur = conn.cursor()

## 1 Sample Per Branch

In [3]:
# Select the accession, branch and comment of every BioSample record whose
# comment contains KEEP and does not contain Local.
# String literals are single-quoted: SQLite resolves double-quoted text as an
# identifier first and only falls back to a string literal as a legacy quirk
# that can be switched off when SQLite is built.
# NOTE(review): no column from the joined Assembly table is selected here;
# confirm whether the LEFT JOIN is still needed.
query = """
    SELECT
      BioSampleAccession,
      BioSampleBranch,
      BioSampleComment
    FROM
      BioSample
    LEFT Join
      Assembly ON BioSampleAccession==AssemblyBioSampleAccession
    WHERE
      BioSampleComment LIKE '%KEEP%' AND BioSampleComment NOT LIKE '%Local%'
    """

result = cur.execute(query).fetchall()

# Create a mapping of branches to samples: branch -> [(accession, origin), ...]
branch_dict = {}
for accession, branch, comment in result:
    origin = "ancient" if "Ancient" in comment else "modern"
    # Strip sub branches that are letter designations.
    # The emptiness check keeps the loop from indexing an empty (or missing)
    # label; a label made up only of letters is kept unchanged instead of
    # being reduced to nothing.
    trimmed = branch
    while trimmed and trimmed[-1].isalpha():
        trimmed = trimmed[:-1]
    branch = trimmed or branch
    branch_dict.setdefault(branch, []).append((accession, origin))

# Randomly pick one (accession, origin) pair per branch. This draws from the
# global random generator, in the order the branches were first seen.
random_samples = {}
for branch, candidates in branch_dict.items():
    random_samples[branch] = random.choice(candidates)

# print out
for branch, sample in random_samples.items():
    print(branch, sample)

2.MED1 ('SAMN15063847', 'modern')
0.ANT1 ('SAMN07722911', 'modern')
4.ANT1 ('SAMN12991207', 'modern')
0.PE2 ('SAMN05521424', 'modern')
2.MED0 ('SAMN07176224', 'modern')
1.PRE1 ('SAMEA5818828', 'ancient')
1.PRE0 ('SAMEA5818806', 'ancient')
0.PE5 ('SAMN12138635', 'modern')
0.ANT4 ('SAMEA5661367', 'ancient')
1.ORI1 ('SAMN02403946', 'modern')
0.ANT3 ('SAMN02777961', 'modern')
2.ANT3 ('SAMN02769655', 'modern')
1.ANT1 ('SAMN02404404', 'modern')
1.ORI3 ('SAMN02470617', 'modern')
2.MED2 ('SAMN02403012', 'modern')
0.ANT2 ('SAMN02402994', 'modern')
1.ORI2 ('SAMN02403056', 'modern')
1.IN3 ('SAMN02403073', 'modern')
1.PRE2 ('SAMEA5054090', 'ancient')
0.PRE1 ('SAMEA104233048', 'ancient')
0.PE8 ('SAMEA104488961', 'ancient')
1.PRE3 ('SAMEA3937654', 'ancient')
3.ANT2 ('SAMN05521438', 'modern')
0.PE4 ('SAMN02403037', 'modern')
0.ANT5 ('SAMN05150065', 'modern')
2.ANT1 ('SAMN02403941', 'modern')
1.IN2 ('SAMN02403071', 'modern')
2.MED3 ('SAMN13907443', 'modern')
1.IN1 ('SAMN02403014', 'modern')
2.ANT2 ('S

## Explore Diversity

In [4]:
# Load the sample metadata table (tab-separated), replace every missing value
# with the string "NA", and index the rows by the "sample" column.
metadata_path = os.path.join(
    project_dir, "results", "metadata", "all", "metadata.tsv"
)

metadata_df = (
    pd.read_csv(metadata_path, sep="\t")
    .fillna("NA")
    .set_index("sample")
)
metadata_df

Unnamed: 0_level_0,strain,date,date_bp,country,province,country_lat,country_lon,province_lat,province_lon,biovar,branch_major,branch_minor,biosample_accession,biosample_comment,branch_number,continent
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
GCA_009909635.1_ASM990963v1_genomic,9_10,1923,-98,Russia,Rostov Oblast,64.686314,97.745306,47.6222,40.7958,Medievalis,2.MED,2.MED1,SAMN13632815,KEEP: Assembly Modern,2,Europe
GCA_009669545.1_ASM966954v1_genomic,42126,2006,-15,China,Xinjiang,35.000074,104.999927,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722925,KEEP: Assembly Modern,0,Asia
GCA_009669555.1_ASM966955v1_genomic,42123,2005,-16,China,Xinjiang,35.000074,104.999927,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722924,KEEP: Assembly Modern,0,Asia
GCA_009669565.1_ASM966956v1_genomic,42118,2005,-16,China,Xinjiang,35.000074,104.999927,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722923,KEEP: Assembly Modern,0,Asia
GCA_009669605.1_ASM966960v1_genomic,42117,2005,-16,China,Xinjiang,35.000074,104.999927,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722922,KEEP: Assembly Modern,0,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMEA7313243_45,Azov38,[1400:1700],[-621:-321],Russia,Rostov Oblast,64.686314,97.745306,47.6222,40.7958,Second Pandemic,1.PRE,1.PRE1,SAMEA7313243_45,KEEP: SRA Ancient Combined Record,1,Europe
SAMEA7313246_49,Gdansk8,[1400:1700],[-621:-321],Poland,Pomeranian Voivodeship,52.215933,19.134422,54.2456,18.1099,Second Pandemic,1.PRE,1.PRE1,SAMEA7313246_49,KEEP: SRA Ancient Combined Record,1,Europe
SAMEA6651390,AGU010,[1435:1477],[-586:-544],Lithuania,Vilnius County,55.350000,23.750000,54.8227,25.2495,Second Pandemic,1.PRE,1.PRE1,SAMEA6651390,KEEP: SRA Ancient,1,Europe
SAMEA6637004,AGU025,[1441:1612],[-580:-409],Lithuania,Vilnius County,55.350000,23.750000,54.8227,25.2495,Second Pandemic,1.PRE,1.PRE1,SAMEA6637004,KEEP: SRA Ancient,1,Europe


In [5]:
# Parse the distance matrix (currently disabled)
#dist_mat_path = os.path.join(project_dir, "results", "snippy_multi", "all","chromosome", "filter100", "snippy-multi.snps.dist")
#dist_mat_df = pd.read_csv(dist_mat_path, sep='\t')
#dist_mat_df.set_index(dist_mat_df.columns[0], inplace=True)

In [12]:
# Thin the samples per branch and location:
#   * the first sample seen for a (branch, location) pair is always kept;
#   * every later sample dated before ANCIENT_DATE_THRESHOLD is kept;
#   * a later sample dated on or after the threshold is kept only if its date
#     is at least TIME_WINDOW years away from every date already kept for that
#     location.
# The result is branch_dict[branch][GEO][location]["dates"][date] = sample.
TIME_WINDOW = 25 # exclude if dates are within this range
ANCIENT_DATE_THRESHOLD = 1900 # samples older than this are considered ancient
GEO = "province"
GEO_ALT = "country"


def _strip_subclade_letters(branch):
    """Return ``branch`` without its trailing letters (the subclade designation).

    The emptiness check keeps the loop from indexing an empty string. A label
    made up only of letters (such as the "NA" placeholder) is returned
    unchanged instead of being reduced to nothing.
    """
    trimmed = branch
    while trimmed and trimmed[-1].isalpha():
        trimmed = trimmed[:-1]
    return trimmed or branch


def _mean_year(raw_date):
    """Convert a date field to a single integer year.

    ``raw_date`` is either one year ("1923") or a bracketed range
    ("[1400:1700]"). A range is reduced to the mean of its bounds, truncated
    to an integer.

    Raises ValueError if a part is not an integer (for example "NA").
    """
    bounds = [int(part) for part in str(raw_date).lstrip("[").rstrip("]").split(":")]
    return int(sum(bounds) / len(bounds))


branch_dict = {}
samples_df = copy.deepcopy(metadata_df)

for sample, row in metadata_df.iterrows():
    # Remove the subclade letter
    branch = _strip_subclade_letters(row["branch_minor"])

    # Prefer the GEO column and fall back to GEO_ALT when it is missing.
    geo_val = row[GEO]
    if geo_val == "NA":
        geo_val = row[GEO_ALT]

    date = _mean_year(row["date"])

    locations = branch_dict.setdefault(branch, {GEO: {}})[GEO]

    # Add the location if it hasn't been observed
    if geo_val not in locations:
        locations[geo_val] = {"dates": {date: sample}}
        continue

    kept_dates = locations[geo_val]["dates"]

    # Keep all ancient samples; keep a modern sample only when no already
    # kept date lies within TIME_WINDOW of it.
    # How to resolve ties? Want to minimize terminal branch length
    # NOTE(review): dates are the dict keys, so a sample whose date equals one
    # already stored for this location replaces that entry. Ancient samples
    # sharing a date are therefore not all retained.
    is_ancient = date < ANCIENT_DATE_THRESHOLD
    if is_ancient or all(abs(date - kept) >= TIME_WINDOW for kept in kept_dates):
        kept_dates[date] = sample

# Print the retained samples as a branch / location / date tree and count them.
sample_counter = 0
for branch in branch_dict:
    print()
    print(branch)
    for geo_val in branch_dict[branch][GEO]:
        print("\t", geo_val)
        for date, sample in branch_dict[branch][GEO][geo_val]["dates"].items():
            sample_counter += 1
            print("\t\t", date, sample)

print("Number of samples:", sample_counter)

#display(samples_df)


2.MED1
	 Rostov Oblast
		 1923 GCA_009909635.1_ASM990963v1_genomic
	 Chechnya
		 1953 GCA_009296005.1_ASM929600v1_genomic
	 Kabardino-Balkaria
		 1997 GCA_008630485.1_ASM863048v1_genomic
	 Republic of Dagestan
		 1984 GCA_008630395.1_ASM863039v1_genomic
	 Qazakh District
		 1984 GCA_008630575.1_ASM863057v1_genomic
	 Ingushetia
		 1970 GCA_006376535.1_ASM637653v1_genomic
	 Goranboy District
		 1968 GCA_006376495.1_ASM637649v1_genomic
	 Baku
		 1968 GCA_006376475.1_ASM637647v1_genomic
	 Shamkir District
		 1959 GCA_006376435.1_ASM637643v1_genomic
	 Stavropol Krai
		 1950 GCA_006376415.1_ASM637641v1_genomic
	 West Kazakhstan Region
		 1990 GCA_003086155.1_ASM308615v1_genomic
		 1950 GCA_016102755.1_ASM1610275v1_genomic
		 1917 GCA_016102575.1_ASM1610257v1_genomic
	 Astrakhan Oblast
		 1992 GCA_001617725.1_Yersinia_pestis_M-1484_genomic
		 1962 GCA_016102745.1_ASM1610274v1_genomic
		 1930 GCA_016102665.1_ASM1610266v1_genomic
	 Xinjiang
		 2005 GCA_004023595.1_ASM402359v1_genomic
	 Fizuli 