In [4]:
import pandas as pd
import numpy as np
import os
import subprocess
import datetime
from pathlib import Path
from screed import ScreedDB

In [5]:
# Columns kept from the metadata export; every other column is dropped.
metadata_columns = [
    "Accession ID",
    "Collection date",
    "Submission date",
    "Location",
    "Additional location information",
    "Sequence length",
    "Host",
    "AA Substitutions",
    "Is reference?",
]

# Read the tab-separated export, asking pandas to parse both date columns.
metadata = pd.read_csv(
    "../data/metadata_2021_04_08.tsv",
    sep="\t",
    parse_dates=["Collection date", "Submission date"],
)
metadata = metadata[metadata_columns]
def get_nth_slash(row, n):
    """Return the ``n``-th "/"-separated field of ``row``, stripped of whitespace.

    Parameters
    ----------
    row : str
        A slash-delimited string such as ``"North America / USA / Texas"``.
        Non-string values (e.g. a float NaN or ``None``) are tolerated.
    n : int
        Zero-based index of the field to extract.

    Returns
    -------
    str or float
        The stripped field, or ``np.nan`` when ``row`` has no ``split`` method
        or has fewer than ``n + 1`` fields.
    """
    try:
        return row.split("/")[n].strip()
    except (AttributeError, IndexError):
        # AttributeError: row is not a string (missing value).
        # IndexError: the string has fewer fields than requested.
        # Anything else (including KeyboardInterrupt) is allowed to propagate.
        return np.nan
    
# Break "Location" into its slash-separated levels, one new column per level
# (Location_0 .. Location_3). get_nth_slash yields NaN where a level is absent.
for level in range(4):
    metadata[f"Location_{level}"] = metadata["Location"].apply(get_nth_slash, args=(level,))

# Levels 1 and 2 get readable names; levels 0 and 3 keep their generated names.
metadata = metadata.rename(columns={"Location_1": "country", "Location_2": "state"})

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
def prep_gengraph_input_output(date, end_date, seq_names):
    """Create the run directory and the sequence-list file for one date window.

    Parameters
    ----------
    date, end_date : objects providing ``strftime``
        Start and end of the window; both are formatted as ``YYYY-MM-DD``.
    seq_names : iterable of str
        Sequence names; each one is written as a row of the sequence-list file
        together with a generated alignment name and its ``.fasta`` path.

    Returns
    -------
    tuple
        ``(run_path, seq_file_dest, label)`` where ``label`` is
        ``"<date>_<end_date>"``, ``run_path`` is the directory created for the
        window and ``seq_file_dest`` is the file that was written.
    """
    window_label = f"{date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}"

    output_path = Path("/sfs/lustre/bahamut/scratch/jho5ze/bionets/covid/data/Houston_output")
    run_path = output_path / window_label
    # parents/exist_ok: works when the output root is missing and when the
    # window directory already exists (same policy as run_pipeline).
    run_path.mkdir(parents=True, exist_ok=True)

    seq_path = Path("/sfs/lustre/bahamut/scratch/jho5ze/bionets/covid/data/Houston")
    seq_file_dest = run_path / f"{window_label}_seq_input.txt"
    with open(seq_file_dest, "w") as dest:
        # NOTE(review): the header names four columns but each row below has
        # three fields (no annotation_path) -- confirm the consumer accepts that.
        dest.write("seq_name\taln_name\tseq_path\tannotation_path\n")
        for ix, seq in enumerate(seq_names):
            dest.write(f"{seq}\tseq_{ix}\t{seq_path / seq}.fasta\n")

    return run_path, seq_file_dest, window_label

def run_pipeline(date, end_date, seq_names, exp_name, location="Houston"):
    """Build the ``sbatch`` argument list for one location/date window.

    The run directory
    ``<output>/<location>/<exp_name>/<date>_<end_date>`` is created as a side
    effect. The job itself is NOT submitted; the caller passes the returned
    list to ``subprocess.run``.

    Parameters
    ----------
    date, end_date : objects providing ``strftime``
        Start and end of the window; both are formatted as ``YYYY-MM-DD``.
    seq_names : iterable
        Sequence names appended, as strings, after the fixed arguments
        (the names themselves, not FASTA paths).
    exp_name : str
        Experiment name, used as a directory level under the location.
    location : str, default "Houston"
        Location name; spaces are replaced by underscores for the path.

    Returns
    -------
    list of str
        ``["sbatch", <script>, <run_path>, <uncert_file>, *seq_names]``.
    """
    location = location.replace(" ", "_")
    base_path = Path("/scratch/jho5ze/bionets/covid/")
    output_path = Path("/sfs/lustre/bahamut/scratch/jho5ze/bionets/covid/data/output")

    window_label = f"{date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}"
    run_path = output_path / location / exp_name / window_label
    uncert_file = run_path / "uncert.csv"

    run_path.mkdir(parents=True, exist_ok=True)

    # Build the argv list directly instead of splitting a formatted string, so
    # whitespace inside exp_name cannot break a path into several arguments.
    command = [
        "sbatch",
        str(base_path / "scripts/run_pipeline.sbatch"),
        str(run_path),
        str(uncert_file),
    ]
    command += [f"{seq}" for seq in seq_names]
    return command
#     subprocess.run(command)
#     with open(seq_file_dest, "w") as dest:
#         dest.write("seq_name\taln_name\tseq_path\tannotation_path\n")
#         for ix, seq in enumerate(seq_names):
#             dest.write(f"{seq}\tseq_{ix}\t{seq_path / seq}.fasta\n")
            
#     return run_path, seq_file_dest, f"{date}_{end_date}"

In [3]:
# "<Location_with_underscores> <start>_<end>" keys, one per location/window.
# The submission loop further down only submits windows whose key is listed here.
timeout = [
    "Houston 2020-07-01_2020-07-07",
    "Houston 2020-09-09_2020-09-15",
    "Houston 2020-09-16_2020-09-22",
    "Houston 2020-12-23_2020-12-29",
    "Houston 2021-01-06_2021-01-12",
    "Houston 2021-01-20_2021-01-26",
    "Houston 2021-01-27_2021-02-02",
    "Houston 2021-02-03_2021-02-09",
    "Houston 2021-02-10_2021-02-16",
    "Houston 2021-02-17_2021-02-23",
    "Houston 2021-03-03_2021-03-09",
    "Houston 2020-07-08_2020-07-14",
    "Houston 2020-12-30_2021-01-05",
    "Houston 2021-01-13_2021-01-19",
    "Houston 2021-02-24_2021-03-02",
    "New_York_City 2021-01-25_2021-01-31",
    "New_York_City 2021-02-08_2021-02-14",
    "New_York_City 2021-02-15_2021-02-21",
    "New_York_City 2021-02-22_2021-02-28",
    "New_York_City 2021-03-01_2021-03-07",
    "New_York_City 2021-03-08_2021-03-14",
    "New_York_City 2021-03-15_2021-03-21",
    "New_York_City 2021-03-22_2021-03-28",
    "New_York_City 2021-03-29_2021-04-04",
    "Dane_County 2021-01-09_2021-01-15",
    "Yakima_County 2021-01-19_2021-01-25",
    "King_County 2021-01-15_2021-01-21",
    "King_County 2021-01-22_2021-01-28",
    "King_County 2021-01-29_2021-02-04",
    "King_County 2021-02-05_2021-02-11",
    "King_County 2021-02-26_2021-03-04",
]
timeout

['Houston 2020-07-01_2020-07-07',
 'Houston 2020-09-09_2020-09-15',
 'Houston 2020-09-16_2020-09-22',
 'Houston 2020-12-23_2020-12-29',
 'Houston 2021-01-06_2021-01-12',
 'Houston 2021-01-20_2021-01-26',
 'Houston 2021-01-27_2021-02-02',
 'Houston 2021-02-03_2021-02-09',
 'Houston 2021-02-10_2021-02-16',
 'Houston 2021-02-17_2021-02-23',
 'Houston 2021-03-03_2021-03-09',
 'Houston 2020-07-08_2020-07-14',
 'Houston 2020-12-30_2021-01-05',
 'Houston 2021-01-13_2021-01-19',
 'Houston 2021-02-24_2021-03-02',
 'New_York_City 2021-01-25_2021-01-31',
 'New_York_City 2021-02-08_2021-02-14',
 'New_York_City 2021-02-15_2021-02-21',
 'New_York_City 2021-02-22_2021-02-28',
 'New_York_City 2021-03-01_2021-03-07',
 'New_York_City 2021-03-08_2021-03-14',
 'New_York_City 2021-03-15_2021-03-21',
 'New_York_City 2021-03-22_2021-03-28',
 'New_York_City 2021-03-29_2021-04-04',
 'Dane_County 2021-01-09_2021-01-15',
 'Yakima_County 2021-01-19_2021-01-25',
 'King_County 2021-01-15_2021-01-21',
 'King_County 

In [14]:
# Scratch check of the XOR idiom: for two integers, `a ^ b` is zero exactly
# when they are equal, so the message prints when both flags match.
seqs = 1
genomes = 1
if (seqs ^ genomes) == 0:
    print("yep")

yep


In [24]:
# Root of the project's data directory on scratch storage.
data_path = Path("/scratch/jho5ze/bionets/covid/data")

# Header-less two-column CSV; column 1 is parsed as a date.
seqs = pd.read_csv(data_path / "houston_metadata.csv", header=None, parse_dates=[1])
seqs.columns = ["seq", "date"]

date_window = 7       # window length in days (inclusive of both ends)
date_step_size = 7    # days between consecutive window starts

# Walk the observed date range in fixed steps and select the sequences that
# fall inside each window. Nothing is submitted from this cell: the calls to
# prep_gengraph_input_output / run_pipeline and subprocess.run that used to
# follow are disabled, so the loop only leaves the last window's values in
# the notebook namespace.
window_starts = pd.date_range(seqs.date.min(), seqs.date.max(), freq=f"{date_step_size}D")
for date in window_starts:
    end_date = date + np.timedelta64(date_window - 1, 'D')
    in_window = seqs.date.between(date, end_date)
    sub_seqs = seqs.loc[in_window, "seq"]
    experiment_name = f"window_{date_window}_step_{date_step_size}"
    stdate, stend_date = (day.strftime("%Y-%m-%d") for day in (date, end_date))


In [66]:
# Shape of the Location_3 column restricted to USA rows (i.e. the USA row count).
metadata.loc[metadata["country"] == "USA", "Location_3"].shape

(246062,)

In [74]:
# How many USA rows have no fourth location level (Location_3 is NaN).
metadata.loc[metadata["country"] == "USA", "Location_3"].isna().sum()

139992

In [48]:
# Spot-check a single record by its accession ID.
metadata.loc[metadata["Accession ID"] == "EPI_ISL_1303700"]

Unnamed: 0,Accession ID,Collection date,Submission date,Location,Additional location information,Sequence length,Host,AA Substitutions,Is reference?,Location_0,country,state,Location_3
830371,EPI_ISL_1303700,2021-02-24,2021-03-21,North America / USA / Texas / Houston,,29859,Human,"(NSP6_G107del,N_S194L,NSP6_S106del,Spike_E484K...",,North America,USA,Texas,Houston


In [8]:
# Open the screed database for the USA alignment FASTA. Later cells use
# `name in msadb` to keep only accessions that are present in it.
usa_msa_fasta = "../data/msa_0408/usa_msa_0408.fasta"
msadb = ScreedDB(usa_msa_fasta)

In [20]:
top_k = 10

# The top_k most frequent Location_3 values among USA records.
usa_metadata = metadata[metadata.country == "USA"]
top_k_sequenced_us = usa_metadata.Location_3.value_counts().head(top_k).index

# Pair each of those locations with the state it is most often recorded under,
# so that same-named locations in other states can be filtered out later.
top_state_county = []
for county in top_k_sequenced_us:
    states_for_county = usa_metadata.loc[usa_metadata.Location_3 == county, "state"]
    top_state = states_for_county.value_counts().index[0]
    print(county, ":", top_state, end="\n\n")
    top_state_county.append((top_state, county))

Houston : Texas

New York City : New York

San Diego : California

Santa Clara County : California

Dane County : Wisconsin

Alameda County : California

Yakima County : Washington

King County : Washington

Orange County : California

Los Angeles County : California



In [21]:
# Display the (state, location) pairs collected for the most-sequenced USA locations.
top_state_county

[('Texas', 'Houston'),
 ('New York', 'New York City'),
 ('California', 'San Diego'),
 ('California', 'Santa Clara County'),
 ('Wisconsin', 'Dane County'),
 ('California', 'Alameda County'),
 ('Washington', 'Yakima County'),
 ('Washington', 'King County'),
 ('California', 'Orange County'),
 ('California', 'Los Angeles County')]

In [11]:
# Re-submit pipeline jobs for the location/date windows listed in `timeout`.
#
# NOTE: Orange County needs to be redone -- an earlier run had a few New York
# sequences (and a few others) mixed in. Filtering on state as well as
# location below keeps same-named locations in other states out.
date_window = 7       # window length in days (inclusive of both ends)
date_step_size = 7    # days between consecutive window starts
experiment_name = f"window_{date_window}_step_{date_step_size}"

# Iterate the (state, location) pairs. `top_k_sequenced_us` is an Index of bare
# location names and cannot be unpacked into two values on a fresh kernel.
for state, location in top_state_county:
    in_location = (
        (metadata.country == "USA")
        & (metadata.state == state)
        & (metadata.Location_3 == location)
    )
    location_metadata = metadata.loc[in_location, ["Accession ID", "Collection date"]]
    location_metadata.columns = ["seq", "date"]
    location_metadata = location_metadata.dropna()

    if location_metadata.empty:
        # No dated records: min()/max() would be NaT and date_range would fail.
        continue

    window_starts = pd.date_range(
        location_metadata.date.min(),
        location_metadata.date.max(),
        freq=f"{date_step_size}D",
    )
    for date in window_starts:
        end_date = date + np.timedelta64(date_window - 1, 'D')
        in_window = (location_metadata.date >= date) & (location_metadata.date <= end_date)
        # Keep only accessions that are present in the alignment database.
        sub_seqs = [s for s in location_metadata[in_window].seq if s in msadb]

        stdate = date.strftime("%Y-%m-%d")
        stend_date = end_date.strftime("%Y-%m-%d")

        # run_pipeline also creates the run directory, so it is called for
        # every window (as before), whether or not the job is submitted.
        command = run_pipeline(date, end_date, sub_seqs, experiment_name, location)

        window_key = f"{location.replace(' ', '_')} {stdate}_{stend_date}"
        if len(sub_seqs) > 0 and window_key in timeout:
            print(location, stdate, len(sub_seqs))
            subprocess.run(command)

Houston 2020-07-01 1828
Houston 2020-07-08 1203
Houston 2020-09-09 135
Houston 2020-09-16 156
Houston 2020-12-23 546
Houston 2020-12-30 1645
Houston 2021-01-06 1500
Houston 2021-01-13 1709
Houston 2021-01-20 1319
Houston 2021-01-27 535
Houston 2021-02-03 1001
Houston 2021-02-10 446
Houston 2021-02-17 813
Houston 2021-02-24 1163
Houston 2021-03-03 526
New York City 2021-01-25 110
New York City 2021-02-08 702
New York City 2021-02-15 852
New York City 2021-02-22 980
New York City 2021-03-01 669
New York City 2021-03-08 1306
New York City 2021-03-15 696
New York City 2021-03-22 1343
New York City 2021-03-29 241
Dane County 2021-01-09 68
Yakima County 2021-01-19 27
King County 2021-01-15 192
King County 2021-01-22 84
King County 2021-01-29 34
King County 2021-02-05 44
King County 2021-02-26 36
