# Combine all fasta into one big .hdf5 file
@author Harald Ringbauer, March 2020

In [2]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket
import matplotlib.pyplot as plt
from itertools import groupby
from shutil import which
import os
import re as re

### Pick the right path (whether on cluster or at home)
socket_name = socket.gethostname()
print(f"Current machine: {socket_name}")
if socket_name == "DESKTOP-5RJD9NC":
    path = "/gitProjects/covid19_data"   # The Path on Harald's machine
if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/covid19_data/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
### Check whether required bins are available
req_bins = ["mafft"] 
for b in req_bins:
    s = which(b)
    if not s:
        print(f"Make sure to install {b} and have in path. I cannot find it!")
        
os.chdir(path)  # Set the right Path (in line with Atom default)
print(os.getcwd())

sys.path.append("./python3/")
from manipulate_fasta import fasta_iter_raw, fasta_iter

Current machine: compute-e-16-237.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/covid19_data


## Define Key Functions

In [10]:
def load_aligned_seq(fasta_path, align_id="Wuhan-Hu-1"):
    """Load Aligned Sequence fasta.
    Check against align_id and lgth.
    Return String"""
    fiter = fasta_iter(fasta_path)
    iid_re, _ = next(fiter)
    
    if len(align_id)>0:
        if not (align_id in iid_re):
            raise RuntimeWarning(f"Reference ID {align_id} does not match {iid_re}")
    iid, seq = next(fiter)  # Get the Meat
    return iid, seq

def combine_fasta_alignments(paths, align_id="Wuhan-Hu-1"):
    """Load, and combine all the paths """
    n = len(paths)
    
    _, seq = load_aligned_seq(paths[0], align_id=align_id)
    k = len(seq)
    
    seqs = np.empty((n,k), dtype="|S1")  # Create place holder for all sequences
    iids = np.empty(n, dtype="str") # Place holder for the iids
    
    for i, path in enumerate(paths):
        iid, seq = load_aligned_seq()
        iid[i], seq[i,:] = iid, seq
    return iids, seqs

# Run and Combine

In [35]:
aligned_path_fasta = "./output/single_seq_aligned.tsv"

df = pd.read_csv(aligned_path_fasta, sep="\t")
df = df[df["include"]==True].copy()
print(f"Loaded {len(df)} Sequences from {aligned_path_fasta}")

### Manually filling in (next iteration has it automatic)
df["aligned_path"] = "./output/singleseq_aligned/" + df["iid_clean"] + ".fasta"

Loaded 1746 Sequences from ./output/single_seq_aligned.tsv


In [13]:
df["aligned_path"].value_counts()

Series([], Name: aligned_path, dtype: int64)

In [37]:
paths = df["aligned_path"][:5]

iids, seqs = combine_fasta_alignments(paths, align_id="Wuhan-Hu-1")

AttributeError: module 'numpy' has no attribute 'emppty'

# Area 51

In [9]:
path_fasta = "./output/singleseq_aligned/hCoV-19.Beijing.105.2020.EPI_ISL_413518.2020-01-26.fasta"
#fiter = fasta_iter(path_fasta)
#iids = np.array([ff[0] for ff in fiter])
#next(fiter)

In [19]:
iids

array(['MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome',
       'hCoV-19/Beijing/105/2020|EPI_ISL_413518|2020-01-26'], dtype='<U94')

In [None]:
path_fasta = "./output/singleseq_aligned/hCoV-19.Beijing.105.2020.EPI_ISL_413518.2020-01-26.fasta"
#fiter = fasta_iter(path_fasta)
load_aligned_seq(path_fasta, align_id="Wuhan-Hu-1")

In [28]:
df = pd.DataFrame({"iid":["a", "b", "c"]})
df["b"] = ""

In [29]:
for index, row in df.iterrows():
    print(row["iid"])
    row["iid"] = "d"
    row["b"] = "e"

a
b
c


In [30]:
df

Unnamed: 0,iid,b
0,d,e
1,d,e
2,d,e
