# Align Fasta to Reference Fasta

In [19]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket
import matplotlib.pyplot as plt
from itertools import groupby
import os

# Select the project root for the current machine (only one host is configured).
socket_name = socket.gethostname()
print(f"Current machine: {socket_name}")

# Guard clause: refuse to continue on a host that has no configured path.
if socket_name != "DESKTOP-5RJD9NC":
    raise RuntimeWarning("Not compatible machine. Check!!")
path = "/gitProjects/covid19_data"  # project root on Harald's machine

# Every relative path used further down resolves against the project root.
os.chdir(path)
print(os.getcwd())

Current machine: DESKTOP-5RJD9NC
D:\gitProjects\covid19_data


### Clean Fasta File (replace `i` characters in sequences with `N`)

In [61]:
def write_fasta(sample_line, seq_lines, savepath=""):
    """Write a single record to a Fasta file.

    sample_line: Header line of the sample; written verbatim as the first line
    seq_lines: Iterable of strings written verbatim after the header
        (no newlines are added; a plain string is accepted as well)
    savepath: Where to write the Fasta file to (an existing file is overwritten)
    """
    # The context manager closes the handle even if one of the writes fails.
    with open(savepath, "w") as f1:
        f1.write(sample_line)
        f1.writelines(seq_lines)
    print(f"Saved fasta to {savepath}")
    
def fasta_iter_raw(fasta_name):
    """
    Return iterator over the records of a Fasta file, keeping the raw text.

    fasta_name: Path of the Fasta file to read

    Yields (header_line, sequence) tuples. The header line is returned
    unchanged (including the leading ">" and the line break) and the sequence
    is the concatenation of the raw lines that follow it, line breaks included.
    A header without sequence lines yields an empty sequence; lines that appear
    before the first header are skipped.
    """
    # The file stays open only while the generator is being consumed.
    with open(fasta_name) as fh:
        header = None  # header of the record currently being collected
        seq_lines = []
        for line in fh:
            if line.startswith(">"):
                # A new header closes the previous record (if any).
                if header is not None:
                    yield (header, "".join(seq_lines))
                header = line
                seq_lines = []
            elif header is not None:
                seq_lines.append(line)
        # Flush the last record; its sequence may be empty.
        if header is not None:
            yield (header, "".join(seq_lines))
        
def fasta_iter(fasta_name):
    """
    Return iterator over the records of a Fasta file, with whitespace stripped.

    fasta_name: Path of the Fasta file to read

    Yields (header, sequence) tuples. The header has its leading ">" and the
    surrounding whitespace removed; the sequence is all following lines joined
    into one string, each line stripped. A header without sequence lines yields
    an empty sequence; lines that appear before the first header are skipped.
    """
    # The file stays open only while the generator is being consumed.
    with open(fasta_name) as fh:
        header = None  # header of the record currently being collected
        seq_parts = []
        for line in fh:
            if line.startswith(">"):
                # A new header closes the previous record (if any).
                if header is not None:
                    yield (header, "".join(seq_parts))
                header = line[1:].strip()  # drop the ">"
                seq_parts = []
            elif header is not None:
                seq_parts.append(line.strip())
        # Flush the last record; its sequence may be empty.
        if header is not None:
            yield (header, "".join(seq_parts))
    
def split_fasta(fasta_name, path_out="./output/singleseq/",
                iids=(), clean=False):
    """
    Write every sample of a Fasta file whose IID is listed in iids to its
    own single-sequence Fasta file.

    fasta_name: Path of the multi-sequence Fasta file to read
    path_out: Folder the single-sequence Fasta files are written into
    iids: Collection of sample names to extract; a sample matches when its
        header line (without the leading ">" and surrounding whitespace)
        is contained in it
    clean: Not implemented yet; True raises NotImplementedError

    The output file name is the sample name with "/" and "|" replaced
    by ".", plus the ".fasta" extension.
    """
    if clean:
        # Fail before reading or writing anything.
        raise NotImplementedError("Implement this")

    for sample_line, seq_line in fasta_iter_raw(fasta_name):
        sample = sample_line[1:].strip()
        if sample not in iids:
            continue
        print(f"Matched: {sample}")
        # "/" and "|" occur in the sample names but must not end up in a file name.
        sample1 = sample.replace("/", ".").replace("|", ".")
        fasta_name_out = os.path.join(path_out, sample1 + ".fasta")
        # Wrap the sequence in a list so it is written in one piece
        # instead of one character at a time.
        write_fasta(sample_line, [seq_line], fasta_name_out)
    
                  
def clean_fasta(fasta_name, fasta_name_out):
    """
    Remove invalid characters in sequences.

    fasta_name: Path of the Fasta file to read
    fasta_name_out: Path the cleaned copy is written to (overwritten if present)

    Header lines (starting with ">") are copied unchanged; in every other
    line each lowercase "i" is replaced by "N". Prints the number of header
    lines seen and the output path.
    """
    n = 0  # number of header lines, i.e. records, seen

    # Both handles are closed even if reading or writing fails midway.
    with open(fasta_name) as f, open(fasta_name_out, "w") as f1:
        for line in f:
            if line.startswith(">"):
                n += 1
            else:  # If Genome Data replace i
                line = line.replace("i", "N")
            f1.write(line)  # Write the new line
    print(f"Successfully Modified {n} fastas")
    # Report the path that was actually written, not a notebook-level global.
    print(f"Saved to {fasta_name_out}")

In [5]:
%%time
# Locations of the mafft.bat launcher, the alignment output and the reference Fasta.
bin_path = "bins/mafft-win/mafft.bat"
out_path = "./output/aligned/test_out.fasta"
ref_path = "./data/reference/wuhan-hu-1.fasta"

# Raw input Fasta and the destination of its cleaned copy.
fasta_path = "./data/mar20/gisaid_cov2020_sequences_26.fasta"
clean_fasta_path = "./data/mar20/gisaid_cov2020_sequences_26_c.fasta"
threads = 4  # NOTE(review): not used in this cell; a later cell defines `thread` instead

# Write the cleaned copy of the raw Fasta.
clean_fasta(fasta_path, clean_fasta_path)

Successfully Modified 1871 fastas
Saved to ./data/mar20/gisaid_cov2020_sequences_26_c.fasta
Wall time: 1.72 s


### Extract Single Sequence Fasta file

In [47]:
%%time
# NOTE(review): split_fasta as defined in this notebook accepts
# (fasta_name, path_out, iids, clean) and has no `fasta_name_out` parameter,
# so this call raises TypeError against that definition. It appears to
# predate it -- confirm the intended call before re-running.
fasta_name_out =  "./output/singleseq/test.fasta"
split_fasta(fasta_name = "./data/mar20/gisaid_cov2020_sequences_26.fasta",
            fasta_name_out = fasta_name_out)

Saved fasta to ./output/singleseq/test.fasta
Wall time: 29.9 ms


In [50]:
def align_sequence(bin_path, out_path, fasta_name_out, 
                   ref_path, bat_path="./bins/mafft_batch.bat"):
    """Build the mafft command line, run it in a shell and return its stdout.

    bin_path: Path of the mafft executable / launcher script
    out_path: Value passed after --out
    fasta_name_out: Fasta file passed after --add
    ref_path: Reference Fasta, placed after --mapout
    bat_path: Currently unused; kept so existing calls keep working

    Returns everything the command wrote to standard output as one string.
    The paths are interpolated into the command unquoted, so they must not
    contain spaces or shell metacharacters.
    """
    command = f"{bin_path} --out {out_path} --add {fasta_name_out} --mapout {ref_path}"
    print("running command:")
    print(command)
    # The context manager closes the pipe and waits for the child process.
    with os.popen(command) as stream:
        output = stream.read()
    return output

In [49]:
# Align the extracted single-sequence Fasta against the reference.
# NOTE(review): the recorded output below shows `--addfull` and a different
# --out path than the current align_sequence / out_path produce, so it seems
# to stem from an earlier version of this notebook -- re-run to refresh.
align_sequence(bin_path, out_path, fasta_name_out, ref_path)

running command:
bins/mafft-win/mafft.bat --out ./output/aligned/test_out_single.fasta --addfull ./output/singleseq/test.fasta --mapout ./data/reference/wuhan-hu-1.fasta


''

### Align all fasta files
Idea: Produce batch script that aligns them one by one

In [48]:
# Settings for aligning the complete cleaned Fasta in one mafft run.
bin_path = "bins/mafft-win/mafft.bat"
out_path = "./output/aligned/test_out.fasta"
ref_path = "./data/reference/wuhan-hu-1.fasta"
clean_fasta_path = "./data/mar20/gisaid_cov2020_sequences_26_c.fasta"
thread = 4  # value inserted after --thread in the printed command

In [49]:
# Only prints the command; it is meant to be pasted into a shell by hand.
# NOTE(review): the recorded output below lacks `--keeplength`, so it was
# produced by an earlier version of this cell -- re-run to refresh.
print("Run this command in (power) shell:")
print(f"{bin_path} --out {out_path} --add {clean_fasta_path} --thread {thread} --mapout {ref_path} --keeplength")

Run this command in (power) shell:
bins/mafft-win/mafft.bat --out ./output/aligned/test_out.fasta --add ./data/mar20/gisaid_cov2020_sequences_26_c.fasta --thread 4 --mapout ./data/reference/wuhan-hu-1.fasta


# Area 51

In [67]:
# Parse the Fasta in a single pass and split the records into two parallel
# arrays, so the file does not have to be read once per array.
fiter = fasta_iter("./data/mar20/gisaid_cov2020_sequences_26.fasta")
records = list(fiter)  # consumes the iterator; `fiter` is exhausted afterwards
iids = np.array([header for header, _ in records])
seqs = np.array([seq for _, seq in records])

In [66]:
# Length of every sequence; display the one at index 734.
ll = list(map(len, seqs))
ll[734]

30129

In [68]:
# Sample name belonging to the sequence inspected above (same index).
iids[734]

'hCoV-19/USA/WA-UW182/2020|EPI_ISL_416720|2020-03-13'

In [54]:
# Pull one record from the iterator.
# NOTE(review): the recorded header below still carries ">" and the line
# break, which matches fasta_iter_raw rather than fasta_iter, so `fiter`
# presumably came from a cell that is no longer shown -- confirm.
sample_line, seq_line = next(fiter)

In [28]:
# Display the header line of the record fetched above.
sample_line

'>hCoV-19/USA/WA-S88/2020|EPI_ISL_417141|2020-03-01\n'

In [42]:
# Number of characters in the fetched sequence text.
len(seq_line)

30277

In [70]:
# Extract the sample inspected above (index 734) into its own Fasta file.
split_fasta(fasta_name="./data/mar20/gisaid_cov2020_sequences_26.fasta", 
            path_out="./output/singleseq/", 
            iids=['hCoV-19/USA/WA-UW182/2020|EPI_ISL_416720|2020-03-13'], 
            clean=False)

Matched: hCoV-19/USA/WA-UW182/2020|EPI_ISL_416720|2020-03-13
Saved fasta to ./output/singleseq/hCoV-19.USA.WA-UW182.2020.EPI_ISL_416720.2020-03-13.fasta
