In [1]:
from typing import List


def read_file(file: str) -> List[str]:
    """Reads a file and returns a list of lines"""
    with open(file, "r") as f:
        return f.readlines()


def split_fasta(fasta: str, n: int) -> List[str]:
    """Splits a fasta file into n parts"""
    length = len(fasta)
    split_length = length // n
    split_fasta = []
    for i in range(n):
        print(f"Splitting {i + 1} of {n}")
        begin_of_next_sequence = fasta.find(">", (i + 1) * split_length)
        end = False
        if begin_of_next_sequence == -1:
            begin_of_next_sequence = length
            end = True
        split_fasta.append(fasta[i * split_length : begin_of_next_sequence])
        if end:
            break
    return split_fasta


def split_fasta_by_size(fasta: str, size: int) -> List[str]:
    """Splits a fasta file into parts with a maximum size"""
    length = len(fasta)
    split_fasta = []
    n = length // size + 1
    for i in range(n):
        print(f"Splitting {i + 1} of {n}")
        min_from = i * size
        max_to = (i + 1) * size
        to = fasta[:max_to].rfind(">") if i > 0 else 0
        from_ = fasta[:min_from].rfind(">") + min_from
    return split_fasta


def split_single_sequence(fasta: str, n: int) -> List[str]:
    """Splits a fasta file containing one sequence into n sequences"""
    name_start = fasta.find(">")
    name_end = fasta.find("\n")
    name = fasta[name_start:name_end]
    sequence = fasta[name_end + 1 :]
    length = len(fasta)
    split_length = length // n
    split_fasta = []
    for i in range(n):
        print(f"Splitting {i + 1} of {n}")
        start = i * split_length
        end = (i + 1) * split_length if i < n - 1 else length
        split_fasta.append(f"{name}\n{sequence[start:end]}")
    return split_fasta


def write_file(name, content):
    """Writes a file"""
    with open(name, "w") as f:
        f.write(content)

In [3]:
file = "MI_contigs_canu"
fasta = read_file(f"data/{file}.fa")
split_contigs = split_fasta(fasta, 10)
for i, subfasta in enumerate(split_contigs):
    write_file(f"output/blast/{file}_{i + 1}.fa", subfasta)

file = "tig00000533_BLAST_X"
fasta = read_file(f"data/{file}.fa")
split_x = split_single_sequence(fasta, 7)
for i, subfasta in enumerate(split_x):
    write_file(f"output/blast/{file}_{i + 1}.fa", subfasta)

file = "tig00000533_BLAST_Y"
fasta = read_file(f"data/{file}.fa")
split_y = split_single_sequence(fasta, 5)
for i, subfasta in enumerate(split_y):
    write_file(f"output/blast/{file}_{i + 1}.fa", subfasta)

Splitting 1 of 10


AttributeError: 'list' object has no attribute 'find'