## Create FCGR (Frequency matrix of Chaos Game Representation) arrays from FASTA files

In [25]:
import numpy as np 
from tqdm import tqdm 
from Bio import SeqIO
from collections import namedtuple
from complexcgr import FCGR
from pathlib import Path

# k-mer length handed to the FCGR constructor; also embedded in the output folder names below.
KMER = 8
# Used below as the tqdm progress-bar total (presumably the expected number of records per FASTA file).
N = 1000
# Callable that turns a sequence into its FCGR array (the result is saved with np.save below).
fcgr = FCGR(k=KMER)

In [26]:
def preprocessing(seq):
    """Return `seq` upper-cased, with every letter other than A, C, G, T, N set to "N".

    Only alphabetic symbols are touched; any other character (digits, "-",
    "*", whitespace, ...) is left exactly as it was. The input only needs to
    provide `upper()` and `replace(old, new)`.
    """
    # Every upper-case letter of the alphabet except A, C, G, T and N.
    to_mask = "BDEFHIJKLMOPQRSUVWXYZ"

    cleaned = seq.upper()
    for symbol in to_mask:
        cleaned = cleaned.replace(symbol, "N")
    return cleaned

In [27]:
# Input FASTA with the random test sequences and the folder that receives one .npy per record.
path_fasta = "../data/test-random.fasta"
path_save = Path(f"../data/fcgr-test/{KMER}mer/test-random")
path_save.mkdir(exist_ok=True, parents=True)

for record in tqdm(SeqIO.parse(path_fasta, format="fasta"), total=N):
    # Normalise the sequence (upper-case, letters other than A/C/G/T/N -> "N")
    # before encoding, so lower-case or ambiguity codes are handled uniformly.
    seq = preprocessing(record.seq)
    chaos = fcgr(seq)
    # NOTE(review): record.id is used verbatim as the file name; ids containing
    # path separators would not map to a file directly inside path_save.
    np.save(path_save.joinpath(f"{record.id}.npy"), arr=chaos)

100%|██████████| 1000/1000 [00:24<00:00, 40.77it/s]


In [28]:
import pandas as pd
# Pairs the identifier read from the FASTA record with the stem of the saved .npy file.
Seq = namedtuple("Seq",["id","name"])

for clade in ["G","S","O"]:
    path_fasta = f"../data/test-clade_{clade}.fasta"
    # One output folder per clade, nested under the k-mer size.
    path_save = Path(f"../data/fcgr-test/{KMER}mer/test-clade_{clade}")
    path_save.mkdir(exist_ok=True, parents=True)

    info_seqs = []
    records = SeqIO.parse(path_fasta, format="fasta")
    for idx, record in enumerate(tqdm(records, total=N, desc=f"clade-{clade}")):
        # Files are named by position in the FASTA file rather than by record id.
        name = f"seq_{idx}"

        # Upper-case the sequence and replace letters other than A/C/G/T/N with "N" before encoding.
        chaos = fcgr(preprocessing(record.seq))
        np.save(path_save / f"{name}.npy", arr=chaos)
        info_seqs.append(Seq(id=record.id, name=name))

    # Tab-separated lookup table (record id -> saved file stem), written next to the clade folder.
    table_path = path_save.parent / f"fcgr-test-clade_{clade}.csv"
    pd.DataFrame(info_seqs).to_csv(table_path, sep="\t")

clade-G: 100%|██████████| 1000/1000 [00:32<00:00, 31.16it/s]
clade-S: 100%|██████████| 1000/1000 [00:49<00:00, 20.28it/s]
clade-O: 100%|██████████| 1000/1000 [00:15<00:00, 65.11it/s]
