In [None]:
import pandas as pd

In [None]:
### Rename the chromosomes in the reference genome to canonical names (chr1 ... chr22, chrX, chrY, chrM)
# Uses readfq (https://github.com/lh3/readfq) to parse FASTA/FASTQ records; this cell defines the readfq generator

def readfq(fp): # this is a generator function
    """Iterate over the records of a FASTA/FASTQ stream.

    Adapted from lh3's readfq (https://github.com/lh3/readfq). Line endings
    are removed with ``rstrip('\\r\\n')`` instead of dropping the last
    character of every line, so the final line of an input that lacks a
    trailing newline is no longer truncated (which previously lost the last
    base / quality character, or the last character of a header).

    Parameters
    ----------
    fp : iterator of str
        An open text file handle, or any single-pass iterator yielding lines.
        It is consumed incrementally by several loops, so it must be an
        iterator (e.g. a file object or ``iter(list_of_lines)``), not a list.

    Yields
    ------
    tuple
        ``(name, seq, qual)`` where ``name`` is the header text after the
        leading ``>``/``@`` up to the first space, ``seq`` is the sequence
        with all of its lines concatenated, and ``qual`` is the concatenated
        quality string for a FASTQ record or ``None`` for a FASTA record
        (also ``None`` when the input ends before enough quality characters
        have been read).
    """
    last = None # buffer holding the last header/separator line not yet processed
    while True:
        if not last: # the first record or a record following a fastq
            for l in fp: # search for the start of the next record
                if l.startswith(('>', '@')): # fasta/q header line
                    last = l.rstrip('\r\n') # save this line
                    break
        if not last: break # no further header: input exhausted
        name, seqs, last = last[1:].partition(" ")[0], [], None
        for l in fp: # read the sequence
            if l.startswith(('@', '+', '>')):
                last = l.rstrip('\r\n')
                break
            seqs.append(l.rstrip('\r\n'))
        if not last or last[0] != '+': # this is a fasta record
            yield name, ''.join(seqs), None # yield a fasta record
            if not last: break
        else: # this is a fastq record
            seq, leng, seqs = ''.join(seqs), 0, []
            for l in fp: # read the quality
                qual_line = l.rstrip('\r\n')
                seqs.append(qual_line)
                leng += len(qual_line) # count quality characters only, never the line ending
                if leng >= len(seq): # have read enough quality
                    last = None
                    yield name, seq, ''.join(seqs) # yield a fastq record
                    break
            if last: # reach EOF before reading enough quality
                yield name, seq, None # yield a fasta record instead
                break

In [None]:
# Lookup table: accession ID used as the record name in the reference FASTA
# -> canonical chromosome name written to the renamed FASTA.
name_map = {
    "NC_000001.11": "chr1",
    "NC_000002.12": "chr2",
    "NC_000003.12": "chr3",
    "NC_000004.12": "chr4",
    "NC_000005.10": "chr5",
    "NC_000006.12": "chr6",
    "NC_000007.14": "chr7",
    "NC_000008.11": "chr8",
    "NC_000009.12": "chr9",
    "NC_000010.11": "chr10",
    "NC_000011.10": "chr11",
    "NC_000012.12": "chr12",
    "NC_000013.11": "chr13",
    "NC_000014.9": "chr14",
    "NC_000015.10": "chr15",
    "NC_000016.10": "chr16",
    "NC_000017.11": "chr17",
    "NC_000018.10": "chr18",
    "NC_000019.10": "chr19",
    "NC_000020.11": "chr20",
    "NC_000021.9": "chr21",
    "NC_000022.11": "chr22",
    "NC_000023.11": "chrX",
    "NC_000024.10": "chrY",
    "NC_012920.1": "chrM",
}

In [None]:
# Write a copy of the reference that keeps only the records whose name starts
# with "NC", renaming each one through name_map.
# The input is opened first: opening the output with mode "w" creates/truncates
# it immediately, so a missing or mistyped reference path must fail before that
# happens instead of leaving an empty output file behind.
with open("/home/mbxha18/data/GENOME/GCF_000001405.40_GRCh38.p14_genomic.fna") as REF:
    with open("/home/mbxha18/data/GENOME/GRCh38.p14_canonical.fna", "w") as new_file:
        for read_id, seq, _ in readfq(REF):
            if read_id.startswith("NC"):
                # name_map[read_id] raises KeyError for an "NC" record absent from the map
                print(f">{name_map[read_id]}\n{seq}", file=new_file)
                # progress log: original record name and sequence length
                print(read_id, len(seq))

In [None]:
# Re-read the renamed FASTA and list each record name with its sequence length
with open("/home/mbxha18/data/GENOME/GRCh38.p14_canonical.fna") as new_REF:
    for record in readfq(new_REF):
        read_id, seq, _ = record  # third field (quality) is unused here
        print(read_id, len(seq))