In [67]:
import requests
import xml.etree.ElementTree as ET
import random

In [68]:
# Lower and upper bound for the length of every random sequence.
sequence_min_length, sequence_max_length = 300, 1000
# URL template of the service the sequences are fetched from.
genome_url_format = (
    "https://genome.ucsc.edu/cgi-bin/das/hg38/dna"
    "?segment=chr{chromosome}:{begin},{end}"
)

In [69]:
# Chromosome lengths, keyed by chromosome name without the "chr" prefix
# (presumably for hg38, the assembly named in genome_url_format -- confirm).
chromosome_lengths = {
    "1": 248_956_422,
    "2": 242_193_529,
    "3": 198_295_559,
    "4": 190_214_555,
    "5": 181_538_259,
    "6": 170_805_979,
    "7": 159_345_973,
    "8": 145_138_636,
    "9": 138_394_717,
    "10": 133_797_422,
    "11": 135_086_622,
    "12": 133_275_309,
    "13": 114_364_328,
    "14": 107_043_718,
    "15": 101_991_189,
    "16": 90_338_345,
    "17": 83_257_441,
    "18": 80_373_285,
    "19": 58_617_616,
    "20": 64_444_167,
    "21": 46_709_983,
    "22": 50_818_468,
    "X": 156_040_895,
    "Y": 57_227_415,
}

In [70]:
def get_random_genome_region():
    """Pick a random region of the genome.

    The chromosome is drawn uniformly from the keys of ``chromosome_lengths``
    (not weighted by chromosome length), and the region length is drawn
    uniformly from ``[sequence_min_length, sequence_max_length]``.

    Returns:
        A ``(chromosome, begin, end)`` tuple. Coordinates are 1-based and both
        ends are inclusive, so the region covers exactly ``end - begin + 1``
        bases and never extends past the end of the chromosome.
    """
    chromosome = random.choice(list(chromosome_lengths))
    sequence_length = random.randint(sequence_min_length, sequence_max_length)

    # The DAS "segment=begin,end" range includes both endpoints, so a region
    # of `sequence_length` bases ends at begin + sequence_length - 1. The
    # previous `end = begin + sequence_length` fetched one base too many.
    last_valid_begin = chromosome_lengths[chromosome] - sequence_length + 1
    begin = random.randint(1, last_valid_begin)
    end = begin + sequence_length - 1

    return chromosome, begin, end

In [75]:
def get_genome_sequence(chromosome: str, begin: int, end: int, timeout: float = 60.0) -> str:
    """Fetch a sequence from the genome service.

    Args:
        chromosome: Chromosome name substituted into ``genome_url_format``.
        begin: Start coordinate of the requested segment.
        end: End coordinate of the requested segment.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The text of the first child of the XML root's first child
        (presumably the ``<DNA>`` element of a DAS response -- confirm),
        returned unmodified, including any surrounding whitespace.

    Raises:
        AssertionError: If the server answers with a status other than 200.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    url = genome_url_format.format(chromosome=chromosome, begin=begin, end=end)
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        # Raised explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would let a failed request through.
        # The exception type is kept so existing handlers still match.
        raise AssertionError("Failed to open genome url")

    xml_root = ET.fromstring(r.text)
    sequence = xml_root[0][0]

    return sequence.text

In [76]:
# Template for the header that main() writes before each sequence.
# Note that it ends with a space, not a newline.
sequence_header_format = ">chr={chromosome}|begin={begin}|end={end} "

In [106]:
def main(output_file, sequence_count=100):
    """Download random genome sequences and write them to a file.

    For every record a random region is drawn and fetched. Regions whose
    sequence contains anything other than A/T/G/C (after upper-casing) are
    discarded and a new region is drawn, with no limit on the number of
    retries. Progress and the tried coordinates are printed.

    Args:
        output_file: Path of the file to (over)write.
        sequence_count: Number of records to write. Defaults to 100, the
            value that used to be hard-coded.
    """
    allowed_characters = set("ATGC\n")

    with open(output_file, "w") as f:
        for i in range(sequence_count):
            print(f"Getting sequence {i + 1}...")

            # Keep drawing regions until one consists of A/T/G/C only.
            while True:
                chromosome, begin, end = get_random_genome_region()
                print(chromosome, begin, end)
                sequence = get_genome_sequence(chromosome, begin, end).upper()
                if set(sequence) <= allowed_characters:
                    break

            header = sequence_header_format.format(chromosome=chromosome, begin=begin, end=end)
            # Write the line breaks explicitly instead of relying on the
            # downloaded text starting and ending with a newline. For text
            # wrapped in single newlines the output is unchanged.
            body = sequence.strip("\n")
            f.write(header + "\n" + body + "\n")

In [107]:
# Generate the output file. main() opens the path with open(..., "w"), which
# does not create directories, so ./files must already exist.
main("./files/human-genome-blast-output.fasta")

Getting sequence 1...
3 162392568 162393050
Getting sequence 2...
1 164222808 164223163
Getting sequence 3...
2 169108457 169109258
Getting sequence 4...
8 15505082 15505606
Getting sequence 5...
5 55682123 55683043
Getting sequence 6...
19 52361892 52362351
Getting sequence 7...
15 93753250 93754241
Getting sequence 8...
21 24681316 24681665
Getting sequence 9...
7 128249309 128249654
Getting sequence 10...
2 134079339 134080222
Getting sequence 11...
22 3015213 3015794
7 139258118 139259007
Getting sequence 12...
20 55488124 55488960
Getting sequence 13...
12 107140945 107141274
Getting sequence 14...
11 57556181 57557064
Getting sequence 15...
10 103493610 103494557
Getting sequence 16...
14 75980707 75981273
Getting sequence 17...
15 67058012 67058582
Getting sequence 18...
12 53857050 53857900
Getting sequence 19...
Y 23098063 23098647
Getting sequence 20...
13 50929594 50930001
Getting sequence 21...
16 56666769 56667664
Getting sequence 22...
14 85325376 85325692
Getting sequenc