In [None]:
import pandas as pd

annotation_df = pd.read_pickle("annotation_df_with_introns.pkl")
chr_seqid = [
    "NC_058066.1", "NC_058067.1", "NC_058068.1", "NC_058069.1", "NC_058070.1",
    "NC_058071.1", "NC_058072.1", "NC_058073.1", "NC_058074.1", "NC_058075.1",
    "NC_058076.1", "NC_058077.1", "NC_058078.1", "NC_058079.1"
]

def create_bed_files(df):
    # EXONS
    exons = df[df['type'] == 'exon'][['seqid', 'start', 'end', 'strand']].copy()
    exons['start'] = exons['start'] - 1
    exons['name'] = '.'
    exons['score'] = '0'
    exons = exons[['seqid', 'start', 'end', 'name', 'score', 'strand']]
    exons.to_csv('exons.bed', sep='\t', header=False, index=False)

    # INTRONS
    introns = df[df['type'] == 'intron'][['seqid', 'start', 'end', 'strand']].copy()
    introns['start'] = introns['start'] - 1
    introns['name'] = '.'
    introns['score'] = '0'
    introns = introns[['seqid', 'start', 'end', 'name', 'score', 'strand']]
    introns.to_csv('introns.bed', sep='\t', header=False, index=False)

    # GENES
    genes = df[df['type'] == 'gene'][['seqid', 'start', 'end', 'strand']].copy()
    genes['start'] = genes['start'] - 1
    genes['name'] = '.'
    genes['score'] = '0'
    genes = genes[['seqid', 'start', 'end', 'name', 'score', 'strand']]
    genes.to_csv('genes.bed', sep='\t', header=False, index=False)

    # PROMOTERS (1000bp upstream from TSS)
    promoters = []
    for _, row in genes.iterrows():
        chrom, start, end, _, _, strand = row
        if strand == '+':
            prom_start = max(0, start - 1000)
            prom_end = start
        else:
            prom_start = end
            prom_end = end + 1000
        promoters.append([chrom, prom_start, prom_end, '.', '0', strand])
    pd.DataFrame(promoters).to_csv('promoters.bed', sep='\t', header=False, index=False)

    # DOWNSTREAM (200bp from gene end)
    downstream = []
    for _, row in genes.iterrows():
        chrom, start, end, _, _, strand = row
        if strand == '+':
            down_start = end
            down_end = end + 200
        else:
            down_start = max(0, start - 200)
            down_end = start
        downstream.append([chrom, down_start, down_end, '.', '0', strand])
    pd.DataFrame(downstream).to_csv('downstream.bed', sep='\t', header=False, index=False)

    # INTERGENIC REGIONS
    intergenic = []
    for chrom in chr_seqid:
        chrom_genes = genes[genes['seqid'] == chrom].sort_values('start')
        prev_end = 0
        for _, gene in chrom_genes.iterrows():
            gene_start = gene['start']
            if gene_start > prev_end:
                intergenic.append([chrom, prev_end, gene_start, '.', '0', '.'])
            prev_end = gene['end']
    pd.DataFrame(intergenic).to_csv('intergenic.bed', sep='\t', header=False, index=False)

create_bed_files(annotation_df)


In [None]:

annotation_df = pd.read_pickle("annotation_df_with_introns.pkl")
annotation_df['start_bed'] = annotation_df['start'] - 1
annotation_df['end_bed'] = annotation_df['end']

def region_length(df, region_type):
    sub = df[df['type'] == region_type]
    return (sub['end_bed'] - sub['start_bed']).sum()

lengths = {}
for region in ['exon', 'intron', 'gene']:
    lengths[region] = region_length(annotation_df, region)

genes_df = annotation_df[annotation_df['type'] == 'gene'].copy()
genes_df['start_bed'] = genes_df['start'] - 1
promoters_total = genes_df.shape[0] * 1000
lengths['promoter'] = promoters_total

downstream_total = genes_df.shape[0] * 200
lengths['downstream'] = downstream_total

chr_seqid = [
    "NC_058066.1", "NC_058067.1", "NC_058068.1", "NC_058069.1", "NC_058070.1",
    "NC_058071.1", "NC_058072.1", "NC_058073.1", "NC_058074.1", "NC_058075.1",
    "NC_058076.1", "NC_058077.1", "NC_058078.1", "NC_058079.1"
]

intergenic_total = 0
for chrom in chr_seqid:
    chrom_genes = genes_df[genes_df['seqid'] == chrom].sort_values('start_bed')
    prev_end = 0
    for _, row in chrom_genes.iterrows():
        gene_start = row['start_bed']
        if gene_start > prev_end:
            intergenic_total += gene_start - prev_end
        prev_end = max(prev_end, row['end_bed'])  # на случай наложений
lengths['intergenic'] = intergenic_total

total = sum(lengths.values())

fractions = {region: round(length / total * 100, 2) for region, length in lengths.items()}

print(" Длины участков (в bp):")
for k, v in lengths.items():
    print(f"{k:<12}: {v}")

print("\n Доли от всех аннотированных участков (%):")
for k, v in fractions.items():
    print(f"{k:<12}: {v}")


 Длины участков (в bp):
exon        : 94647500
intron      : 159579732
gene        : 223943420
promoter    : 27201000
downstream  : 5440200
intergenic  : 129665390

 Доли от всех аннотированных участков (%):
exon        : 14.78
intron      : 24.92
gene        : 34.97
promoter    : 4.25
downstream  : 0.85
intergenic  : 20.25


### Пересечение Zhunt

In [None]:
!bedtools intersect -a zdna_gt400.bed -b exons.bed -wo > exon_intersections.bed
!bedtools intersect -a zdna_gt400.bed -b introns.bed -wo > intron_intersections.bed
!bedtools intersect -a zdna_gt400.bed -b promoters.bed -wo > promoter_intersections.bed
!bedtools intersect -a zdna_gt400.bed -b intergenic.bed -wo > intergenic_intersections.bed
!bedtools intersect -a zdna_gt400.bed -b genes.bed -wo > gene_intersections.bed
!bedtools intersect -a zdna_gt400.bed -b downstream.bed -wo > downstream_intersections.bed


In [None]:
def analyze_intersections():
    results = []
    total_quad = sum(1 for _ in open('zdna_gt400.bed'))

    regions = ['exon', 'intron', 'promoter', 'intergenic', 'gene', 'downstream']
    for region in regions:
        try:
            df = pd.read_csv(f'{region}_intersections.bed', sep='\t', header=None)
            count = len(df)
        except:
            count = 0
        results.append({
            'Участок': region,
            'Число пересечений': count,
            'Доля (%)': round(count / total_quad * 100, 2)
        })

    return pd.DataFrame(results)

results_df = analyze_intersections()
print(results_df)


      Участок  Число пересечений  Доля (%)
0        exon             183407     16.09
1      intron             391056     34.31
2    promoter              78622      6.90
3  intergenic             413183     36.25
4        gene             505085     44.31
5  downstream              14853      1.30


### Пересечение zdnabert

In [None]:
with open("zdnabert_results.txt") as f:
    lines = f.readlines()

bed_lines = []
chrom = ""
for line in lines:
    line = line.strip()
    if line.startswith(">"):
        chrom = line[1:]
    elif line.lower().startswith("start"):
        continue
    elif line:
        start_str, end_str = line.split()
        start = max(0, int(start_str) - 1)
        end = int(end_str)
        bed_lines.append(f"{chrom}\t{start}\t{end}")

with open("zdnabert.bed", "w") as out:
    out.write("\n".join(bed_lines))

print("Файл 'zdnabert.bed' успешно создан (start >= 0).")


Файл 'zdnabert.bed' успешно создан (start >= 0).


In [None]:
! head zdnabert.bed

NC_058066.1	322	337
NC_058066.1	23302	23311
NC_058066.1	25313	25332
NC_058066.1	25436	25454
NC_058066.1	26102	26118
NC_058066.1	29330	29347
NC_058066.1	30678	30692
NC_058066.1	30781	30799
NC_058066.1	31561	31572
NC_058066.1	41648	41661


In [None]:
!bedtools intersect -a zdnabert.bed -b exons.bed -u > zdnabert_exons.bed
!bedtools intersect -a zdnabert.bed -b introns.bed -u > zdnabert_introns.bed
!bedtools intersect -a zdnabert.bed -b promoters.bed -u > zdnabert_promoters.bed
!bedtools intersect -a zdnabert.bed -b downstream.bed -u > zdnabert_downstream.bed
!bedtools intersect -a zdnabert.bed -b intergenic.bed -u > zdnabert_intergenic.bed
!bedtools intersect -a zdnabert.bed -b genes.bed -u > zdnabert_genes.bed


In [None]:
! head zdnabert_exons.bed

NC_058066.1	29330	29347
NC_058066.1	46088	46100
NC_058066.1	46683	46698
NC_058066.1	46874	46890
NC_058066.1	47152	47171
NC_058066.1	49858	49874
NC_058066.1	69172	69189
NC_058066.1	189913	189929
NC_058066.1	517621	517646
NC_058066.1	632266	632290


In [None]:

total = sum(1 for _ in open("zdnabert.bed"))
regions = ["exons", "introns", "promoters", "downstream", "intergenic", "genes"]

def count_hits(region):
    try:
        with open(f"zdnabert_{region}.bed") as f:
            return sum(1 for _ in f)
    except FileNotFoundError:
        return 0

print("| Участок     | Число попаданий | Доля (%) |")
print("|-------------|-----------------|----------|")
for region in regions:
    count = count_hits(region)
    percent = round(count / total * 100, 2) if total > 0 else 0
    print(f"| {region:<11} | {count:<15} | {percent:<8} |")


| Участок     | Число попаданий | Доля (%) |
|-------------|-----------------|----------|
| exons       | 9172            | 10.89    |
| introns     | 26359           | 31.29    |
| promoters   | 5577            | 6.62     |
| downstream  | 1116            | 1.32     |
| intergenic  | 30777           | 36.53    |
| genes       | 34674           | 41.16    |


### Пересечение квадруплексы

In [None]:
!bedtools intersect -a quadruplexes.bed -b exons.bed -u > g4_exons.bed
!bedtools intersect -a quadruplexes.bed -b introns.bed -u > g4_introns.bed
!bedtools intersect -a quadruplexes.bed -b promoters.bed -u > g4_promoters.bed
!bedtools intersect -a quadruplexes.bed -b downstream.bed -u > g4_downstream.bed
!bedtools intersect -a quadruplexes.bed -b intergenic.bed -u > g4_intergenic.bed
!bedtools intersect -a quadruplexes.bed -b genes.bed -u > g4_genes.bed


In [None]:
total_g4 = sum(1 for _ in open("quadruplexes.bed"))

regions = ["exons", "introns", "promoters", "downstream", "intergenic", "genes"]

def count_g4_hits(region):
    try:
        with open(f"g4_{region}.bed") as f:
            return sum(1 for _ in f)
    except FileNotFoundError:
        return 0

print("| Участок     | Число квадруплексов | Доля (%) |")
print("|-------------|----------------------|----------|")
for region in regions:
    count = count_g4_hits(region)
    percent = round(count / total_g4 * 100, 2) if total_g4 > 0 else 0
    print(f"| {region:<11} | {count:<22} | {percent:<8} |")


| Участок     | Число квадруплексов | Доля (%) |
|-------------|----------------------|----------|
| exons       | 912                    | 5.09     |
| introns     | 7077                   | 39.46    |
| promoters   | 1217                   | 6.79     |
| downstream  | 320                    | 1.78     |
| intergenic  | 6067                   | 33.83    |
| genes       | 7872                   | 43.89    |


## 2 таблица

In [None]:
!bedtools intersect -a promoters.bed -b quadruplexes.bed -c > promoters_g4_counts.bed
!bedtools intersect -a promoters.bed -b zdnabert.bed -c > promoters_bert_counts.bed


In [None]:
g4_df = pd.read_csv("promoters_g4_counts.bed", sep="\t", header=None)
bert_df = pd.read_csv("promoters_bert_counts.bed", sep="\t", header=None)

total_promoters = len(g4_df)
with_g4 = (g4_df.iloc[:, -1] > 0).sum()
g4_percent = round(with_g4 / total_promoters * 100, 2)

with_bert = (bert_df.iloc[:, -1] > 0).sum()
bert_percent = round(with_bert / total_promoters * 100, 2)

print("### Промотеры")
print(f"Всего участков: {total_promoters}")
print(f"С квадруплексом: {with_g4} ({g4_percent}%)")
print(f"С ZDNABERT: {with_bert} ({bert_percent}%)")


### Промотеры
Всего участков: 27201
С квадруплексом: 1167 (4.29%)
С ZDNABERT: 4889 (17.97%)


In [None]:
# EXONS
!bedtools intersect -a exons.bed -b quadruplexes.bed -c > exons_g4_counts.bed
!bedtools intersect -a exons.bed -b zdnabert.bed -c > exons_bert_counts.bed

# INTRONS
!bedtools intersect -a introns.bed -b quadruplexes.bed -c > introns_g4_counts.bed
!bedtools intersect -a introns.bed -b zdnabert.bed -c > introns_bert_counts.bed

# DOWNSTREAM
!bedtools intersect -a downstream.bed -b quadruplexes.bed -c > downstream_g4_counts.bed
!bedtools intersect -a downstream.bed -b zdnabert.bed -c > downstream_bert_counts.bed

# INTERGENIC
!bedtools intersect -a intergenic.bed -b quadruplexes.bed -c > intergenic_g4_counts.bed
!bedtools intersect -a intergenic.bed -b zdnabert.bed -c > intergenic_bert_counts.bed


In [None]:
def count_regions_with_hits(bed_file):
    df = pd.read_csv(bed_file, sep="\t", header=None)
    total = len(df)
    with_hit = (df.iloc[:, -1] > 0).sum()
    percent = round(with_hit / total * 100, 2)
    return total, with_hit, percent

regions = ['exons', 'introns', 'downstream', 'intergenic']
result_rows = []

for region in regions:
    row = {'Участок': region.capitalize()}

    total, g4_hits, g4_percent = count_regions_with_hits(f"{region}_g4_counts.bed")
    _, bert_hits, bert_percent = count_regions_with_hits(f"{region}_bert_counts.bed")

    row['Всего участков'] = total
    row['С квадруплексом'] = g4_hits
    row['Доля с квадруплексом (%)'] = g4_percent
    row['С ZDNABERT'] = bert_hits
    row['Доля с ZDNABERT (%)'] = bert_percent

    result_rows.append(row)

result_rows.insert(0, {
    'Участок': 'Promoters (1000 up TSS)',
    'Всего участков': 27201,
    'С квадруплексом': 1167,
    'Доля с квадруплексом (%)': 4.29,
    'С ZDNABERT': 4889,
    'Доля с ZDNABERT (%)': 17.97
})

df = pd.DataFrame(result_rows)
print(df.to_markdown(index=False))


| Участок                 |   Всего участков |   С квадруплексом |   Доля с квадруплексом (%) |   С ZDNABERT |   Доля с ZDNABERT (%) |
|:------------------------|-----------------:|------------------:|---------------------------:|-------------:|----------------------:|
| Promoters (1000 up TSS) |            27201 |              1167 |                       4.29 |         4889 |                 17.97 |
| Exons                   |           324069 |              1397 |                       0.43 |        11627 |                  3.59 |
| Introns                 |           171057 |              6524 |                       3.81 |        18876 |                 11.03 |
| Downstream              |            27201 |               320 |                       1.18 |         1043 |                  3.83 |
| Intergenic              |            23424 |              4049 |                      17.29 |        10934 |                 46.68 |


In [None]:
# EXONS
!bedtools intersect -a exons.bed -b zdna_gt400.bed -c > exons_zhunt_counts.bed

# INTRONS
!bedtools intersect -a introns.bed -b zdna_gt400.bed -c > introns_zhunt_counts.bed

# DOWNSTREAM
!bedtools intersect -a downstream.bed -b zdna_gt400.bed -c > downstream_zhunt_counts.bed

# INTERGENIC
!bedtools intersect -a intergenic.bed -b zdna_gt400.bed -c > intergenic_zhunt_counts.bed


In [None]:
!bedtools intersect -a promoters.bed -b zdna_gt400.bed -c > promoters_zhunt_counts.bed

In [None]:
! head exons_zhunt_counts.bed

NC_058066.1	1961	2119	.	0	-	0
NC_058066.1	15360	15664	.	0	+	0
NC_058066.1	18710	18775	.	0	-	0
NC_058066.1	21000	21093	.	0	-	0
NC_058066.1	22158	22443	.	0	+	0
NC_058066.1	23084	23221	.	0	-	0
NC_058066.1	24770	25067	.	0	+	0
NC_058066.1	26652	26913	.	0	+	0
NC_058066.1	27071	27359	.	0	+	0
NC_058066.1	27658	27952	.	0	+	0


In [None]:
import pandas as pd

def count_regions_with_hits(bed_file):
    df = pd.read_csv(bed_file, sep="\t", header=None)
    total = len(df)
    with_hits = (df.iloc[:, -1] > 0).sum()
    percent = round(with_hits / total * 100, 2)
    return total, with_hits, percent

regions = {
    "Promoters (1000 up TSS)": "promoters_zhunt_counts.bed",
    "Exons": "exons_zhunt_counts.bed",
    "Introns": "introns_zhunt_counts.bed",
    "Downstream": "downstream_zhunt_counts.bed",
    "Intergenic": "intergenic_zhunt_counts.bed"
}

rows = []

for label, file in regions.items():
    total, with_zhunt, zhunt_pct = count_regions_with_hits(file)
    rows.append({
        "Участок": label,
        "Всего участков": total,
        "С Zhun": with_zhunt,
        "Доля с Zhun (%)": zhunt_pct
    })

df = pd.DataFrame(rows)
print(df.to_markdown(index=False))


| Участок                 |   Всего участков |   С Zhun |   Доля с Zhun (%) |
|:------------------------|-----------------:|---------:|------------------:|
| Promoters (1000 up TSS) |            27201 |     7271 |             26.73 |
| Exons                   |           324069 |    19623 |              6.06 |
| Introns                 |           171057 |    30286 |             17.71 |
| Downstream              |            27201 |     1747 |              6.42 |
| Intergenic              |            23424 |    13093 |             55.9  |


## Групповая часть

In [None]:
!bedtools intersect -a promoters.bed -b quadruplexes.bed -wa -wb > promoters_with_g4.bed


In [None]:

cols = ['chr', 'prom_start', 'prom_end', 'name', 'score', 'strand',
        'g4_chr', 'g4_start', 'g4_end', 'g4_strand']
g4_promoters = pd.read_csv("promoters_with_g4.bed", sep="\t", header=None, names=cols)

annotation_df = pd.read_pickle("annotation_df_with_introns.pkl")
genes = annotation_df[annotation_df['type'] == 'gene'].copy()
genes['start_bed'] = genes['start'] - 1

matched = []

for _, row in g4_promoters.iterrows():
    chrom = row['chr']
    strand = row['strand']
    prom_start = row['prom_start']
    prom_end = row['prom_end']
    g4_start = row['g4_start']
    g4_end = row['g4_end']

    if strand == '+':
        gene_hit = genes[
            (genes['seqid'] == chrom) &
            (genes['start_bed'] == prom_end) &
            (genes['strand'] == strand)
        ]
    else:
        gene_hit = genes[
            (genes['seqid'] == chrom) &
            (genes['end'] == prom_start) &
            (genes['strand'] == strand)
        ]

    if not gene_hit.empty:
        gene = gene_hit.iloc[0].copy()
        gene['g4_start'] = g4_start
        gene['g4_end'] = g4_end
        gene['g4_strand'] = strand
        matched.append(gene)

matched_df = pd.DataFrame(matched)
matched_df = matched_df[['seqid', 'start', 'end', 'strand', 'attributes', 'g4_start', 'g4_end']]
matched_df.to_csv("genes_with_g4_in_promoters.tsv", sep="\t", index=False)

print(matched_df.head(5))


            seqid    start      end strand  \
205   NC_058066.1    92732   195229      +   
1433  NC_058066.1   673540   681064      -   
2091  NC_058066.1   769769   771513      +   
3570  NC_058066.1  1234994  1239047      -   
3570  NC_058066.1  1234994  1239047      -   

                                             attributes  g4_start   g4_end  
205   ID=gene-LOC114963509;Dbxref=GeneID:114963509;N...     92480    92509  
1433  ID=gene-LOC114952867;Dbxref=GeneID:114952867;N...    681858   681879  
2091  ID=gene-LOC114952842;Dbxref=GeneID:114952842;N...    768885   768907  
3570  ID=gene-LOC122957806;Dbxref=GeneID:122957806;N...   1239580  1239621  
3570  ID=gene-LOC122957806;Dbxref=GeneID:122957806;N...   1239748  1239769  


In [3]:
!bedtools intersect -a promoters.bed -b zdnabert.bed -wa -wb > promoters_with_zdna.bed


In [4]:
import pandas as pd

cols = ['chr', 'prom_start', 'prom_end', 'name', 'score', 'strand',
        'zdna_chr', 'zdna_start', 'zdna_end']
z_promoters = pd.read_csv("promoters_with_zdna.bed", sep="\t", header=None, names=cols)

annotation_df = pd.read_pickle("annotation_df_with_introns.pkl")
genes = annotation_df[annotation_df['type'] == 'gene'].copy()
genes['start_bed'] = genes['start'] - 1

matched = []

for _, row in z_promoters.iterrows():
    chrom = row['chr']
    strand = row['strand']
    prom_start = row['prom_start']
    prom_end = row['prom_end']
    zdna_start = row['zdna_start']
    zdna_end = row['zdna_end']

    if strand == '+':
        gene_hit = genes[
            (genes['seqid'] == chrom) &
            (genes['start_bed'] == prom_end) &
            (genes['strand'] == strand)
        ]
    else:
        gene_hit = genes[
            (genes['seqid'] == chrom) &
            (genes['end'] == prom_start) &
            (genes['strand'] == strand)
        ]

    if not gene_hit.empty:
        gene = gene_hit.iloc[0].copy()
        gene['zdna_start'] = zdna_start
        gene['zdna_end'] = zdna_end
        gene['zdna_strand'] = strand
        matched.append(gene)

matched_df = pd.DataFrame(matched)
matched_df = matched_df[['seqid', 'start', 'end', 'strand', 'attributes', 'zdna_start', 'zdna_end']]
matched_df.to_csv("genes_with_zdna_in_promoters.tsv", sep="\t", index=False)

print(matched_df.head(5))


           seqid   start     end strand  \
1    NC_058066.1    1962   23221      -   
79   NC_058066.1   48365   50069      -   
87   NC_058066.1   53120   55082      +   
112  NC_058066.1   57152   69127      -   
499  NC_058066.1  321951  383632      +   

                                            attributes  zdna_start  zdna_end  
1    ID=gene-LOC114963522;Dbxref=GeneID:114963522;N...       23302     23311  
79   ID=gene-LOC114963519;Dbxref=GeneID:114963519;N...       50341     50358  
87   ID=gene-LOC114950438;Dbxref=GeneID:114950438;N...       52516     52534  
112  ID=gene-LOC114963510;Dbxref=GeneID:114963510;N...       69172     69189  
499  ID=gene-LOC114952822;Dbxref=GeneID:114952822;N...      321912    321927  


In [6]:

cols = ['chr', 'prom_start', 'prom_end', 'name', 'score', 'strand',
        'zdna_chr', 'zdna_start', 'zdna_end']
z_promoters = pd.read_csv("promoters_with_zdna.bed", sep="\t", header=None, names=cols)

annotation_df = pd.read_pickle("annotation_df_with_introns.pkl")
genes = annotation_df[annotation_df['type'] == 'gene'].copy()
genes['start_bed'] = genes['start'] - 1

matched = []

for _, row in z_promoters.iterrows():
    chrom = row['chr']
    strand = row['strand']
    prom_start = row['prom_start']
    prom_end = row['prom_end']
    zdna_start = row['zdna_start']
    zdna_end = row['zdna_end']

    if strand == '+':
        gene_hit = genes[
            (genes['seqid'] == chrom) &
            (genes['start_bed'] == prom_end) &
            (genes['strand'] == strand)
        ]
    else:
        gene_hit = genes[
            (genes['seqid'] == chrom) &
            (genes['end'] == prom_start) &
            (genes['strand'] == strand)
        ]

    if not gene_hit.empty:
        gene = gene_hit.iloc[0].copy()
        gene['prom_start'] = prom_start
        gene['prom_end'] = prom_end
        gene['zdna_start'] = zdna_start
        gene['zdna_end'] = zdna_end
        gene['zdna_strand'] = strand
        matched.append(gene)

matched_df = pd.DataFrame(matched)
matched_df = matched_df[['seqid', 'start', 'end', 'strand',
                         'prom_start', 'prom_end', 'zdna_start', 'zdna_end', 'attributes']]
matched_df.to_csv("genes_with_zdna_and_promoters.tsv", sep="\t", index=False)

print(matched_df.head())


           seqid   start     end strand  prom_start  prom_end  zdna_start  \
1    NC_058066.1    1962   23221      -       23221     24221       23302   
79   NC_058066.1   48365   50069      -       50069     51069       50341   
87   NC_058066.1   53120   55082      +       52119     53119       52516   
112  NC_058066.1   57152   69127      -       69127     70127       69172   
499  NC_058066.1  321951  383632      +      320950    321950      321912   

     zdna_end                                         attributes  
1       23311  ID=gene-LOC114963522;Dbxref=GeneID:114963522;N...  
79      50358  ID=gene-LOC114963519;Dbxref=GeneID:114963519;N...  
87      52534  ID=gene-LOC114950438;Dbxref=GeneID:114950438;N...  
112     69189  ID=gene-LOC114963510;Dbxref=GeneID:114963510;N...  
499    321927  ID=gene-LOC114952822;Dbxref=GeneID:114952822;N...  
