In [1]:
import os
import gzip
import requests
from collections import namedtuple

# --- CONFIG ---
genome_fasta_url = "http://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
genome_gff3_url = "http://ftp.ensembl.org/pub/release-111/gff3/homo_sapiens/Homo_sapiens.GRCh38.111.gff3.gz"
output_dir = "./genome_data"
max_proximity_bp = 1000  # max distance between genes to count as "close"

Gene = namedtuple("Gene", ["chrom", "start", "end", "strand", "name"])

# --- DOWNLOAD FILES ---
def download_file(url, dest_path):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(dest_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded {dest_path}")
    else:
        print(f"Failed to download {url} - status code {response.status_code}")

os.makedirs(output_dir, exist_ok=True)
fasta_path = os.path.join(output_dir, os.path.basename(genome_fasta_url))
gff3_path = os.path.join(output_dir, os.path.basename(genome_gff3_url))

if not os.path.exists(fasta_path):
    download_file(genome_fasta_url, fasta_path)
if not os.path.exists(gff3_path):
    download_file(genome_gff3_url, gff3_path)

# --- PARSE GFF3 ---
def parse_gff3_genes(gff3_file):
    genes = []
    with gzip.open(gff3_file, 'rt') as f:
        for line in f:
            if line.startswith("#"):
                continue
            fields = line.strip().split("\t")
            if len(fields) < 9:
                continue
            chrom, source, feature, start, end, score, strand, phase, attributes = fields
            if feature != "gene":
                continue
            attr_dict = {kv.split("=")[0]: kv.split("=")[1] for kv in attributes.split(";") if "=" in kv}
            gene_name = attr_dict.get("Name", "unknown")
            genes.append(Gene(chrom, int(start), int(end), strand, gene_name))
    return genes

genes = parse_gff3_genes(gff3_path)
print(f"Parsed {len(genes)} genes.")

# --- FIND CLOSE OPPOSITE GENE PAIRS ---
def find_facing_gene_pairs(genes, max_distance):
    genes_by_chrom = {}
    for gene in genes:
        genes_by_chrom.setdefault(gene.chrom, []).append(gene)

    close_pairs = []
    for chrom, chrom_genes in genes_by_chrom.items():
        sorted_genes = sorted(chrom_genes, key=lambda g: g.start)
        for i in range(len(sorted_genes) - 1):
            g1 = sorted_genes[i]
            g2 = sorted_genes[i + 1]
            distance = g2.start - g1.end
            if 0 <= distance <= max_distance:
                # Check if they face each other
                if g1.strand == '+' and g2.strand == '-':
                    close_pairs.append((g1, g2))
                elif g1.strand == '-' and g2.strand == '+':
                    close_pairs.append((g2, g1))
    return close_pairs

facing_pairs = find_facing_gene_pairs(genes, max_proximity_bp)
print(f"Found {len(facing_pairs)} facing gene pairs within {max_proximity_bp} bp.")

# --- OUTPUT RESULTS ---
for g1, g2 in facing_pairs:
    print(f"{g1.name} ({g1.chrom}:{g1.start}-{g1.end} {g1.strand}) ↔ {g2.name} ({g2.chrom}:{g2.start}-{g2.end} {g2.strand})")


Downloaded ./genome_data\Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
Downloaded ./genome_data\Homo_sapiens.GRCh38.111.gff3.gz
Parsed 21557 genes.
Found 1221 facing gene pairs within 1000 bp.
ISG15 (1:1001138-1014540 +) ↔ HES4 (1:998962-1000172 -)
B3GALT6 (1:1232237-1235041 +) ↔ SDF4 (1:1216931-1232031 -)
SCNN1D (1:1280436-1292029 +) ↔ ACAP3 (1:1292390-1309609 -)
CPTP (1:1324756-1328896 +) ↔ INTS11 (1:1311585-1324687 -)
ATAD3A (1:1512162-1534685 +) ↔ TMEM240 (1:1534778-1540624 -)
CALML6 (1:1915108-1917296 +) ↔ TMEM52 (1:1917590-1919279 -)
RER1 (1:2391775-2405442 +) ↔ MORN1 (1:2321253-2391707 -)
TPRG1L (1:3625015-3630127 +) ↔ WRAP73 (1:3630767-3652761 -)
KCNAB2 (1:5990927-6101193 +) ↔ CHD5 (1:6101787-6180321 -)
ESPN (1:6424776-6461367 +) ↔ HES2 (1:6412418-6424670 -)
TAS1R1 (1:6555307-6579755 +) ↔ NOL9 (1:6521347-6554513 -)
NMNAT1 (1:9943428-9985501 +) ↔ LZIC (1:9922113-9943407 -)
DNAJC16 (1:15526813-15592379 +) ↔ CASP9 (1:15490832-15526534 -)
CLCNKB (1:16040252-16057311 +) ↔ FAM131C (