<a href="https://colab.research.google.com/github/jseguiguren/GenomeMining/blob/main/chromosome_end_genes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests

# Define the Ensembl REST API endpoint and species
ensembl_endpoint = "https://rest.ensembl.org"
species = "danio_rerio"  # Zebrafish species

# Size (in bp) of the window inspected at each chromosome end
telomeric_window = 100000

# Define the distance threshold to consider genes as "close" to telomeres
proximity_threshold = 10000  # Example threshold distance

# Seconds to wait for Ensembl before giving up on a single request
request_timeout = 30

json_headers = {"Content-Type": "application/json"}


def fetch_chromosomes():
    """Return the species' top-level regions from Ensembl, or [] on failure."""
    url = f"{ensembl_endpoint}/info/assembly/{species}"
    try:
        response = requests.get(url, headers=json_headers, timeout=request_timeout)
    except requests.RequestException as error:
        print(f"Error fetching chromosome information: {error}")
        return []

    if response.status_code != 200:
        print(f"Error fetching chromosome information: {response.status_code}")
        return []
    return response.json()['top_level_region']


def build_telomeric_regions(chromosomes, window=telomeric_window):
    """Build one window per chromosome end.

    Each region is a dict with "chr", "start", "end" and "location", where
    "location" is "Start" or "End" and records which chromosome end the
    window touches. Coordinates are clamped to [1, length] so sequences
    shorter than *window* never yield a negative or out-of-range request.
    """
    regions = []
    for chromosome in chromosomes:
        name = chromosome['name']
        length = chromosome['length']

        if name.startswith("chrUn"):  # Skip 'chrUn' chromosomes
            continue
        # When the record carries a coord_system field, keep only real
        # chromosomes: unplaced scaffolds have no telomeres and their names
        # need not start with "chrUn".
        if chromosome.get('coord_system', 'chromosome') != 'chromosome':
            continue
        # NOTE(review): a mitochondrial genome listed as a chromosome would
        # still be scanned here even though it has no telomeres - confirm
        # whether it should be excluded.

        regions.append(
            {"chr": name, "start": 1, "end": min(window, length), "location": "Start"}
        )
        regions.append(
            {"chr": name, "start": max(1, length - window), "end": length, "location": "End"}
        )
    return regions


def distance_to_telomere(gene_start, gene_end, region):
    """Return the distance from a gene to the chromosome end of *region*.

    Only the outer edge of the window is a chromosome end; the inner edge is
    an arbitrary cut-off and must not count as a telomere.
    """
    tip = region['start'] if region['location'] == "Start" else region['end']
    return min(abs(gene_start - tip), abs(gene_end - tip))


def select_genes_near_telomere(gene_data, region, threshold=proximity_threshold):
    """Filter one region's genes down to named protein-coding genes near the tip.

    Returns a list of (chromosome, gene ID, external gene name, location)
    tuples for genes whose distance to the chromosome end is <= *threshold*.
    """
    selected = []
    for gene in gene_data:
        if gene.get('biotype') != 'protein_coding' or 'external_name' not in gene:
            continue
        if distance_to_telomere(gene['start'], gene['end'], region) <= threshold:
            selected.append(
                (region['chr'], gene['id'], gene['external_name'], region['location'])
            )
    return selected


def fetch_genes_near_telomeres(regions, threshold=proximity_threshold):
    """Query Ensembl for the genes in every region and keep those near the tip.

    A failed request is reported and skipped so one bad region does not abort
    the whole scan.
    """
    found = []
    for region in regions:
        region_str = f"{region['chr']}:{region['start']}-{region['end']}"
        url = f"{ensembl_endpoint}/overlap/region/{species}/{region_str}?feature=gene"

        try:
            response = requests.get(url, headers=json_headers, timeout=request_timeout)
        except requests.RequestException as error:
            print(f"Error fetching data for region {region_str}: {error}")
            continue

        if response.status_code != 200:
            print(f"Error fetching data for region {region_str}: {response.status_code}")
            continue
        found.extend(select_genes_near_telomere(response.json(), region, threshold))
    return found


# The guard keeps the helpers importable without triggering network calls;
# in a notebook or when run as a script __name__ is "__main__", so the cell
# still executes the full pipeline.
if __name__ == "__main__":
    # Fetch chromosome information from Ensembl
    chromosomes = fetch_chromosomes()

    # Define telomeric regions (both sides of each chromosome)
    telomeric_regions = build_telomeric_regions(chromosomes, telomeric_window)

    # Fetch gene information from Ensembl for each telomeric region
    genes_near_telomeres = fetch_genes_near_telomeres(telomeric_regions, proximity_threshold)

    # Print the list of protein-coding genes near telomeric regions with external gene names and location
    print("Protein-coding genes near telomeric regions with external gene names and location:")
    for chromosome, gene_id, external_gene_name, gene_location in genes_near_telomeres:
        print(f"Chromosome {chromosome}: Gene ID {gene_id}, External Gene Name {external_gene_name}, Location {gene_location}")


In [None]:
import csv

# Export the collected telomere-proximal genes (chromosome, gene ID, external
# gene name, location) to a CSV file: one header row, then one row per gene.
csv_file = "genes_near_telomeres.csv"
header_row = ["Chromosome", "Gene ID", "External Gene Name", "Location"]

with open(csv_file, "w", newline="") as handle:
    writer = csv.writer(handle)
    writer.writerow(header_row)
    # Unpack each record so a malformed (non 4-field) entry still fails loudly.
    writer.writerows(
        [chrom, ensembl_id, symbol, side]
        for chrom, ensembl_id, symbol, side in genes_near_telomeres
    )

print(f"Data exported to {csv_file}")

Data exported to genes_near_telomeres.csv


In [None]:
# Show the current working directory, i.e. where the relative path
# "genes_near_telomeres.csv" was written (a bare `pwd` relies on IPython automagic).
pwd

'/content'