In [2]:
import requests
import pandas as pd
import time

# Define the file path to your SNP data
snp_file = "Filtered_SNP_Data.csv"  # Make sure this file contains CHROM, POS, SNP_ID
output_file = "SNP_with_Genes.csv"

# Load SNP data
snp_df = pd.read_csv(snp_file)

# Function to check which genome assembly Ensembl is using
def check_ensembl_assembly():
    url = "https://rest.ensembl.org/info/assembly/human?content-type=application/json"
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            data = response.json()
            print(f"✅ Ensembl is using genome assembly: {data['assembly_name']}")
            return data['assembly_name']
        else:
            print(f"⚠️ Ensembl API returned status: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Error: Unable to connect to Ensembl API - {e}")

# Function to convert GRCh37 coordinates to GRCh38
def convert_grch37_to_grch38(chrom, pos):
    url = f"https://rest.ensembl.org/map/human/GRCh37/{chrom}:{pos}:{pos}/GRCh38?content-type=application/json"
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if 'mappings' in data and data['mappings']:
                return data['mappings'][0]['mapped']['start']  # New GRCh38 position
    except requests.exceptions.RequestException:
        return None
    return None

# Function to query Ensembl API for the closest gene
def get_closest_gene(chrom, pos, use_grch37=False):
    base_url = "https://grch37.rest.ensembl.org" if use_grch37 else "https://rest.ensembl.org"
    url = f"{base_url}/overlap/region/human/{chrom}:{pos}-{pos}?feature=gene;content-type=application/json"

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data:
                return data[0].get('external_name', 'No_Gene_Found')  # Return gene name if found
    except requests.exceptions.RequestException:
        return "API_Error"

    return "No_Gene_Found"

# Check if Ensembl is using GRCh37 or GRCh38
ensembl_assembly = check_ensembl_assembly()

# If SNPs are from GRCh37, convert them to GRCh38
if ensembl_assembly == "GRCh38":
    print("🔄 Converting GRCh37 SNP positions to GRCh38...")
    snp_df["GRCh38_POS"] = snp_df.apply(lambda x: convert_grch37_to_grch38(x["CHROM"], x["POS"]) 
                                        if pd.notna(x["POS"]) else None, axis=1)
    # Use new GRCh38 positions for annotation
    snp_df["Final_POS"] = snp_df["GRCh38_POS"].fillna(snp_df["POS"])  # Keep original POS if no conversion
else:
    print("🛑 Using GRCh37 for annotation (no conversion).")
    snp_df["Final_POS"] = snp_df["POS"]

# Apply the function to annotate genes
print("🔍 Annotating SNPs with closest genes...")
snp_df["associated_gene"] = snp_df.apply(lambda x: get_closest_gene(x["CHROM"], x["Final_POS"], 
                                                                    use_grch37=(ensembl_assembly != "GRCh38")), axis=1)
time.sleep(1)  # Adding delay to prevent API rate limits

# Save results
snp_df.to_csv(output_file, index=False)

print(f"✅ Results saved to {output_file}")


✅ Ensembl is using genome assembly: GRCh38.p14
🛑 Using GRCh37 for annotation (no conversion).
🔍 Annotating SNPs with closest genes...
✅ Results saved to SNP_with_Genes.csv
