In [None]:
#### Compile a dictionary of molecular biology terms, medical terms, species names, gene names, etc. to be used as search parameters on the Addgene website
#### Note: store generated files like these in the 0.local folder so they don't clutter GitHub (or add them to .gitignore)

## Installs
# in terminal:
# pip install pybiomart
# for progress bar:
# pip install tqdm

In [None]:
#### Imports
from pybiomart import Server
import requests
import gzip
import shutil
import re
import os
from tqdm import tqdm
from pathlib import Path

In [None]:
#==== Import species gene symbols from ensembl ====#

# Target species, one record per organism:
#   name     - scientific name, shown in progress messages
#   filename - short stem used for the temporary and output file names
#   url      - location of the gzipped GTF annotation to download
species_info = [
    # Ensembl
    dict(
        name="Homo sapiens",
        filename="hsapiens",
        url="http://ftp.ensembl.org/pub/release-110/gtf/homo_sapiens/Homo_sapiens.GRCh38.110.gtf.gz",
    ),
    dict(
        name="Mus musculus",
        filename="mmusculus",
        url="http://ftp.ensembl.org/pub/release-110/gtf/mus_musculus/Mus_musculus.GRCm39.110.gtf.gz",
    ),
    dict(
        name="Rattus norvegicus",
        filename="rnorvegicus",
        url="http://ftp.ensembl.org/pub/release-110/gtf/rattus_norvegicus/Rattus_norvegicus.mRatBN7.2.110.gtf.gz",
    ),
    dict(
        name="Danio rerio",
        filename="drerio",
        url="http://ftp.ensembl.org/pub/release-110/gtf/danio_rerio/Danio_rerio.GRCz11.110.gtf.gz",
    ),
    dict(
        name="Xenopus tropicalis",
        filename="xtropicalis",
        url="http://ftp.ensembl.org/pub/release-110/gtf/xenopus_tropicalis/Xenopus_tropicalis.UCB_Xtro_10.0.110.gtf.gz",
    ),

    # Ensembl Metazoa
    dict(
        name="Caenorhabditis elegans",
        filename="celegans",
        url="http://ftp.ensemblgenomes.org/pub/metazoa/release-60/gtf/caenorhabditis_elegans/Caenorhabditis_elegans.WBcel235.60.gtf.gz",
    ),
    dict(
        name="Drosophila melanogaster",
        filename="dmelanogaster",
        url="http://ftp.ensemblgenomes.org/pub/metazoa/release-60/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.46.60.gtf.gz",
    ),

    # Ensembl Fungi
    dict(
        name="Saccharomyces cerevisiae",
        filename="scerevisiae",
        url="http://ftp.ensemblgenomes.org/pub/fungi/release-60/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.60.gtf.gz",
    ),
    dict(
        name="Schizosaccharomyces pombe",
        filename="spombe",
        url="http://ftp.ensemblgenomes.org/pub/fungi/release-60/gtf/schizosaccharomyces_pombe/Schizosaccharomyces_pombe.ASM294v2.60.gtf.gz",
    ),

    # Ensembl Plants
    dict(
        name="Arabidopsis thaliana",
        filename="athaliana",
        url="http://ftp.ensemblgenomes.org/pub/plants/release-60/gtf/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.60.gtf.gz",
    ),
]

def download_and_extract_gene_symbols(info, out_dir="../0.local/scrape-addgene/gene-symbols/"):
    """Download one species' GTF annotation and save its unique gene symbols.

    The gzipped GTF at ``info['url']`` is streamed into a temporary file inside
    ``out_dir``. Rows whose feature column is ``gene`` are scanned for a
    ``gene_name "..."`` attribute, and the unique names are written sorted, one
    per line, to ``<out_dir>/<info['filename']>_gene_symbols.txt``.

    Args:
        info: Mapping with the keys ``"name"`` (used in progress messages),
            ``"filename"`` (stem for the temporary and output files) and
            ``"url"`` (location of the ``.gtf.gz`` file).
        out_dir: Directory for the temporary download and the output file.
            Created if missing. Defaults to the notebook's local scratch folder.

    Raises:
        Whatever ``requests`` raises for connection problems, timeouts or a
        non-2xx status (via ``raise_for_status``), and ``OSError`` for file
        system failures. The temporary download is removed in every case.
    """
    os.makedirs(out_dir, exist_ok=True)

    # os.path.join tolerates an out_dir given with or without a trailing slash
    gz_file = os.path.join(out_dir, f"{info['filename']}.gtf.gz")
    out_file = os.path.join(out_dir, f"{info['filename']}_gene_symbols.txt")

    gene_name_pattern = re.compile(r'gene_name "([^"]+)"')
    gene_symbols = set()

    try:
        print(f"\n⬇️ Downloading {info['name']} GTF...")
        # timeout: fail instead of hanging forever when the server stops responding
        with requests.get(info['url'], stream=True, timeout=60) as r:
            r.raise_for_status()
            # A missing Content-Length becomes None so tqdm shows an open-ended
            # counter instead of a bar with total=0.
            total_size = int(r.headers.get('content-length', 0)) or None
            with open(gz_file, 'wb') as f, tqdm(
                total=total_size, unit='B', unit_scale=True, unit_divisor=1024,
                desc=info['filename'], initial=0
            ) as bar:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        bar.update(len(chunk))

        # Read the compressed file directly as text; there is no need to write
        # the (potentially multi-gigabyte) decompressed GTF to disk first.
        with gzip.open(gz_file, 'rt', encoding='utf-8') as f:
            for line in f:
                # skip header comments and every feature that is not a gene row
                if line.startswith('#') or "\tgene\t" not in line:
                    continue
                match = gene_name_pattern.search(line)
                if match:
                    gene_symbols.add(match.group(1))
    finally:
        # Always drop the temporary download, even when a step above failed
        if os.path.exists(gz_file):
            os.remove(gz_file)

    with open(out_file, 'w') as f:
        f.writelines(f"{gene}\n" for gene in sorted(gene_symbols))

    print(f"✅ Saved {len(gene_symbols)} gene symbols to {out_file}")

# Fetch and extract gene symbols for each configured species, in list order
for species_entry in species_info:
    download_and_extract_gene_symbols(species_entry)


In [None]:
#=== E. coli ===#
# Handled in its own cell: unlike the other species, genes without a
# gene_name attribute fall back to their locus_tag attribute here.

# === Step 1: Define metadata ===
species = "Escherichia_coli_str_k_12_substr_mg1655"
filename = "ecoli"
url = "http://ftp.ensemblgenomes.org/pub/bacteria/release-60/gtf/bacteria_0_collection/escherichia_coli_str_k_12_substr_mg1655_gca_000005845/Escherichia_coli_str_k_12_substr_mg1655_gca_000005845.ASM584v2.60.gtf.gz"

# specify output directory
out_dir = "../0.local/scrape-addgene/gene-symbols/"
os.makedirs(out_dir, exist_ok=True)

# Build the working paths from this cell's own `filename`.
# (`info` only exists as a parameter of download_and_extract_gene_symbols,
# so referencing it here raised NameError on a fresh kernel.)
gz_file = out_dir + f"{filename}.gtf.gz"
gtf_file = out_dir + f"{filename}.gtf"
out_file = out_dir + f"{filename}_gene_symbols.txt"

# === Step 2: Download the GTF file ===
print(f"\n⬇️ Downloading {species} GTF...")
# timeout: fail instead of hanging forever when the server stops responding
with requests.get(url, stream=True, timeout=60) as r:
    r.raise_for_status()
    # A missing Content-Length becomes None so tqdm shows an open-ended counter
    total_size = int(r.headers.get('content-length', 0)) or None
    chunk_size = 8192
    with open(gz_file, 'wb') as f, tqdm(
        total=total_size, unit='B', unit_scale=True, unit_divisor=1024,
        desc=filename, initial=0
    ) as bar:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:
                f.write(chunk)
                bar.update(len(chunk))

# === Step 3: Decompress ===
print("🧬 Decompressing...")
with gzip.open(gz_file, 'rb') as f_in, open(gtf_file, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# === Step 4: Parse gene names ===
print("🔍 Extracting gene names...")
gene_names = set()

# explicit encoding so parsing does not depend on the machine's locale
with open(gtf_file, 'r', encoding='utf-8') as f:
    for line in f:
        # skip header comments and every feature that is not a gene row
        if line.startswith('#') or "\tgene\t" not in line:
            continue

        # Extract either gene_name or locus_tag
        name_match = re.search(r'gene_name "([^"]+)"', line)
        locus_match = re.search(r'locus_tag "([^"]+)"', line)

        if name_match:
            gene_names.add(name_match.group(1))
        elif locus_match:
            gene_names.add(locus_match.group(1))

# === Step 5: Save to file ===
with open(out_file, 'w') as f:
    for gene in sorted(gene_names):
        f.write(gene + '\n')

print(f"✅ Saved {len(gene_names)} gene names to {out_file}")

# Remove the temporary download and its decompressed copy
os.remove(gz_file)
os.remove(gtf_file)

In [None]:
#=== Import plasmid common feature names exported from snapgene library ====#

# Folder that holds the exported SnapGene feature files (presumably one .dna
# file per feature - confirm against the export). The previous value pointed
# at the output *file*, so .iterdir() could never succeed.
directory = Path("../0.local/scrape-addgene/snapgene-features/")

# The summary list is written into that same folder
out_file = "../0.local/scrape-addgene/snapgene-features/snapgene_common_features.txt"
out_name = Path(out_file).name

# Collect file names, skipping the summary file itself so that re-running the
# cell does not add "snapgene_common_features.txt" as a feature.
file_list = [f.name for f in directory.iterdir() if f.is_file() and f.name != out_name]
cleaned_list = [name.replace(".dna", "") for name in file_list]

#
print(cleaned_list)

# save to file
with open(out_file, 'w') as f:
    for feature in sorted(cleaned_list):
        f.write(feature + '\n')

print(f"✅ Saved {len(cleaned_list)} snapgene common features to {out_file}")

In [None]:
#=== Concatenate all files into a single file ===#

# The per-species lists and the SnapGene list are written to these folders by
# the cells above, so read them from there rather than from the working directory.
gene_symbols_dir = "../0.local/scrape-addgene/gene-symbols/"
snapgene_dir = "../0.local/scrape-addgene/snapgene-features/"

file_list = [
    gene_symbols_dir + "athaliana_gene_symbols.txt",
    gene_symbols_dir + "celegans_gene_symbols.txt",
    gene_symbols_dir + "dmelanogaster_gene_symbols.txt",
    gene_symbols_dir + "drerio_gene_symbols.txt",
    gene_symbols_dir + "ecoli_gene_symbols.txt",
    gene_symbols_dir + "hsapiens_gene_symbols.txt",
    gene_symbols_dir + "mmusculus_gene_symbols.txt",
    gene_symbols_dir + "rnorvegicus_gene_symbols.txt",
    gene_symbols_dir + "scerevisiae_gene_symbols.txt",
    gene_symbols_dir + "spombe_gene_symbols.txt",
    gene_symbols_dir + "xtropicalis_gene_symbols.txt",
    snapgene_dir + "snapgene_common_features.txt",
]

# use a set to remove duplicates
combined_items = set()

# Read each file, ignoring blank lines and surrounding whitespace
for file_path in file_list:
    with open(file_path, 'r') as f:
        combined_items.update(line.strip() for line in f if line.strip())

# Convert set back to sorted list
final_list = sorted(combined_items)

# Save to a new file (written to the current working directory, as before)
output_file = "combined_search_parameters.txt"
with open(output_file, 'w') as f:
    for item in final_list:
        f.write(item + '\n')

print(f"✅ Saved {len(final_list)} unique search parameters to {output_file}")