In [6]:
#=== Installs and imports ===#
import pipeline_utils
import os
import gzip
import requests
import csv

## 
import pandas as pd

##
import matplotlib.pyplot as plt
import matplotlib.patches as patches

In [2]:



#=== Identify converging genes ===#
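# Note: how proximity is measured is defined by pipeline_utils.find_facing_gene_pairs (TODO: document the exact definition and units here)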
def identify_converging_genes(fasta_url, gff3_url, output_dir, gene_proximity=1000, exclude_overlap=False, verbose=True):
    """Download a genome/annotation pair and return its facing gene pairs.

    Steps, in order:
      1. Create ``output_dir`` if needed and derive local file paths from the
         basenames of the two URLs.
      2. Call ``pipeline_utils.download_file`` for the FASTA and the GFF3.
      3. Parse the GFF3 with ``pipeline_utils.parse_gff3_genes``.
      4. Pass the parsed genes to ``pipeline_utils.find_facing_gene_pairs``.

    Args:
        fasta_url: URL of the genome FASTA file.
        gff3_url: URL of the GFF3 annotation file.
        output_dir: Directory the two files are downloaded into; created if
            it does not exist.
        gene_proximity: Proximity threshold forwarded as the second positional
            argument of ``find_facing_gene_pairs``. Defaults to 1000.
        exclude_overlap: Forwarded unchanged to ``find_facing_gene_pairs``.
        verbose: When true, print progress messages; also forwarded to
            ``find_facing_gene_pairs``.

    Returns:
        Whatever ``pipeline_utils.find_facing_gene_pairs`` returns, unchanged.
    """
    # Imported at call time so the function does not depend on the notebook's
    # import cell having bound the name `pipeline_utils`.
    import pipeline_utils

    ## Log the function and params
    # NOTE(review): the message below says "Minimum distance" while the caller
    # names this value `max_proximity_bp` — confirm which wording is correct.
    if verbose:
        print("Called identify_converging_genes function")
        print(f"Target fasta url: {fasta_url}")
        print(f"Target gff3 url: {gff3_url}")
        print(f"Output directory for genome files: {output_dir}")
        print(f"Minimum distance between gene pairs: {gene_proximity}")

    ## Resolve local destinations: <output_dir>/<basename of each URL>
    os.makedirs(output_dir, exist_ok=True)
    fasta_path = os.path.join(output_dir, os.path.basename(fasta_url))
    gff3_path = os.path.join(output_dir, os.path.basename(gff3_url))

    if verbose:
        print(f"Destination filepath for fasta: {fasta_path}")
        print(f"Destination filepath for gff3: {gff3_path}")

    ## Download the required genome and annotation files
    if verbose:
        print("Attempting file downloads")
    pipeline_utils.download_file(fasta_url, fasta_path)
    pipeline_utils.download_file(gff3_url, gff3_path)

    ## Parse the GFF3 file
    if verbose:
        print("Parsing GFF3 file")
    genes = pipeline_utils.parse_gff3_genes(gff3_path)

    ## Find facing pairs
    if verbose:
        print("Identifying converging gene pairs")
    # Forward the caller's threshold; this was previously hard-coded to 1000,
    # which silently ignored the `gene_proximity` argument.
    facing_pairs = pipeline_utils.find_facing_gene_pairs(
        genes, gene_proximity, exclude_overlap=exclude_overlap, verbose=verbose
    )

    return facing_pairs



######
## Run configuration: Ensembl release 111 human (GRCh38) primary assembly and annotation
fasta_url = "http://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
gff3_url = "http://ftp.ensembl.org/pub/release-111/gff3/homo_sapiens/Homo_sapiens.GRCh38.111.gff3.gz"
output_dir = "../0.local/generic-single-cell-pipeline/genome_data"
max_proximity_bp = 1000

## Identify converging gene pairs.
## The proximity threshold is passed explicitly so that editing `max_proximity_bp`
## above is what the function call actually receives.
gene_pairs = identify_converging_genes(
    fasta_url=fasta_url,
    gff3_url=gff3_url,
    output_dir=output_dir,
    gene_proximity=max_proximity_bp,
)

## Optional preview of the first few pairs (each gene is a dict, see the keys below)
#for g1, g2 in gene_pairs[:5]:
#    print(f"{g1['gene']} ({g1['chrom']}:{g1['start']}-{g1['end']} {g1['strand']}) ↔ {g2['gene']} ({g2['chrom']}:{g2['start']}-{g2['end']} {g2['strand']})")

Called identify_converging_genes function
Target fasta url: http://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
Target gff3 url: http://ftp.ensembl.org/pub/release-111/gff3/homo_sapiens/Homo_sapiens.GRCh38.111.gff3.gz
Output directory for genome files: ../0.local/generic-single-cell-pipeline/genome_data
Minimum distance between gene pairs: 1000
Destination filepath for fasta: ../0.local/generic-single-cell-pipeline/genome_data/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
Destination filepath for gff3: ../0.local/generic-single-cell-pipeline/genome_data/Homo_sapiens.GRCh38.111.gff3.gz
Attempting file downloads
Called download_file function
Target url: http://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
Destination filepath: ../0.local/generic-single-cell-pipeline/genome_data/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
File already exists, exiting
Called download_file fu

In [4]:
# Inspect the structure of the result: container type, pair type, gene record type
print(type(gene_pairs))
first_pair = gene_pairs[0]
print(type(first_pair))
first_gene = first_pair[0]
print(type(first_gene))

# Show the first two pairs, then the two gene records of the first pair
print(first_pair)
print(gene_pairs[1])
print(first_gene)
print(first_pair[1])

<class 'list'>
<class 'tuple'>
<class 'dict'>
({'id': 'gene:ENSG00000156875', 'gene': 'MFSD14A', 'chrom': '1', 'start': '100038095', 'end': '100083377', 'strand': '+'}, {'id': 'gene:ENSG00000156876', 'gene': 'SASS6', 'chrom': '1', 'start': '100083563', 'end': '100132955', 'strand': '-'})
({'id': 'gene:ENSG00000187608', 'gene': 'ISG15', 'chrom': '1', 'start': '1001138', 'end': '1014540', 'strand': '+'}, {'id': 'gene:ENSG00000156876', 'gene': 'SASS6', 'chrom': '1', 'start': '100083563', 'end': '100132955', 'strand': '-'})
{'id': 'gene:ENSG00000156875', 'gene': 'MFSD14A', 'chrom': '1', 'start': '100038095', 'end': '100083377', 'strand': '+'}
{'id': 'gene:ENSG00000156876', 'gene': 'SASS6', 'chrom': '1', 'start': '100083563', 'end': '100132955', 'strand': '-'}


In [7]:
# Export the converging gene pairs to CSV: one row per pair, with the first
# gene's fields prefixed "1_" and the second gene's fields prefixed "2_".
output_csv_path = "converging_genes.csv"

# Fail before opening the file so an existing CSV is not truncated when there
# is nothing to write (the header is derived from the first pair).
if not gene_pairs:
    raise ValueError("gene_pairs is empty; nothing to write to converging_genes.csv")

# Column order is taken from the first pair and applied to every row.
keys1 = list(gene_pairs[0][0].keys())
keys2 = list(gene_pairs[0][1].keys())
header = [f"1_{k}" for k in keys1] + [f"2_{k}" for k in keys2]

# newline="" is what the csv module expects for files it writes; the encoding
# is pinned so the output does not depend on the platform default.
with open(output_csv_path, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)

    # Look values up by key rather than relying on dict.values() order, so each
    # value lands under its own header even if a record's key order differs.
    # A key missing from a record yields an empty cell; keys absent from the
    # first pair are not exported.
    for dict1, dict2 in gene_pairs:
        row = [dict1.get(k, "") for k in keys1] + [dict2.get(k, "") for k in keys2]
        writer.writerow(row)