In [9]:
import csv

### Selection of genomic assemblies and their downloading

Parsing the list of genome assemblies obtained from the NCBI FTP site in June 2023

https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt

See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file.

#assembly_accession	bioproject	biosample	wgs_master	refseq_category	taxid	species_taxid	organism_name	infraspecific_name	isolate	version_status	assembly_level	release_type	genome_rep	seq_rel_date	asm_name	asm_submitter	gbrs_paired_asm	paired_asm_comp	ftp_path	excluded_from_refseq	relation_to_type_material	asm_not_live_date	assembly_type	group	genome_size	genome_size_ungapped	gc_percent	replicon_count	scaffold_count	contig_count	annotation_provider	annotation_name	annotation_date	total_gene_count	protein_coding_gene_count	non_coding_gene_count	pubmed_id


In [2]:
# Index the NCBI assembly summary table.
#   ftp   : organism_name -> list of ftp_path values of every retained assembly
#   rep   : organism_name -> ftp_path of the assembly flagged 'representative genome'
#   latin : last component of ftp_path -> [organism_name, species_taxid]
#   names : organism_name of every retained row (one entry per assembly)
ftp = {}
latin = {}
names = []
rep = {}
with open('assembly_summary_genbank.txt') as summary:
    for record in summary:
        # skip header/comment rows and any row mentioning 'ExonCaptKit'
        if record.startswith('#') or 'ExonCaptKit' in record:
            continue
        cols = record.strip().split('\t')
        # keep only rows with excluded_from_refseq == 'na' and genome_rep == 'Full'
        if cols[20] != 'na' or cols[13] != 'Full':
            continue
        organism = cols[7]
        path = cols[19]
        if cols[4] == 'representative genome':
            rep[organism] = path
        ftp.setdefault(organism, []).append(path)
        latin[path.split('/')[-1]] = [organism, cols[6]]
        names.append(organism)
print(len(names),len(set(names)),len(rep))


2066 622 577


In [44]:
# Write down the path to the assembly, 'ftp_path' column.
# One download URL per line: <ftp_path>/<last path component>_genomic.fna.gz
with open('FTP_for_genomes','w') as out:
    # `names` holds one entry per retained assembly, so a species with several
    # assemblies occurs several times. Iterate over the unique species (in
    # first-seen order) so that every URL is written exactly once.
    for item in dict.fromkeys(names):
        if item in rep and item != 'Canis lupus familiaris':
            # species with a representative genome: take only that assembly
            targets = [rep[item]]
        else:
            # no representative genome, or the dog, which is deliberately
            # exempted here: take every retained assembly of the species
            targets = ftp[item]
        for path in targets:
            out.write(path + '/' + path.split('/')[-1] + '_genomic.fna.gz\n')

# Run wget to download assemblies from the file
# for item in `cat FTP_for_genomes`; do wget "$item"; done

In [20]:
# Make the file (Namesystem) connecting genome IDs with species names and their taxid
# GCF_000151735.1_Cavpor3.0	Cavia porcellus	10141

genomes = []
with open('FTP_for_genomes') as file1:
    for line in file1:
        # a blank line has no '/' and would make the [-2] lookup fail
        if line.strip():
            # URL layout is .../<genome ID>/<genome ID>_genomic.fna.gz,
            # so the second-to-last path component is the genome ID
            genomes.append(line.split('/')[-2])

# The download list may name the same assembly more than once; keep one row
# per genome ID (first-seen order) so downstream per-genome loops do not repeat.
genomes = list(dict.fromkeys(genomes))

with open('Namesystem','w') as out:
    for item in genomes:
        # latin[item] is [organism_name, species_taxid]
        out.write(item + '\t' + '\t'.join(latin[item]) + '\n')

### Retrieving approximate genomic coordinates for NEAT1 and MALAT1 orthologs

In [1]:
#### Set of commands which were run in command line outside of python

# cut -f 1 Namesystem > liblist

# for i in `cat liblist`; do makeblastdb -in ./downloads/"$i"_genomic.fna -dbtype 'nucl'; 
# blastn -query known_NEAT1_MALAT1_and_features.fasta -db ./downloads/"$i"_genomic.fna -task blastn
# -out ./BLAST_features/"$i".txt -max_target_seqs 3  
# -outfmt "6 qseqid sseqid length pident qstart qend sstart send evalue bitscore" -max_hsps 3; done

In [2]:
# Read the genome IDs (one per line) produced by `cut -f 1 Namesystem`.
# splitlines() plus the emptiness filter keeps the last ID even when the file
# lacks a trailing newline and ignores stray blank lines.
with open('liblist') as file1:
    libs = [entry for entry in file1.read().splitlines() if entry]

In [4]:
# Per-assembly results of the coordinate search.
# NEAT[lib] / MALAT[lib] is a dict holding 'contig' and, when they could be
# determined, 'start' and 'stop' (coordinates are kept as strings, exactly as
# read from the BLAST table).
NEAT = {}
MALAT = {}
# headers of sequences in 'known_NEAT1_MALAT1_and_features.fasta' grouped by types
# group of known NEAT1 orthologs
a = ['Human_NEAT1','Marsupials_NEAT1','Mouse_NEAT1']
# group of known MALAT1 orthologs
b = ['Human_MALAT1','Mouse_MALAT1']
# group of NEAT1 tRNA-like structures
c = ['AfterTriplex_Human_NEAT1','AfterTriplex_Marsupials_NEAT1','AfterTriplex_Mouse_NEAT1']
# group of MALAT1 promoter areas
d = ['MALAT_start_human','MALAT_start_mouse','MALAT_Tata']
# group of NEAT1 promoter areas
d1 = ['Marsupials_NEAT1_start','NEAT_Tata']

# Parsing blastn outputs to identify approximate coordinates of orthologs.
# Rows are expected in the tabular layout requested in the blastn command
# documented earlier in this notebook:
#   dat[0] qseqid  dat[1] sseqid  dat[2] length  dat[3] pident  dat[4] qstart
#   dat[5] qend    dat[6] sstart  dat[7] send    dat[8] evalue  dat[9] bitscore
for lib in libs:
    with open('./BLAST_features/' + lib + '.txt') as file1:
        # All dictionaries below are keyed by subject sequence ID (dat[1]).
        N = {}        # summed alignment length of hits from NEAT1 orthologs
        M = {}        # summed alignment length of hits from MALAT1 orthologs
        N_start = {}  # sstart of a NEAT1 ortholog hit that begins at query position 1
        M_start = {}  # sstart of a MALAT1 ortholog hit that begins at query position 1
        N_stop = {}   # sstart of the first NEAT1 'AfterTriplex' hit beginning at query position 1
        M_stop = {}   # same for the MALAT1 'AfterTriplex' query
        NTata = {}    # sstart of the first NEAT1 promoter hit longer than 30
        MTata = {}    # sstart of the first MALAT1 promoter hit longer than 30
        NTrip = {}    # send of the first 'NEAT_triplex' hit whose qend is 52
        MTrip = {}    # send of the first 'MALAT_triplex' hit whose qend is 92

        for line in file1:
            dat = line.strip().split('\t')
            if dat[0] in a:
                if dat[1] not in N:
                    N[dat[1]] = 0
                N[dat[1]] += int(dat[2])
                # no "not in" guard here: a later qstart == 1 hit overwrites an earlier one
                if dat[4] == '1':
                    N_start[dat[1]] = dat[6]
            if dat[0] in b:
                if dat[1] not in M:
                    M[dat[1]] = 0
                M[dat[1]] += int(dat[2])
                # same overwrite behaviour as for N_start
                if dat[4] == '1':
                    M_start[dat[1]] = dat[6]
            # promoter features: only the first sufficiently long hit per subject is kept
            if dat[0] in d1 and int(dat[2]) > 30 and dat[1] not in NTata:
                NTata[dat[1]] = dat[6]
            if dat[0] in d and int(dat[2]) > 30 and dat[1] not in MTata:
                MTata[dat[1]] = dat[6]
            # the MALAT1 'AfterTriplex' query is matched by its literal header
            # (it is not part of any of the lists above)
            if dat[0] == 'AfterTriplex_Human_MALAT1' and dat[4] == '1' and dat[1] not in M_stop:
                M_stop[dat[1]] = dat[6]
            if dat[0] in c and dat[4] == '1' and dat[1] not in N_stop:
                N_stop[dat[1]] = dat[6]
            # NOTE(review): 52 and 92 are presumably the full lengths of the
            # triplex queries, i.e. the hit must reach the query end - TODO confirm
            if dat[0] == 'NEAT_triplex' and dat[5] == '52' and dat[1] not in NTrip:
                NTrip[dat[1]] = dat[7]
            if dat[0] == 'MALAT_triplex' and dat[5] == '92' and dat[1] not in MTrip:
                MTrip[dat[1]] = dat[7]

        # Summarise the results of parsing
        # Pick, per gene, the subject sequence with the largest summed alignment length.
        # NOTE: [0] raises IndexError when the table has no NEAT1 (or no MALAT1)
        # ortholog hit at all, because N (or M) is then empty.
        Ncont = sorted(N.keys(), key=lambda kv: N[kv],reverse=True)[0]
        Mcont = sorted(M.keys(), key=lambda kv: M[kv],reverse=True)[0]
        if Ncont == Mcont:
            NEAT[lib] = {'contig': Ncont}
            MALAT[lib] = {'contig': Mcont}
        else:
            # The two genes disagree on the best subject sequence: the MALAT1
            # one is used for both, and the NEAT1 record is labelled accordingly.
            Ncont = Mcont
            NEAT[lib] = {'contig': 'FromMALAT_' + Ncont}
            MALAT[lib] = {'contig': Mcont}

        # From here on Ncont == Mcont, so Ncont is used for the MALAT1 lookups too.
        # NEAT1 start: promoter hit preferred, otherwise the ortholog hit at query position 1
        if Ncont in NTata:
            NEAT[lib]['start'] = NTata[Ncont]
        else:
            if Ncont in N_start:
                NEAT[lib]['start'] = N_start[Ncont]
            else:
                print(lib,'No NEAT start')
        # MALAT1 start: same preference order
        if Ncont in MTata:
            MALAT[lib]['start'] = MTata[Ncont]
        else:
            if Ncont in M_start:
                MALAT[lib]['start'] = M_start[Ncont]
            else:
                print(lib,'No MALAT start')
        # NEAT1 stop: 'AfterTriplex' hit preferred, otherwise the end of the triplex hit
        if Ncont in N_stop:
            NEAT[lib]['stop'] = N_stop[Ncont]
        else:
            if Ncont in NTrip:
                NEAT[lib]['stop'] = NTrip[Ncont]
            else:
                print(lib,'No NEAT stop')
        # MALAT1 stop: same preference order
        if Ncont in M_stop:
            MALAT[lib]['stop'] = M_stop[Ncont]
        else:
            if Ncont in MTrip:
                MALAT[lib]['stop'] = MTrip[Ncont]
            else:
                print(lib,'No MALAT stop')

GCA_000002285.4_Dog10K_Boxer_Tasha No MALAT start
GCA_000152225.2_Pcap_2.0 No MALAT start
GCA_000164785.2_C_hoffmanni-2.0.1 No NEAT start
GCA_000164785.2_C_hoffmanni-2.0.1 No MALAT start
GCA_000164785.2_C_hoffmanni-2.0.1 No NEAT stop
GCA_000181375.1_ASM18137v1 No NEAT start
GCA_000181375.1_ASM18137v1 No MALAT start
GCA_000181375.1_ASM18137v1 No NEAT stop
GCA_000181375.1_ASM18137v1 No MALAT stop
GCA_000181415.1_ASM18141v1 No NEAT start
GCA_000181415.1_ASM18141v1 No MALAT start
GCA_000181415.1_ASM18141v1 No NEAT stop
GCA_000181415.1_ASM18141v1 No MALAT stop
GCA_000331495.1_Beagle No NEAT start
GCA_000331495.1_Beagle No MALAT start
GCA_000331495.1_Beagle No NEAT stop
GCA_000331495.1_Beagle No MALAT stop
GCA_000465285.1_ASM46528v1 No NEAT start
GCA_000465285.1_ASM46528v1 No NEAT stop
GCA_000688575.1_CavAp1.0 No NEAT start
GCA_000688575.1_CavAp1.0 No MALAT start
GCA_000688575.1_CavAp1.0 No MALAT stop
GCA_000754665.1_Bison_UMD1.0 No NEAT start
GCA_000754665.1_Bison_UMD1.0 No NEAT stop
GCA_00

GCA_011754075.1_ASM1175407v1 No NEAT start
GCA_011754075.1_ASM1175407v1 No NEAT stop
GCA_012044875.1_ASM1204487v1 No MALAT start
GCA_012045015.1_ASM1204501v1 No MALAT start
GCA_013276365.2_UNSW_CanFamBas_1.2 No MALAT start
GCA_014363405.1_P_pselaphon_scaffold_01 No MALAT start
GCA_014364545.1_U_thibetanus_scaffold_01 No NEAT start
GCA_014364545.1_U_thibetanus_scaffold_01 No NEAT stop
GCA_014441545.1_ROS_Cfam_1.0 No MALAT start
GCA_015711505.1_Sylvilagus_bachmani_HiC No NEAT start
GCA_015711505.1_Sylvilagus_bachmani_HiC No MALAT start
GCA_016801295.1_DSBC_Mcra_1.0 No NEAT start
GCA_016801295.1_DSBC_Mcra_1.0 No NEAT stop
GCA_016801295.1_DSBC_Mcra_1.0 No MALAT stop
GCA_017311385.1_HRRL_Platanista_gangetica_1.0 No NEAT start
GCA_017311385.1_HRRL_Platanista_gangetica_1.0 No MALAT start
GCA_017311385.1_HRRL_Platanista_gangetica_1.0 No NEAT stop
GCA_017311455.1_Otocyon_megalotis_TS305_17_09_2019 No NEAT start
GCA_017311455.1_Otocyon_megalotis_TS305_17_09_2019 No MALAT start
GCA_017311455.1_Ot

GCF_018350155.1_O.geoffroyi_Oge1_pat1.0 No MALAT stop
GCF_019393635.1_mDroGli1.pri No MALAT start
GCF_900094665.1_CAROLI_EIJ_v1.1 No MALAT stop
GCF_902635505.1_mSarHar1.11 No MALAT start
GCF_902729225.1_Ma_sr-lr_union100 No MALAT start


In [5]:
# Assemblies whose NEAT1 record lacks the start and/or the stop coordinate
left = [lib for lib in NEAT
        if not ('start' in NEAT[lib] and 'stop' in NEAT[lib])]

# Among those, look for at least one long (> 5 kb) alignment to a known
# ortholog; such assemblies are worth inspecting by hand.
manual = []
for lib in left:
    with open('./BLAST_features/' + lib + '.txt') as hits:
        # any() stops reading at the first sufficiently long hit
        has_long_patch = any(int(row.strip().split('\t')[2]) > 5000 for row in hits)
    if has_long_patch:
        manual.append(lib)

print(len(manual), len(left))

# Everything else in `left` shows no clear homology and is dropped from the analysis
rescued = set(manual)
to_exclude = [lib for lib in left if lib not in rescued]

83 171


In [None]:
# Write the results of the coordinate search into a tab-separated file.
#
# Columns:
# GenomeID Neat1ContigAccession Neat1Start Neat1Stop Neat1ApproxLength
# Malat1ContigAccession Malat1Start Malat1Stop Malat1ApproxLength
#
# Example row:
# GCA_000283155.1_CerSimSim1.0	JH767791.1	9929556	9952059	22503	JH767791.1	9997904	10005062	7158

def _gene_columns(record):
    """Return the four output columns [contig, start, stop, approx_length] of one gene.

    record -- dict with 'contig' and optionally 'start' / 'stop' (strings),
              or None when the assembly has no record for this gene.

    Missing values are written as 'Nan'. A missing record yields four 'Nan'
    fields so that every row keeps the same number of columns.
    """
    if record is None:
        return ['Nan'] * 4
    start = record.get('start', 'Nan')
    stop = record.get('stop', 'Nan')
    if 'start' in record and 'stop' in record:
        # abs() because the gene may lie on either strand
        length = str(abs(int(start) - int(stop)))
    else:
        length = 'Nan'
    return [record['contig'], start, stop, length]


with open('coordinates_algorithmic','w') as out:
    for lib in libs:
        # assemblies without clear homology are left out of the table
        if lib in to_exclude:
            continue
        row = [lib] + _gene_columns(NEAT.get(lib)) + _gene_columns(MALAT.get(lib))
        out.write('\t'.join(row) + '\n')

### Manual curation of approximate coordinates
After that, we manually curated the cases where the start or stop coordinate was missing by inspecting the results of the blastn search. In some cases the contig assembly was incomplete and the gene was lacking its 5' or 3' end; if the majority of the gene was preserved, we set the missing coordinate to 0 and kept the ortholog in the dataset. In other cases we could clearly see many patches of homology to known orthologs and only the promoter area did not give a hit in the blastn search; if we could approximate the location of the ortholog, we set the start coordinate manually.

### Assigning taxonomic tree to the species with identified orthologs after manual curation

In [29]:
# 'ManualCuration' is 'coordinates_algorithmic' with the start/stop coordinates
# that were added by hand. Each non-comment row becomes
#   sel[<first column>] = {'coor': [<remaining columns>]}
sel = {}
with open('ManualCuration') as curated:
    for row in curated:
        if row.startswith('#'):
            continue
        fields = row.strip().split('\t')
        sel[fields[0]] = {'coor': fields[1:]}
print(len(sel))

544


In [30]:
# Identifiers of the known (reference) NEAT1 and MALAT1 orthologs
ref_names = [
    # NEAT1
    'NEAT1_Mouse',
    'NEAT1_Marsupials',
    'NEAT1_Human',
    # MALAT1
    'MALAT1_Mouse',
    'MALAT1_Human',
]

In [31]:
# Attach the scientific name and taxid from Namesystem to every selected
# genome; rows whose ID is one of the reference orthologs get a fresh entry
# holding just those two fields.
with open('Namesystem') as namesystem:
    for row in namesystem:
        fields = row.strip().split('\t')
        genome_id = fields[0]
        if genome_id in sel:
            sel[genome_id]['sc_name'] = fields[1]
            sel[genome_id]['taxid'] = fields[2]
        if genome_id in ref_names:
            entry = sel[genome_id] = {}
            entry['sc_name'] = fields[1]
            entry['taxid'] = fields[2]
# one taxid per entry of sel, in sel's iteration order (may contain repeats)
taxid_list = [entry['taxid'] for entry in sel.values()]

The files used below (*.dmp) are part of the NCBI taxonomy package:
https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

In [33]:
# Walk up the taxonomy tree from a taxon to the root
def parse_nodes(taxid, data_dict):
    """Collect the names of a taxon and its ancestors, keyed by rank.

    Relies on the module-level lookup tables ``parent_dict`` (taxid -> parent
    taxid), ``rank_dict`` (taxid -> rank) and ``names_dict`` (taxid -> name).

    taxid     -- taxon to start from
    data_dict -- dict that is filled in place with rank -> name

    Returns ``data_dict`` after the walk, or a new empty dict when the
    starting taxon is missing from ``parent_dict`` or ``rank_dict``. The walk
    ends at the node that is its own parent; that node is not recorded. When
    several visited nodes share a rank, the one closest to the root wins.
    """
    if taxid not in parent_dict or taxid not in rank_dict:
        return {}
    node = taxid
    parent = parent_dict[node]
    while parent != node:
        data_dict[rank_dict[node]] = names_dict[node]
        node = parent
        parent = parent_dict[node]
    return data_dict

# Ranks reported for every genome, from the most specific to the most general
categories = ["species", "genus", "family", "order", "class"]

In [34]:
# Collect common and scientific names from NCBI files and hierarchy of taxa

# Directory holding the unpacked NCBI taxdump; change this single line to
# point at the local copy.
TAXONOMY_DIR = '/home/ksenia/databases/Taxonomy'

comname = {}      # taxid -> common name (only for taxids in taxid_list)
rank_dict = {}    # taxid -> rank
parent_dict = {}  # taxid -> parent taxid
names_dict = {}   # taxid -> scientific name

# names.dmp is scanned row by row, so test membership against a set rather
# than scanning the taxid_list list for every row
wanted_taxids = set(taxid_list)

# Splitting a .dmp row on tabs leaves the '|' separators at the odd indices,
# so the fields read here sit at indices 0, 2, 4 and 6.
with open(TAXONOMY_DIR + '/names.dmp') as names_file:
    # The reader gets its own name: calling it `names` would overwrite the
    # list of species names built in the first cell and break re-running
    # the download-list cell in the same kernel.
    names_reader = csv.reader(names_file,delimiter='\t',quoting=csv.QUOTE_NONE)
    for line in names_reader:
        if line[6] == 'scientific name':
            names_dict[line[0]] = line[2]
        # substring test: matches every name class containing 'common name';
        # when a taxid has several such rows, the last one read is kept
        if 'common name' in line[6]:
            if line[0] in wanted_taxids:
                comname[line[0]] = line[2]

with open(TAXONOMY_DIR + '/nodes.dmp','r') as nodes_file:
    nodes_reader = csv.reader(nodes_file,delimiter='\t',quoting=csv.QUOTE_NONE)
    for line in nodes_reader:
        parent_dict[line[0]] = line[2]
        rank_dict[line[0]] = line[4]

In [36]:
# For every taxid of the collection, record the name at each rank listed in
# `categories`; ranks that the lineage does not provide are filled with 'NA'.
taxonomy = {}
for item in taxid_list:
    ranks = taxonomy[item] = {}
    lineage = parse_nodes(item, {})
    for cat in categories:
        # an empty lineage (unknown taxid) yields 'NA' for every rank
        ranks[cat] = lineage.get(cat, 'NA')


In [38]:
# Create the first metadata file
# Header fields are joined with tabs in one place, so no separator can be
# forgotten between the rank columns and the coordinate columns.
# NOTE(review): 'coor' holds whatever columns follow the genome ID in
# ManualCuration; if that file still carries the two approximate-length
# columns written to coordinates_algorithmic, this header is two names
# short - TODO confirm against the file.
header = (['#File_name', 'Scientific_name', 'Common_name', 'taxid']
          + categories[::-1]
          + ['NEAT1Contig', 'NEAT1start', 'NEAT1stop',
             'MALAT1Contig', 'MALAT1start', 'MALAT1stop'])

with open('./Key_Files/Metadata','w') as out:
    out.write('\t'.join(header) + '\n')
    for item in sel:
        entry = sel[item]
        taxid = entry['taxid']
        # 'Nan' when NCBI lists no common name for this taxid
        row = [item, entry['sc_name'], comname.get(taxid, 'Nan'), taxid]
        # ranks are written from the most general to the most specific
        row += [taxonomy[taxid][cat] for cat in categories[::-1]]
        # entries created for the reference orthologs have no 'coor' key;
        # they are written without coordinate columns instead of raising KeyError
        row += entry.get('coor', [])
        out.write('\t'.join(row) + '\n')