In [2]:
import yaml
import itertools
import re
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from src.config import read_yaml_config, get_value_from_config
from src.io import read_fasta_as_dict, get_augustus_proteins, read_predictions, read_predictions_from_fasta
from src.visual import visualize_predictions, visualize_single_prediction


warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [3]:
config = read_yaml_config("config.yaml")

In [4]:
TARGET_SPECIES = "Andrena_dorsata"

In [5]:
TARGET_SPECIES = TARGET_SPECIES.replace(" ", "_")
genome_file = Path(config['data_locations']['genomes_dir']) / f'{TARGET_SPECIES}.fna'
proteome_file = Path(config['data_locations']['proteomes_dir']) / f'{TARGET_SPECIES}.faa'
predictions_file = Path(config['data_locations']['scored_predictions_dir']) / f'{TARGET_SPECIES}.tsv'
genes_file = Path(config['data_locations']['gene_predictions_dir']) / f'{TARGET_SPECIES}.fna'
annotation_file = Path(config['data_locations']['gene_annotations_dir']) / f'{TARGET_SPECIES}.gff'
false_positives_file = Path(config['data_locations']['false_positives_dir']) / f'{TARGET_SPECIES}.faa'

In [6]:
genome_map = read_fasta_as_dict(genome_file)
proteome_map = read_fasta_as_dict(proteome_file)
genes_map = read_fasta_as_dict(genes_file)
fp_map = read_fasta_as_dict(false_positives_file)
predictions_data = read_predictions(predictions_file)

In [7]:
predictions_data.head(15)

Unnamed: 0,t_record_id,g_record_id,pos_count,score,model_name,frame,record_description,prediction_mask,sequence
0,OV815488.1_start=69205000_end=69235000_frame=+1,OV815488.1_start=69205000_end=69235000_frame=+,112,282.0,HybridModel_1000_cuda_v1,1,OV815488.1_start=69205000_end=69235000_frame=+...,"[False, False, False, False, False, False, Fal...",MDTSGINVHVQNRREYKPKMCPF*RILVPNYLLCN*SLKVLIRHFD...
1,OV815488.1_start=69205000_end=69235000_frame=+3,OV815488.1_start=69205000_end=69235000_frame=+,100,282.0,HybridModel_1000_cuda_v1,3,OV815488.1_start=69205000_end=69235000_frame=+...,"[False, False, False, False, False, False, Fal...",GHVGH*RPRPKQKGI*AKDVSLLANISA*LSAM*LISESFNQTL*S...
2,OV815488.1_start=69205000_end=69235000_frame=+2,OV815488.1_start=69205000_end=69235000_frame=+,80,282.0,HybridModel_1000_cuda_v1,2,OV815488.1_start=69205000_end=69235000_frame=+...,"[False, False, False, False, False, False, Fal...",WTRRALTSTSKTEGNISQRCVPFSEY*CLTICYVIDL*KF*SDTLI...
3,CAKMYH010000097.1_start=145000_end=175000_fram...,CAKMYH010000097.1_start=145000_end=175000_frame=+,16,16.0,HybridModel_1000_cuda_v1,1,CAKMYH010000097.1_start=145000_end=175000_fram...,"[False, False, False, False, False, False, Fal...",PGTRQRMGIDENHESNRQCQQVQSAARGMLRQVAAYIVFEESHN*R...
4,CAKMYH010000097.1_start=295000_end=313107_fram...,CAKMYH010000097.1_start=295000_end=313107_frame=+,16,16.0,HybridModel_1000_cuda_v1,2,CAKMYH010000097.1_start=295000_end=313107_fram...,"[False, False, False, False, False, False, Fal...",GLINFVAPGRRRVGRNWDE*AFIQR*EASEF*TPFPPRYDGKGVEF...
5,OV815488.1_start=7825000_end=7855000_frame=-1,OV815488.1_start=7825000_end=7855000_frame=-,13,13.0,HybridModel_1000_cuda_v1,1,OV815488.1_start=7825000_end=7855000_frame=-1 ...,"[False, False, False, False, False, False, Fal...",*RNG*RRLIRQNGKWKKRNGIL*SFVTLAAFRIRVLLLVINVFA*A...
6,OV815488.1_start=40615000_end=40645000_frame=+1,OV815488.1_start=40615000_end=40645000_frame=+,12,11.0,HybridModel_1000_cuda_v1,1,OV815488.1_start=40615000_end=40645000_frame=+...,"[False, False, False, False, False, False, Fal...",ERIIRLKFPLVARADPPGVQLFSDICLQATT**GRAIWKHYPRL*F...
7,OV815487.1_start=65965000_end=65995000_frame=-3,OV815487.1_start=65965000_end=65995000_frame=-,8,7.0,HybridModel_1000_cuda_v1,3,OV815487.1_start=65965000_end=65995000_frame=-...,"[False, False, False, False, False, False, Fal...",LLRRLLIYICRCILNGKCLISC*LDLVIIRRTGIVGVCYGRLRKSS...
8,OV815488.1_start=25375000_end=25405000_frame=+3,OV815488.1_start=25375000_end=25405000_frame=+,7,6.0,HybridModel_1000_cuda_v1,3,OV815488.1_start=25375000_end=25405000_frame=+...,"[False, False, False, False, False, False, Fal...",EIGENPTYRGIARGKERSDLRRDSFDCLLGEVKTALRVNEGLSFAE...
9,OV815487.1_start=77215000_end=77245000_frame=-1,OV815487.1_start=77215000_end=77245000_frame=-,5,4.0,HybridModel_1000_cuda_v1,1,OV815487.1_start=77215000_end=77245000_frame=-...,"[False, False, False, False, False, False, Fal...",NDTVSLTYGGGGVWGGGGLYLRGCVLIPIYQSSDRNVKGSDTSHMC...


In [8]:
SCORE_THRESHOLD = 15

selected_data = predictions_data.query("pos_count > @SCORE_THRESHOLD")

# Visualize predictions

In [9]:
SHOW_NUCLEOTIDE = True
SHOW_TRANSLATED = True
SHOW_MISSING_FRAMES = False
NUCL_VISUAL_OFFSET = np.array((1500, 500))
AA_VISUAL_OFFSET = NUCL_VISUAL_OFFSET // 3


visualize_predictions(
    selected_data,
    genome_map=genome_map,
    proteome_map=proteome_map,
    show_translated=SHOW_TRANSLATED,
    show_nucleotide=SHOW_NUCLEOTIDE,
    show_missing_frames=SHOW_MISSING_FRAMES,
    nucleotide_offset=NUCL_VISUAL_OFFSET,
    translated_offset=AA_VISUAL_OFFSET,
    translated_caption_template="{t_record_id} Score={score}",
    nucleotide_caption_template="{g_record_id} Score={score}"
)

OV815488.1_start=69205000_end=69235000_frame=+1 Score=282.0 FragmentStart=0_FragmentEnd=1113
[31mNMR*IFKLK*MASYDAINMIYTVLGHGNISRMGHDSSFYWQIFTINFFLMTPNIYRTYTKNTTRWT**YVLTFENKQHFLNYKGRNKIAVPNISASTQII*HENIHILLHAIFQNSEVDGLNN*WF*LKYKLDISNIYSMQF*IIF*YCKWNNCSKMVSILLRVSITRAFYENFVLQELNIFFILTFNKISEYNVKNTNILQLQHTISTIITKRY*H*KEMDTGKLMKKHLCRVNGGSLPDAAQP*TFTQYKLCKLPVNILYKYCCRVSGRSVHPDLQS*VRGKKLLVKY*L*VHLRASHSRSISSISI**VKFLKLIYLFSTRERNFIFDIIFFL*QITHYFDIGVNIILLSV*DTHIDHEAIYDLDNFGSHLHDSTWRIRGCRTTQTTYSSGNVKFKIFYIFAYE**NSVMNIDQILERFMIINE*TFFFILAGSPRTS*SRSQTNLVSCATTSLCSSATTTSSSKYIKIFYLARDIKSNYLNTVHYLLQRLRREAEP[0m[32mEAEPGSKPFNSAPRPVYVPP[0m[31mP[0m[32mRP[0m[31mPH[0m[32mP[0m[31mVN[0m[32mIL[0m[31mQIYT*HRY*DH*IKYFEFVCYSA*DAKQNQKLNPVQNLLILRHDQFMFPRHDHRTP*IHITNLYLAPLLSPLN*IL*ICLL[0m[32mQ[0m[31mRLRREAEP[0m[32mEAEPGSKPFNPAPRPVYVPPPRP[0m[31mPH[0m[32mP[0m[31mVN[0m[32mIL[0m[31mQIYT*HRY*DH*IKYCEFVCYSA*DAKQNQKLNPVQNLLILRHDQFMFPRHDHRTP*IHITNLYLAPLLSPLN*IL*ICLL[0m[32mQ[0m[31mRLRREAEP

# False Positives

## Overview

In [18]:
SHOW_NUCLEOTIDE = False
SHOW_TRANSLATED = True
SHOW_MISSING_FRAMES = False
NUCL_VISUAL_OFFSET = None
AA_VISUAL_OFFSET = None

fp_predictions = read_predictions_from_fasta(false_positives_file)

visualize_predictions(
    fp_predictions,
    genome_map=genome_map,
    proteome_map=proteome_map,
    show_translated=SHOW_TRANSLATED,
    show_nucleotide=SHOW_NUCLEOTIDE,
    show_missing_frames=SHOW_MISSING_FRAMES,
    nucleotide_offset=NUCL_VISUAL_OFFSET,
    translated_offset=AA_VISUAL_OFFSET,
    translated_caption_template="{t_record_id} Score={score}",
    nucleotide_caption_template="{g_record_id} Score={score}"
)

CAKMYH010000097.1_start=145000_end=175000_frame=+1 Score=16.0 FragmentStart=0_FragmentEnd=48
[31mEAGLKNIKQKYVQQQN[0m[32mFFRFFLFPRSLEISHA[0m[31mHRAYTASRHTHMHFFP[0m
                                                                                                                        
CAKMYH010000097.1_start=295000_end=313107_frame=+2 Score=16.0 FragmentStart=0_FragmentEnd=48
[31mRI*SRSMCVCATQQNF[0m[32mFRFFLFPRSLEISHAH[0m[31mRAYTASRHTHMHLFPR[0m
                                                                                                                        
OV815488.1_start=7825000_end=7855000_frame=-1 Score=13.0 FragmentStart=0_FragmentEnd=46
[31mILLHYEAIDHIFSRFI[0m[32mFNLRMFPRSLE[0m[31mS[0m[32mNR[0m[31mFKYSHYRNLRKLRV*K[0m
                                                                                                                        
OV815488.1_start=40615000_end=40645000_frame=+1 Score=11.0 FragmentStart=0_FragmentEnd=45
[31mASALDARPAAPVQLLC[0m[32

## Detailed

In [25]:
TARGET_RECORD_ID = "CAKMYH010000097.1_start=145000_end=175000_frame=+"
SHOW_NUCLEOTIDE = True
SHOW_TRANSLATED = True
SHOW_MISSING_FRAMES = True
NUCL_VISUAL_OFFSET = np.array((1500, 500))
AA_VISUAL_OFFSET = NUCL_VISUAL_OFFSET // 3


fp_selected_data = predictions_data[predictions_data.t_record_id.str.startswith(TARGET_RECORD_ID)]


visualize_predictions(fp_selected_data, genome_map, proteome_map, show_translated=SHOW_TRANSLATED, show_nucleotide=SHOW_NUCLEOTIDE, show_missing_frames=SHOW_MISSING_FRAMES, nucleotide_offset=NUCL_VISUAL_OFFSET)

CAKMYH010000097.1_start=145000_end=175000_frame=+1 FragmentStart=0_FragmentEnd=681
[31mPGGGGAAASVPRGMPPPSRMISPYPATIRSGTKSRTEYCDQAGEGGGGRCLLISPSVCDT*L*SDKEACPSTELFLIHRAVVRSADIISMKDLPPCLWCMKVDPALTLMGTSPINASASQQACPLRSKVLPPPPPYRGLGVKVTGHDNAPCARSCFPDQLRQRFQVRRKLLEINVGGNVPTNERYVPYHRGREPLH*QLGGTGSAGVATFY*GRSARLRGRNIRRRWCTLANGTTNGPPGVSRTDGSQPRRR*ISPFDLLGFSGLPLNGFTYS*TLSSKFFSTFPHGTCSLSVSWLYLALDGVYHPLRVALSSNPTLWRDPPETRTGRYGPGTLSG*VAPFKMDLDAVRRHGINGSS*TLQFPAAEPRDSVLG*FLFARRY*GNPC*FLFLRLVICLNSAGNLAYSEVVNRYYYMYHQRKKTETARSKEKIYYISSWE*RARSRLRYPSFGYCTSNGSPGGSTGCARVVASIHRRDGAGPGAGREAGLKNIKQKYVQQQN[0m[32mFFRFFLFPRSLEISHA[0m[31mHRAYTASRHTHMHFFPRTHPGS*PETRTTDRCDRSRVVVIQRCHHTIFQRKGINTGLFSSNREN*SFPLPLIAPRQSVFSSRFVDSTRVETLLAERPPVFTLTSVRYY*L*DSAIPSRLEQRLAAEGTVEK*AFPWIGRPARPAGSLVRSTKNATKTWGDRGDSF[0m
CAKMYH010000097.1_start=145000_end=175000_frame=+2 FragmentStart=0_FragmentEnd=9999
[31mQERDSGWALTRIMNLTVNVNKYNPLRAGCYVKLPRILSLKKATISVQSPDNACFAWAVVAALHPVERNADRPSSYPPYASVLNLQGIEFPMSLEQIGKFERQNEISVNVYTFDEEKGST

# Proteins

In [26]:
records = get_augustus_proteins(annotation_file)
for idx, record in enumerate(records):
    print(idx, record.id, record.description)
    print(record.seq)
    print()

FileNotFoundError: [Errno 2] No such file or directory: 'predictions/annotations/Andrena_dorsata.gff'

# Commit proteins

In [21]:
def commit_proteins(proteins_file, new_records, force=False):
    all_proteins = {rec.id: rec for rec in SeqIO.parse(proteins_file, "fasta")}
    for record in new_records:
        if record.id in all_proteins and not force:
            warnings.warn(f"Protein {record.id} is already commited. Skipping...", category=UserWarning)
            continue
        all_proteins[record.id] = record
    return SeqIO.write(list(all_proteins.values()), proteins_file, "fasta")

## Commit auto

In [22]:
subset_to_commit = [0, 1, 2]
records_to_commit = [records[idx] for idx in subset_to_commit]
for idx, record in enumerate(records_to_commit):
    print(idx, record.id, record.description)
    print(record.seq)
    print()

0 RJVV01031371.1_start=0_end=30000_frame=+ Gene_start=1393 Gene_end=3525 Auto
MAKISAFLIVALFAFAALSVQAEPEPARGGKPSRPRPPPIKPRPPHPRLRREAEGLEEDVAEVETDEVEESAVALDRVRREPGRPGNMPRPKPIPIRPRPPHPRLRREAEELEAEDVLPLERLRREAEELEAEDLEADEVLPLERVRREPGRPGNMPRPKPIPIRPRPPHPRLRREAEELEVEDLEADDVLPLERVRREPGRPGNMPRPKPIPIRPRPPHPVSIIFIENE

1 RJVV01183868.1_start=0_end=30000_frame=- Gene_start=28 Gene_end=535 Auto
MPRPKPIPIRPRPPHPRLRREAEELEVEDLEADDVLPLERVRREPGRPGNMPRPKPIPIRPRPPHPVSTIFIENE

2 RJVV01026129.1_start=0_end=30000_frame=+ Gene_start=1395 Gene_end=2603 Auto
MAKISAFLIVALFAFAALSVQAEPEPARGGKPSRPRPPPIKPRPPHPRLRREAEGLEEDVAEVETDEVEESAVALDRVRREPGRPGNMPRPKPIPIRPRPPHPRLRREAEELEAEDVLPLESPSRTRPSRKYATT



In [23]:
commit_proteins("hymenoptera_proteins.faa", records_to_commit)

3

## Commit custom

In [168]:
custom_record = SeqRecord(
    Seq(""),
    id="",
    description=f"Gene_start={} Gene_end={} Manual"
)
custom_record

SyntaxError: f-string: empty expression not allowed (1575694441.py, line 5)

In [None]:
# commit_proteins("hymenoptera_proteins.faa", [custom_record])