In [1]:
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px

import pickle, os, json, gc
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.metrics import precision_recall_curve, auc

os.environ["CUDA_VISIBLE_DEVICES"]="3"

from tensorflow import keras

from os import path

In [2]:
# Execute the shared helper script so that its definitions become available in
# this notebook. The cells below call get_data, test_ensemble and test_cv, none
# of which are defined in the notebook itself — presumably they come from this
# script (TODO confirm).
%run ../scripts/utils_keras.py

# Cross-species testing on exome

In [4]:
# Species compared in this notebook. The evaluation loops below iterate over
# this list twice: once for the species providing the data and once for the
# species whose model is loaded.
species_list = [
    'Apis_mellifera',
    'Bombus_terrestris',
    'Nasonia_vitripennis',
]

In [4]:
# Cross-species evaluation on the exome datasets.
#
# Results are stored as score[model_species][data_species]: the outer key is the
# species whose model is loaded, the inner key is the species whose data is scored.
exome_score_mean = {species: {} for species in species_list}
exome_score_std = {species: {} for species in species_list}

model_name = 'onehot_convo_19'

for data_species in species_list:

    # Load each dataset once and reuse it for every model species.
    X, y, seqid = get_data(f'../data/{data_species}/exome/r500_onehot.cdf')

    # Reclaim memory still held by the previous species' data before evaluating
    # (same housekeeping as the genome cell of this notebook).
    gc.collect()

    for model_species in species_list:

        # Reset Keras global state so that the models loaded for earlier
        # iterations do not accumulate in memory across the loop.
        keras.backend.clear_session()

        # NOTE(review): the models are read from the 'genome' directory although
        # the data being scored is exome — confirm this is intended.
        model_path = f'../models/{model_species}/genome/r500_{model_name}_full'

        if model_species == data_species:
            # Model and data come from the same species: test_cv additionally
            # receives seqid (presumably to keep evaluation data separate from
            # what each model was trained on — confirm in utils_keras.py).
            s = test_cv(model_path, (X, y, seqid))
        else:
            # Different species: score the data with test_ensemble.
            s = test_ensemble(model_path, (X, y))

        exome_score_mean[model_species][data_species] = s.mean()
        exome_score_std[model_species][data_species] = s.std()
            

exome/r500_onehot.cdf: size: 358084, pos: 56102, baseline: 0.16


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

mean: 0.93 std:0.01


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

mean: 0.90 std:0.00


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

mean: 0.81 std:0.01
exome/r500_onehot.cdf: size: 395072, pos: 37646, baseline: 0.10


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

mean: 0.79 std:0.01


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

mean: 0.85 std:0.02


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

mean: 0.73 std:0.01
exome/r500_onehot.cdf: size: 510918, pos: 57076, baseline: 0.11


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

mean: 0.77 std:0.02


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

mean: 0.79 std:0.01


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

mean: 0.87 std:0.01


In [5]:
# Mean exome scores as a table: rows are keyed by model species (labelled
# 'train'), columns by data species (labelled 'test').
df = (
    pd.DataFrame.from_dict(exome_score_mean, orient='index')
    .sort_index()
    .rename_axis(index='train', columns='test')
)

# Round once, export the rounded table to LaTeX, then display it as the cell output.
df_rounded = df.round(2)
df_rounded.to_latex('cross_species_exome.tex', caption='Cross species exome')
df_rounded

test,Apis_mellifera,Bombus_terrestris,Nasonia_vitripennis
train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apis_mellifera,0.93,0.79,0.77
Bombus_terrestris,0.9,0.85,0.79
Nasonia_vitripennis,0.81,0.73,0.87


In [6]:
# Standard deviation of the exome scores: rows are keyed by model species
# (labelled 'train'), columns by data species (labelled 'test').
df = (
    pd.DataFrame.from_dict(exome_score_std, orient='index')
    .sort_index()
    .rename_axis(index='train', columns='test')
)

# LaTeX export of the std table is currently disabled:
# df.round(2).to_latex('cross_species_exon_std.tex', caption='Cross species exome std')
df.round(2)

test,Apis_mellifera,Bombus_terrestris,Nasonia_vitripennis
train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apis_mellifera,0.01,0.01,0.02
Bombus_terrestris,0.0,0.02,0.01
Nasonia_vitripennis,0.01,0.01,0.01


# Cross-species testing on genome

In [5]:
# Cross-species evaluation on the genome datasets.
#
# Results are stored as score[model_species][data_species]: the outer key is the
# species whose model is loaded, the inner key is the species whose data is scored.
genome_score_mean = {species: {} for species in species_list}
genome_score_std = {species: {} for species in species_list}

model_name = 'onehot_convo_19'

for data_species in species_list:

    # Load each dataset once and reuse it for every model species.
    X, y, seqid = get_data(f'../data/{data_species}/genome/r500_onehot.cdf')

    # Run the garbage collector right after the new dataset has replaced the
    # previous one.
    gc.collect()

    for model_species in species_list:

        # Reset Keras global state before loading the next set of models.
        keras.backend.clear_session()

        model_path = f'../models/{model_species}/genome/r500_{model_name}_full'

        if model_species == data_species:
            # Model and data come from the same species: test_cv additionally
            # receives seqid (presumably to keep evaluation data separate from
            # what each model was trained on — confirm in utils_keras.py).
            s = test_cv(model_path, (X, y, seqid))
        else:
            # Different species: score the data with test_ensemble, using an
            # explicit batch size of 1024.
            s = test_ensemble(model_path, (X, y), batch_size=1024)

        genome_score_mean[model_species][data_species] = s.mean()
        genome_score_std[model_species][data_species] = s.std()
            

genome/r500_onehot.cdf: size: 3057492, pos: 70592, baseline: 0.02


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

mean: 0.86 std:0.04


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

mean: 0.78 std:0.01


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

mean: 0.61 std:0.01
genome/r500_onehot.cdf: size: 3241634, pos: 46664, baseline: 0.01


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

mean: 0.74 std:0.01


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

mean: 0.81 std:0.02


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

mean: 0.65 std:0.01
genome/r500_onehot.cdf: size: 2511364, pos: 75342, baseline: 0.03


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

mean: 0.60 std:0.02


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

mean: 0.64 std:0.01


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

mean: 0.78 std:0.04


In [6]:
# Mean genome scores as a table: rows are keyed by model species (labelled
# 'train'), columns by data species (labelled 'test').
df = (
    pd.DataFrame.from_dict(genome_score_mean, orient='index')
    .sort_index()
    .rename_axis(index='train', columns='test')
)

# Round once, export the rounded table to LaTeX, then display it as the cell output.
df_rounded = df.round(2)
df_rounded.to_latex('cross_species_genome.tex', caption='Cross species genome')
df_rounded

test,Apis_mellifera,Bombus_terrestris,Nasonia_vitripennis
train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apis_mellifera,0.86,0.74,0.6
Bombus_terrestris,0.78,0.81,0.64
Nasonia_vitripennis,0.61,0.65,0.78


In [7]:
# Standard deviation of the genome scores: rows are keyed by model species
# (labelled 'train'), columns by data species (labelled 'test').
df = (
    pd.DataFrame.from_dict(genome_score_std, orient='index')
    .sort_index()
    .rename_axis(index='train', columns='test')
)

# LaTeX export of the std table is currently disabled. The disabled line had
# been copied with the exome file name and caption; corrected here for genome:
# df.round(2).to_latex('cross_species_genome_std.tex', caption='Cross species genome std')
df.round(2)

test,Apis_mellifera,Bombus_terrestris,Nasonia_vitripennis
train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apis_mellifera,0.04,0.01,0.02
Bombus_terrestris,0.01,0.02,0.01
Nasonia_vitripennis,0.01,0.01,0.04


# Cross-species testing on methylome

In [4]:
# Cross-species evaluation on the methylome datasets.
#
# Results are stored as score[model_species][data_species]: the outer key is the
# species whose model is loaded, the inner key is the species whose data is scored.
methylome_score_mean = {species: {} for species in species_list}
methylome_score_std = {species: {} for species in species_list}

model_name = 'onehot_convo_19'
# r is not referenced anywhere else in this cell; the assignment is kept because
# it is a notebook-global that other cells may read.
r = 500

for data_species in species_list:

    # Load each dataset once and reuse it for every model species.
    X, y, seqid = get_data(f'../data/{data_species}/methylome/r500_onehot.cdf', mode='methylome')

    # Reclaim memory still held by the previous species' data before evaluating
    # (same housekeeping as the genome cell of this notebook).
    gc.collect()

    for model_species in species_list:

        # Reset Keras global state so that the models loaded for earlier
        # iterations do not accumulate in memory across the loop.
        keras.backend.clear_session()

        model_path = f'../models/{model_species}/methylome/r500_{model_name}_full'

        if model_species == data_species:
            # Model and data come from the same species: test_cv additionally
            # receives seqid (presumably to keep evaluation data separate from
            # what each model was trained on — confirm in utils_keras.py).
            s = test_cv(model_path, (X, y, seqid))
        else:
            # Different species: score the data with test_ensemble.
            s = test_ensemble(model_path, (X, y))

        methylome_score_mean[model_species][data_species] = s.mean()
        methylome_score_std[model_species][data_species] = s.std()
            

methylome/r500_onehot.cdf: size: 139614, pos: 71886, baseline: 0.51


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

mean: 0.82 std:0.02


HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))

mean: 0.80 std:0.00


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

mean: 0.75 std:0.01
methylome/r500_onehot.cdf: size: 98192, pos: 52838, baseline: 0.54


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

mean: 0.79 std:0.00


HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))

mean: 0.80 std:0.02


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

mean: 0.75 std:0.00
methylome/r500_onehot.cdf: size: 685252, pos: 82216, baseline: 0.12


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

mean: 0.81 std:0.00


HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))

mean: 0.80 std:0.00


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

mean: 0.85 std:0.01


In [5]:
# Mean methylome scores as a table: rows are keyed by model species (labelled
# 'train'), columns by data species (labelled 'test').
df = (
    pd.DataFrame.from_dict(methylome_score_mean, orient='index')
    .sort_index()
    .rename_axis(index='train', columns='test')
)

# Round once, export the rounded table to LaTeX, then display it as the cell output.
df_rounded = df.round(2)
df_rounded.to_latex('cross_species_methylome.tex', caption='Cross species methylome')
df_rounded

test,Apis_mellifera,Bombus_terrestris,Nasonia_vitripennis
train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apis_mellifera,0.82,0.79,0.81
Bombus_terrestris,0.8,0.8,0.8
Nasonia_vitripennis,0.75,0.75,0.85


In [6]:
# Standard deviation of the methylome scores: rows are keyed by model species
# (labelled 'train'), columns by data species (labelled 'test').
df = (
    pd.DataFrame.from_dict(methylome_score_std, orient='index')
    .sort_index()
    .rename_axis(index='train', columns='test')
)

# LaTeX export of the std table is currently disabled. The disabled line had
# been copied with the exome file name and caption; corrected here for methylome:
# df.round(2).to_latex('cross_species_methylome_std.tex', caption='Cross species methylome std')
df.round(2)

test,Apis_mellifera,Bombus_terrestris,Nasonia_vitripennis
train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apis_mellifera,0.02,0.0,0.0
Bombus_terrestris,0.0,0.02,0.0
Nasonia_vitripennis,0.01,0.0,0.01
