### Objective
Compare the fitness scores of TrpB variants measured by a pooled fitness assay with the scores predicted by the TranceptEVE model.

In [3]:
import pandas as pd
import numpy as np
import holoviews as hv
import hvplot.pandas
import sys
import bokeh
from scipy.stats import pearsonr
from scipy.stats import spearmanr

# Render holoviews/hvplot figures with the bokeh backend, inline in the notebook.
hv.extension('bokeh')
bokeh.io.output_notebook()

# Location of the project-local `common` helper module.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
path_to_common = '/home/redwood/Desktop/data_tools/maple/rules/utils'
# Only extend sys.path once so that re-running this cell does not accumulate duplicate entries.
if path_to_common not in sys.path:
    sys.path.append(path_to_common)
from common import export_svg_plots, conspicuous_mutations

# Shared font sizes applied to every element type listed below.
defaults = {
    'fontsize': dict(title=20, labels=18, ticks=16, legend=18)
}

hv.opts.defaults(
    hv.opts.Curve(**defaults),
    hv.opts.Scatter(**defaults),
    hv.opts.HexTiles(**defaults)
    # Add other elements as needed
)

# Load the model predictions; one row per scored variant (see the displayed columns below).
predictions_csv = 'tmtrpb_muts_to_score.csv'
predictions_raw = pd.read_csv(predictions_csv)

predictions_raw

Unnamed: 0,mutated_sequence,avg_score_L_to_R,avg_score_R_to_L,avg_score,mutant
0,MKGYFGPYGGQYVPEILMPALKELEAVYEEIMKDESFWKEFNDLLR...,-0.064334,-0.062722,-0.063528,E22K:A27V:F57L:A58T:N167D:R171H:A321T:L379V
1,MKGYFGPYGGQYMPEILMPALEELEAAYEEIMKDESFWKEFNDLLR...,-0.112213,-0.108218,-0.110216,V13M:P52Q:A94T:T106A:G123D:N167D:V183M:V245A:A...
2,MKGYFGPYGGQYVPEILMPALEELEAAYEEIMKDESFWKEFNDLLR...,-0.100681,-0.097335,-0.099008,G50D:P54T:V91I:A94V:R171H:V183M:K278R:V281I:Y3...
3,MKGYFGPYGGQYVPEILMPALEELETAYEEIMKDESFRKEFNDLLC...,-0.053658,-0.052331,-0.052995,A26T:W38R:R46C:A94V:V183M:F240L:A321G:R385H
4,MKGYFGPYGGQYVPEILMPALEELEAAYEEIMKDESFWKEFNDLLR...,-0.046803,-0.046007,-0.046405,A94T:G123D:N167D:V183M:V281I:F331L:K359T
...,...,...,...,...,...
46391,MKGYFGPYGGQYVSEILMSTLVELEAAYEEIVEDESFWKEFNDLLH...,-0.362277,-0.353126,-0.357702,P14S:P19S:A20T:E22V:M32V:K33E:R46H:G50D:P54T:A...
46392,MKGYFGPYGGQHVSEILMPTLEELEAVYEEIMKGESFWKEFNDLLC...,-0.320426,-0.313836,-0.317131,Y12H:P14S:A20T:A27V:D34G:R46C:G50V:R59S:G66D:A...
46393,MKGYFGPYGGQYVSEILMSTLEELEAVYEEIMKDGSFWKEFNDLLH...,-0.342881,-0.327898,-0.335390,P14S:P19S:A20T:A27V:E35G:R46H:G50D:P54T:G66D:R...
46394,MKGYFGPYGGQYVPEILMPTLEELEAAYEEIMKDESFWKEYNDLLY...,-0.183344,-0.184319,-0.183831,A20T:F41Y:R46Y:D47N:P52T:F57Y:R60H:G66A:R68H:A...


In [4]:
# The `mutant` column is a ':'-separated list of substitutions, so the number of
# nonsynonymous mutations is the number of ':' separators plus one.
# Sorting by (count, mutant) puts the rows in a deterministic order, and the
# averaged model score is renamed to make its origin explicit.
predictions = (
    predictions_raw
    .assign(AA_substitutions_nonsynonymous_count=lambda frame: frame['mutant'].str.count(':').add(1))
    .sort_values(by=['AA_substitutions_nonsynonymous_count', 'mutant'])
    .rename({'avg_score': 'tranceptEVE_score'}, axis='columns')
)
predictions

Unnamed: 0,mutated_sequence,avg_score_L_to_R,avg_score_R_to_L,tranceptEVE_score,mutant,AA_substitutions_nonsynonymous_count
1628,MKGYFGPYGGQYVPEILMPTLEELEAAYEEIMKDESFWKEFNDLLR...,-0.030438,-0.031293,-0.030865,A20T:E214K:V245A:L267S:I271V:Q288R,6
6029,MKGYFGPYGGQYVPEILMPVLEELEAAYEEIMKDESFRKEFNDLLR...,-0.050370,-0.049755,-0.050063,A20V:W38R:G50S:A118T:V142I:F331L,6
712,MKGYFGPYGGQYVPEILMPALEELEAAYEEIMKDESFWKEFNDLLR...,-0.060673,-0.060771,-0.060722,A58T:G158D:V250I:H262Y:I271V:T292S,6
31453,MKGYFGPYGGQYVPEILMPALEELEAAYEEIMKDESFWKEFNGLLR...,-0.055501,-0.055541,-0.055521,D43G:G89S:R171H:A321T:R336H:K359E,6
6034,MKGYFGPYDGQYVPEILMPTLEELEAAYEEIMKDESFWKEFNDLLR...,-0.106535,-0.101176,-0.103855,G9D:A20T:A58T:R59P:A233T:F240S,6
...,...,...,...,...,...,...
31328,MKGYFGPYGGQYMSGILMPTLEELEDAYEEIMKDESFWKEFNGLLR...,-0.292953,-0.293284,-0.293118,V13M:P14S:E15G:A20T:A26D:D43G:P52T:F57Y:A58T:R...,36
46347,MKGYFGPYGGQYMSEILIPTLEELETAYEEIMKDEFFWKEYNDLLY...,-0.291574,-0.286946,-0.289260,V13M:P14S:M18I:A20T:A26T:S36F:F41Y:R46Y:D47N:P...,36
31351,MKGYFGPYGGQYVSEILMSTLEELEAAYEEIMKDGSFWKEYNDLLY...,-0.326805,-0.316314,-0.321559,P14S:P19S:A20T:E35G:F41Y:R46Y:D47N:P52T:F57Y:R...,37
21381,MKGYFGPYGGQYVPEILMSTLEELEAAYEEIMKDESFWKEYNDLLY...,-0.355419,-0.354255,-0.354837,P19S:A20T:F41Y:R46Y:D47N:P52T:F57Y:R60H:G66A:R...,38


In [5]:
# Load the genotypes table, which also carries the per-sample mean enrichment scores.
genotypes_csv = 'dashboard-all-seqs-2023-10-10_16-43-33_genotypes.csv'
fitness_scores_raw = pd.read_csv(genotypes_csv)

In [6]:

# Drop rows with values in 'NT_insertions' or 'NT_deletions' columns
has_no_indel = fitness_scores_raw['NT_insertions'].isnull() & fitness_scores_raw['NT_deletions'].isnull()
fitness_scores = fitness_scores_raw[has_no_indel]

# Drop anything that gets changed to a stop codon ('*').
# The match is a literal substring test (regex=False), so no escaping is needed, and
# na=False treats a missing substitution list as "no stop codon" instead of producing
# NaN in the mask (which cannot be negated with `~`).
has_stop_codon = fitness_scores['AA_substitutions_nonsynonymous'].str.contains('*', regex=False, na=False)
fitness_scores = (fitness_scores[~has_stop_codon].reset_index(drop=True)
                                .sort_values(['AA_substitutions_nonsynonymous_count', 'AA_substitutions_nonsynonymous'], ascending=True)  # sorting values so they will match the predicted score order
)
fitness_scores

Unnamed: 0.1,Unnamed: 0,genotype_ID,count,NT_substitutions,NT_substitutions_count,NT_insertions,NT_deletions,NT_insertion_length,NT_deletion_length,AA_substitutions_nonsynonymous,...,mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I,mean_enrichment_score_TrpB-enrichment-indole_noTrp.400I,mean_enrichment_score_TrpB-enrichment-analog_5AzI-1,mean_enrichment_score_TrpB-enrichment-analog_5AzI-10,mean_enrichment_score_TrpB-enrichment-analog_5AzI-100,mean_enrichment_score_TrpB-enrichment-analog_5MeOI-1,mean_enrichment_score_TrpB-enrichment-analog_5MeOI-10,mean_enrichment_score_TrpB-enrichment-analog_5MeOI-100,NT_muts_of_interest,AA_muts_of_interest
1622,4456,4752,1,"C36T, C48T, G58A, G162A, G165A, C186T, C195T, ...",28,,,0,0,"A20T, E214K, V245A, L267S, I271V, Q288R",...,-2.123401,-3.617212,0.015387,0.823793,1.462372,-1.266429,0.635984,0.558828,none,none
6026,16808,65,1,"G33A, C59T, T112A, G148A, T219C, G276A, T303C,...",21,,,0,0,"A20V, W38R, G50S, A118T, V142I, F331L",...,-2.559375,-1.615823,,,,,,,none,none
709,1998,2138,1,"G33A, A69G, C138T, T150G, G165A, G172A, C186T,...",25,,,0,0,"A58T, G158D, V250I, H262Y, I271V, T292S",...,-15.996682,-9.616954,-3.433617,-4.720690,-2.486782,-3.872552,-3.915236,-2.178714,none,none
31261,65910,197,1,"G21A, C24T, G63A, A128G, C129T, A174G, G265A, ...",27,,,0,0,"D43G, G89S, R171H, A321T, R336H, K359E",...,-6.940945,-16.916963,-4.885980,-2.236635,-2.555700,-4.814089,-3.314509,-1.489141,none,none
6031,16820,77,1,"C24T, G26A, G58A, C138T, G172A, G176C, G360A, ...",21,,,0,0,"G9D, A20T, A58T, R59P, A233T, F240S",...,-16.921055,-25.481235,-4.400589,-2.095781,-2.352674,-2.593603,-4.245725,-2.685773,none,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31136,65662,19345,1,"G37A, C40T, G42A, A44G, G58A, C70T, C77A, A128...",58,,,0,0,"V13M, P14S, E15G, A20T, A26D, D43G, P52T, F57Y...",...,-12.988290,-9.147510,-4.486402,-2.068880,-2.060324,-2.140326,-3.601725,-2.216853,none,none
46347,100162,19739,1,"G37A, C40T, G42A, G54A, G58A, G76A, C107T, T12...",59,,,0,0,"V13M, P14S, M18I, A20T, A26T, S36F, F41Y, R46Y...",...,-14.962748,-7.983519,-5.660954,-3.216728,-4.471864,-3.219503,-3.665834,-2.977119,none,none
31159,65714,19401,1,"G33A, C36T, C40T, C55T, G58A, G81A, A104G, T12...",61,,,0,0,"P14S, P19S, A20T, E35G, F41Y, R46Y, D47N, P52T...",...,-19.817221,-15.718157,-5.079873,-4.813121,-3.204259,-5.007982,-3.698147,-1.930562,none,none
21379,47726,16681,1,"G21A, G33A, C36T, C55T, G58A, T122A, G132A, C1...",52,,,0,0,"P19S, A20T, F41Y, R46Y, D47N, P52T, F57Y, R60H...",...,-13.560221,-18.396763,-5.633955,-3.139687,-1.968127,-2.812797,-5.055442,-2.320643,none,none


Now I have to add in the fitness and prediction score for TmTriple, which was not included in the above dataset because it was spiked into the library after the genotyping NGS but before the fitness assay NGS. I'll also add in one bit of information that I plan on using, which is the number of nonsynonymous AA mutations. All other information that is in the genotypes output of the maple pipeline I won't bother adding in at this point.

In [7]:
# Long-format mean enrichment scores for the indole enrichment samples.
indole_scores = pd.read_csv('../enrichment/TrpB-enrichment-indole_enrichment-scores-mean.csv')

# Keep only the TmTriple rows and reshape them into a single wide row with one column per sample.
is_tmtriple = indole_scores['UMI'] == 'TmTriple'
tmtriple_wide = (
    indole_scores[is_tmtriple]
    .reset_index(drop=True)
    .pivot(index='UMI', columns='sample', values='mean_enrichment_score')
    .reset_index(drop=True)
)

# Prefix the sample names so the columns match those of the genotypes table, and
# record the number of nonsynonymous AA mutations (3 for TmTriple).
score_column_prefix = 'mean_enrichment_score_TrpB-enrichment-indole_'
tmtriple_scores = (
    tmtriple_wide
    .rename(columns=lambda sample_name: score_column_prefix + sample_name)
    .assign(AA_substitutions_nonsynonymous_count=3)
)
tmtriple_scores

sample,mean_enrichment_score_TrpB-enrichment-indole_Trp+,mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I,mean_enrichment_score_TrpB-enrichment-indole_noTrp.400I,AA_substitutions_nonsynonymous_count
0,-0.040913,-17.645965,-16.245238,3


In [8]:
# Model prediction for TmTriple, with the averaged score renamed to match `predictions`.
tmtriple_prediction_raw = pd.read_csv('TmTriple.csv')
tmtriple_prediction = tmtriple_prediction_raw.rename({'avg_score': 'tranceptEVE_score'}, axis='columns')
tmtriple_prediction

Unnamed: 0,mutated_sequence,avg_score_L_to_R,avg_score_R_to_L,tranceptEVE_score,mutant
0,MKGYFGPYGGQYVPEILMGALEELEAAYEEIMKDESFWKEFNDLLR...,-0.015803,-0.016603,-0.016203,P19G:I69V:T292S


In [9]:
# Place the measured TmTriple scores and its predicted score side by side in a single row.
tmtriple_parts = [tmtriple_scores, tmtriple_prediction]
tmtriple_combined = pd.concat(tmtriple_parts, axis='columns', sort=False)
tmtriple_combined

Unnamed: 0,mean_enrichment_score_TrpB-enrichment-indole_Trp+,mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I,mean_enrichment_score_TrpB-enrichment-indole_noTrp.400I,AA_substitutions_nonsynonymous_count,mutated_sequence,avg_score_L_to_R,avg_score_R_to_L,tranceptEVE_score,mutant
0,-0.040913,-17.645965,-16.245238,3,MKGYFGPYGGQYVPEILMGALEELEAAYEEIMKDESFWKEFNDLLR...,-0.015803,-0.016603,-0.016203,P19G:I69V:T292S


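Combine the fitness scores, their predictions, and the TmTriple row. The fitness and prediction tables are joined by row position, which relies on both having been sorted identically above.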

In [10]:
# Row-align the measured fitness scores with the model predictions. Both frames were
# sorted by (mutation count, mutation list) earlier, so row i of each is expected to
# describe the same variant.
fitness_aligned = fitness_scores.reset_index(drop=True)
predictions_aligned = predictions.reset_index(drop=True)

# Guard the positional join: the fitness table lists substitutions as 'A20T, E214K'
# while the predictions use 'A20T:E214K'. Normalise the separator and require an exact,
# row-by-row match so a mis-sorted or mis-filtered table cannot silently pair a
# variant with another variant's prediction.
fitness_keys = fitness_aligned['AA_substitutions_nonsynonymous'].str.replace(', ', ':', regex=False)
if len(fitness_aligned) != len(predictions_aligned):
    raise ValueError(
        f'fitness_scores has {len(fitness_aligned)} rows but predictions has {len(predictions_aligned)}; '
        'cannot join them by position'
    )
if not (fitness_keys.to_numpy() == predictions_aligned['mutant'].to_numpy()).all():
    raise ValueError('fitness_scores and predictions are not in the same variant order; cannot join them by position')

# The mutation count already exists in the fitness table, so drop the duplicate column from the predictions.
combined = pd.concat([fitness_aligned, predictions_aligned.drop(columns='AA_substitutions_nonsynonymous_count')], axis=1, sort=False)
# Append the TmTriple row (columns it lacks become NaN) and order by mutation count.
combined = (pd.concat([combined, tmtriple_combined], axis=0, sort=False)
            .sort_values(by='AA_substitutions_nonsynonymous_count', ascending=True))
combined

Unnamed: 0.1,Unnamed: 0,genotype_ID,count,NT_substitutions,NT_substitutions_count,NT_insertions,NT_deletions,NT_insertion_length,NT_deletion_length,AA_substitutions_nonsynonymous,...,mean_enrichment_score_TrpB-enrichment-analog_5MeOI-1,mean_enrichment_score_TrpB-enrichment-analog_5MeOI-10,mean_enrichment_score_TrpB-enrichment-analog_5MeOI-100,NT_muts_of_interest,AA_muts_of_interest,mutated_sequence,avg_score_L_to_R,avg_score_R_to_L,tranceptEVE_score,mutant
0,,,,,,,,,,,...,,,,,,MKGYFGPYGGQYVPEILMGALEELEAAYEEIMKDESFWKEFNDLLR...,-0.015803,-0.016603,-0.016203,P19G:I69V:T292S
5,32978.0,121.0,1.0,"G33A, C36T, C136A, T201C, G228A, C353T, C361T,...",22.0,,,0.0,0.0,"R46S, A118V, V127A, L267Q, F331L, K354E",...,-2.042110,-2.893679,0.670953,none,none,MKGYFGPYGGQYVPEILMPALEELEAAYEEIMKDESFWKEFNDLLS...,-0.017304,-0.017713,-0.017508,R46S:A118V:V127A:L267Q:F331L:K354E
4,16820.0,77.0,1.0,"C24T, G26A, G58A, C138T, G172A, G176C, G360A, ...",21.0,,,0.0,0.0,"G9D, A20T, A58T, R59P, A233T, F240S",...,-2.593603,-4.245725,-2.685773,none,none,MKGYFGPYDGQYVPEILMPTLEELEAAYEEIMKDESFWKEFNDLLR...,-0.106535,-0.101176,-0.103855,G9D:A20T:A58T:R59P:A233T:F240S
0,4456.0,4752.0,1.0,"C36T, C48T, G58A, G162A, G165A, C186T, C195T, ...",28.0,,,0.0,0.0,"A20T, E214K, V245A, L267S, I271V, Q288R",...,-1.266429,0.635984,0.558828,none,none,MKGYFGPYGGQYVPEILMPTLEELEAAYEEIMKDESFWKEFNDLLR...,-0.030438,-0.031293,-0.030865,A20T:E214K:V245A:L267S:I271V:Q288R
2,1998.0,2138.0,1.0,"G33A, A69G, C138T, T150G, G165A, G172A, C186T,...",25.0,,,0.0,0.0,"A58T, G158D, V250I, H262Y, I271V, T292S",...,-3.872552,-3.915236,-2.178714,none,none,MKGYFGPYGGQYVPEILMPALEELEAAYEEIMKDESFWKEFNDLLR...,-0.060673,-0.060771,-0.060722,A58T:G158D:V250I:H262Y:I271V:T292S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46392,100162.0,19739.0,1.0,"G37A, C40T, G42A, G54A, G58A, G76A, C107T, T12...",59.0,,,0.0,0.0,"V13M, P14S, M18I, A20T, A26T, S36F, F41Y, R46Y...",...,-3.219503,-3.665834,-2.977119,none,none,MKGYFGPYGGQYMSEILIPTLEELETAYEEIMKDEFFWKEYNDLLY...,-0.291574,-0.286946,-0.289260,V13M:P14S:M18I:A20T:A26T:S36F:F41Y:R46Y:D47N:P...
46387,32788.0,17407.0,1.0,"G33A, C36T, C55T, G58A, A97G, T122A, G132A, C1...",51.0,,,0.0,0.0,"P19S, A20T, K33E, F41Y, R46Y, D47N, F57Y, R60H...",...,-3.612615,-3.003068,-2.668755,none,none,MKGYFGPYGGQYVPEILMSTLEELEAAYEEIMEDESFWKEYNDLLY...,-0.340041,-0.339786,-0.339914,P19S:A20T:K33E:F41Y:R46Y:D47N:F57Y:R60H:G66A:R...
46393,65714.0,19401.0,1.0,"G33A, C36T, C40T, C55T, G58A, G81A, A104G, T12...",61.0,,,0.0,0.0,"P14S, P19S, A20T, E35G, F41Y, R46Y, D47N, P52T...",...,-5.007982,-3.698147,-1.930562,none,none,MKGYFGPYGGQYVSEILMSTLEELEAAYEEIMKDGSFWKEYNDLLY...,-0.326805,-0.316314,-0.321559,P14S:P19S:A20T:E35G:F41Y:R46Y:D47N:P52T:F57Y:R...
46395,98456.0,17809.0,1.0,"T34C, G37A, T53C, G58A, C136A, G139A, C161T, G...",51.0,,,0.0,0.0,"Y12H, V13M, M18T, A20T, R46S, D47N, P54L, R59H...",...,,,,none,none,MKGYFGPYGGQHMPEILTPTLEELEAAYEEIMKDESFWKEFNDLLS...,-0.310380,-0.309777,-0.310078,Y12H:V13M:M18T:A20T:R46S:D47N:P54L:R59H:R68H:I...


In [11]:
# Show the genotype columns next to the prediction's `mutant` column to eyeball that rows are paired correctly.
alignment_columns = ['NT_substitutions', 'AA_substitutions_nonsynonymous', 'mutant']
combined.sort_values(by='AA_substitutions_nonsynonymous').loc[:, alignment_columns]

Unnamed: 0,NT_substitutions,AA_substitutions_nonsynonymous,mutant
1257,"G21A, C195T, T198A, G352A, C353T, T365C, C402T...","A118I, F122S, R171C, V188I, N199S, G256C, K270...",A118I:F122S:R171C:V188I:N199S:G256C:K270R:W286...
72,"G33A, G42A, G51A, T126C, C258T, G276A, G352A, ...","A118I, F122S, R171H, E214Q, D242N, Y273N, Y319...",A118I:F122S:R171H:E214Q:D242N:Y273N:Y319D:R336...
2542,"G33A, G39A, G42A, G72A, G132A, T153C, T201C, T...","A118I, F122S, V142I, V153I, R171S, T175I, A234...",A118I:F122S:V142I:V153I:R171S:T175I:A234G:T259...
9622,"G33A, G42A, G51A, T126C, C195T, T201C, G228A, ...","A118I, N167D, T175N, Y181H, V183A, I196V, G228...",A118I:N167D:T175N:Y181H:V183A:I196V:G228D:I241...
2543,"G51A, T126C, C171T, G243A, G352A, G368A, A453G...","A118T, G123D, R171S, V183M, E214Q, V250I, G260...",A118T:G123D:R171S:V183M:E214Q:V250I:G260D:L282...
...,...,...,...
21397,"T22C, G33A, T34C, C40T, G42A, G58A, G63C, C70T...","Y8H, Y12H, P14S, A20T, L45M, R59S, R60H, A104V...",Y8H:Y12H:P14S:A20T:L45M:R59S:R60H:A104V:A118T:...
44226,"T22C, T34C, G37A, G58A, T126G, T150A, G162C, G...","Y8H, Y12H, V13M, A20T, N42K, A58T, R68H, L93M,...",Y8H:Y12H:V13M:A20T:N42K:A58T:R68H:L93M:A118V:G...
13118,"T22C, G33A, G42A, T82C, T110A, T159G, G271A, G...","Y8H, Y28H, F37Y, V91I, A94T, A118V, L147M, A16...",Y8H:Y28H:F37Y:V91I:A94T:A118V:L147M:A165V:N167...
33483,"T22C, C24A, C30T, G33A, G37A, C40T, G42A, A98T...","Y8Q, V13M, P14S, K33I, F37Y, R101H, L147M, V15...",Y8Q:V13M:P14S:K33I:F37Y:R101H:L147M:V152I:N167...


In [12]:
# List every column available in the combined table (used to pick the columns exported below).
combined.columns

Index(['Unnamed: 0', 'genotype_ID', 'count', 'NT_substitutions',
       'NT_substitutions_count', 'NT_insertions', 'NT_deletions',
       'NT_insertion_length', 'NT_deletion_length',
       'AA_substitutions_nonsynonymous', 'AA_substitutions_synonymous',
       'AA_substitutions_nonsynonymous_count', 'AA_insertions', 'AA_deletions',
       'barcode(s)', 'barcode_group', 'NT_PaCMAP1', 'NT_PaCMAP2', 'AA_PaCMAP1',
       'AA_PaCMAP2', 'mean_enrichment_score_TrpB-enrichment-indole_Trp+',
       'mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I',
       'mean_enrichment_score_TrpB-enrichment-indole_noTrp.400I',
       'mean_enrichment_score_TrpB-enrichment-analog_5AzI-1',
       'mean_enrichment_score_TrpB-enrichment-analog_5AzI-10',
       'mean_enrichment_score_TrpB-enrichment-analog_5AzI-100',
       'mean_enrichment_score_TrpB-enrichment-analog_5MeOI-1',
       'mean_enrichment_score_TrpB-enrichment-analog_5MeOI-10',
       'mean_enrichment_score_TrpB-enrichment-analog_5MeOI-100',

Save the combined dataframe as a csv

In [13]:
# Columns to export: genotype identity, mutation lists and counts, the three indole
# enrichment scores, and the mutated sequence with its predicted score.
export_columns = [
    'genotype_ID',
    'NT_substitutions',
    'NT_substitutions_count',
    'AA_substitutions_nonsynonymous',
    'AA_substitutions_nonsynonymous_count',
    'mean_enrichment_score_TrpB-enrichment-indole_Trp+',
    'mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I',
    'mean_enrichment_score_TrpB-enrichment-indole_noTrp.400I',
    'mutated_sequence',
    'tranceptEVE_score',
]
combined[export_columns].to_csv('tmtrpb_fitness_and_predicted_scores.csv', index=False)

Compute the correlation (Pearson's r) between fitness scores and predicted scores

In [14]:
# Pearson correlation between the measured noTrp.25I enrichment score and the predicted score.
measured_score = combined['mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I']
predicted_score = combined['tranceptEVE_score']
pearsonr(measured_score, predicted_score)

PearsonRResult(statistic=0.11639294388607747, pvalue=1.2192589519911162e-139)

In [15]:
# Rank (Spearman) correlation between the measured noTrp.25I enrichment score and the predicted score.
measured_score = combined['mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I']
predicted_score = combined['tranceptEVE_score']
spearmanr(measured_score, predicted_score)

SignificanceResult(statistic=0.15142179000598086, pvalue=5.057259910760301e-236)

In [16]:
# Restrict to variants whose noTrp.25I enrichment score is above -5, then
# recompute the Pearson correlation on that subset only.
above_threshold = combined['mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I'] > -5
filtered = combined[above_threshold]
filtered_measured = filtered['mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I']
filtered_predicted = filtered['tranceptEVE_score']
pearsonr(filtered_measured, filtered_predicted)

PearsonRResult(statistic=0.03314833851755391, pvalue=0.011653270295195037)

In [17]:
# Density of all variants (measured vs predicted score), with TmTriple overlaid as a black point.
score_axes = dict(x='mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I', y='tranceptEVE_score')
variant_density = combined.hvplot.hexbin(logz=True, cmap='bmy_r', height=500, width=600, gridsize=40, **score_axes)
tmtriple_marker = tmtriple_combined.hvplot.scatter(c='black', **score_axes)
prediction_v_score = variant_density * tmtriple_marker

# Hand the figure to the project helper for export, then display it inline.
export_svg_plots([prediction_v_score], 'prediction_v_score.html')
prediction_v_score

Is the correlation stronger if you set all scores less than -5.5 to -5.5?

In [18]:
# Pearson correlation after flooring the measured score: every value below -5.5 is
# replaced by -5.5 (via clip) before correlating with the predicted score.
floored_score = combined['mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I'].clip(lower=-5.5)
predicted_score = combined['tranceptEVE_score']
pearsonr(floored_score, predicted_score)


PearsonRResult(statistic=0.30299314902762997, pvalue=0.0)

What about Spearman's rank correlation?

In [19]:
# Spearman correlation using the same -5.5 floor on the measured score.
floored_score = combined['mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I'].clip(lower=-5.5)
predicted_score = combined['tranceptEVE_score']
spearmanr(floored_score, predicted_score)

SignificanceResult(statistic=0.3497549577403552, pvalue=0.0)

What are the quartiles for tranceptEVE score?

In [20]:
# Quartiles (25%, 50%, 75%) of the predicted score. The outer two are kept as
# thresholds for later cells; the full table is displayed as the cell output.
predicted_quartiles = combined['tranceptEVE_score'].quantile([0.25, 0.5, 0.75])
lower_predicted_Q, _, upper_predicted_Q = predicted_quartiles
predicted_quartiles

0.25   -0.193802
0.50   -0.160501
0.75   -0.129791
Name: tranceptEVE_score, dtype: float64

How many low-function (mean enrichment score ≤ -5) and high-function (mean enrichment score > -5) sequences are there in the lower and upper quartiles of TranceptEVE scores?

In [21]:
# Cross-tabulate predicted-score quartile against measured function.
#   lower quartile: tranceptEVE score <= 25th percentile (inclusive)
#   upper quartile: tranceptEVE score >  75th percentile (exclusive)
#   high function:  noTrp.25I mean enrichment score >  -5
#   low function:   noTrp.25I mean enrichment score <= -5
# Fractions are relative to all rows of `combined`, not to the quartile size.
function_threshold = -5
measured_score = combined['mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I']
in_lower_quartile = combined['tranceptEVE_score'] <= lower_predicted_Q
in_upper_quartile = combined['tranceptEVE_score'] > upper_predicted_Q
is_high_function = measured_score > function_threshold
is_low_function = measured_score <= function_threshold
total_sequences = len(combined)

# number of sequences with trancepteve score in lower quartile and with high function (mean enrichment score > -5)
lower_predicted_high = int((in_lower_quartile & is_high_function).sum())
lower_predicted_high_fraction = lower_predicted_high / total_sequences

# upper quartile, high function
upper_predicted_high = int((in_upper_quartile & is_high_function).sum())
upper_predicted_high_fraction = upper_predicted_high / total_sequences

# lower quartile, low function
lower_predicted_low = int((in_lower_quartile & is_low_function).sum())
lower_predicted_low_fraction = lower_predicted_low / total_sequences

# upper quartile, low function
upper_predicted_low = int((in_upper_quartile & is_low_function).sum())
upper_predicted_low_fraction = upper_predicted_low / total_sequences

# pretty print (every fraction is wrapped in a matching pair of parentheses)
print(
    'Total (fraction) of sequences with\n'
    f'low predicted score and high function: {lower_predicted_high} ({lower_predicted_high_fraction})\n'
    f'high predicted score and high function: {upper_predicted_high} ({upper_predicted_high_fraction})\n'
    f'low predicted score and low function: {lower_predicted_low} ({lower_predicted_low_fraction})\n'
    f'high predicted score and low function: {upper_predicted_low} ({upper_predicted_low_fraction})'
)

Total (fraction) of sequences with
low predicted score and high function: 79 (0.0017026962950190746
high predicted score and high function: 3514 (0.0757376554518611)
low predicted score and low function: 11521 (0.24831346854322478)
high predicted score and low function: 8085 (0.17425695626872428)


In [22]:
# Same measured-vs-predicted density plot, restricted to the `filtered` subset.
filtered_hexbin_options = dict(
    x='mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I',
    y='tranceptEVE_score',
    logz=True,
    cmap='bmy_r',
    height=500,
    width=600,
    gridsize=40,
)
prediction_v_score_filtered = filtered.hvplot.hexbin(**filtered_hexbin_options)

# Hand the figure to the project helper for export, then display it inline.
export_svg_plots([prediction_v_score_filtered], 'prediction_v_score_filtered.html')
prediction_v_score_filtered

In [23]:
# Pearson correlation over all variants (unfiltered), for comparison with the filtered plot above.
measured_score = combined['mean_enrichment_score_TrpB-enrichment-indole_noTrp.25I']
predicted_score = combined['tranceptEVE_score']
pearsonr(measured_score, predicted_score)

PearsonRResult(statistic=0.11639294388607747, pvalue=1.2192589519911162e-139)

In [24]:
# Density of predicted score against the number of nonsynonymous AA mutations,
# with TmTriple overlaid as a black point.
mutation_axes = dict(x='AA_substitutions_nonsynonymous_count', y='tranceptEVE_score')
mutation_density = combined.hvplot.hexbin(logz=True, cmap='bmy_r', gridsize=40, height=500, width=600, xlim=(0,40), **mutation_axes)
tmtriple_marker = tmtriple_combined.hvplot.scatter(c='black', **mutation_axes)
prediction_v_AAmuts = mutation_density * tmtriple_marker

# Print the Pearson correlation between mutation count and predicted score, then show the plot.
mutation_count_correlation = pearsonr(combined['AA_substitutions_nonsynonymous_count'], combined['tranceptEVE_score'])
print(mutation_count_correlation)
prediction_v_AAmuts

PearsonRResult(statistic=-0.7649658814787735, pvalue=0.0)


In [25]:
# Hand the mutation-count figure to the project helper for export.
plots_to_export = [prediction_v_AAmuts]
export_svg_plots(plots_to_export, 'prediction_v_AAmuts.html')

In [26]:
# Pearson correlation between the predicted score and the number of nonsynonymous AA mutations (all variants).
predicted_score = combined['tranceptEVE_score']
mutation_count = combined['AA_substitutions_nonsynonymous_count']
pearsonr(predicted_score, mutation_count)

PearsonRResult(statistic=-0.7649658814787735, pvalue=0.0)

In [27]:
# Predicted score against mutation count, restricted to the `filtered` subset.
filtered_mutation_options = dict(
    x='AA_substitutions_nonsynonymous_count',
    y='tranceptEVE_score',
    logz=True,
    cmap='bmy_r',
    gridsize=40,
    height=500,
    width=600,
    xlim=(0,40),
)
filtered.hvplot.hexbin(**filtered_mutation_options)

In [28]:
# Pearson correlation between predicted score and mutation count within the `filtered` subset.
filtered_predicted = filtered['tranceptEVE_score']
filtered_mutation_count = filtered['AA_substitutions_nonsynonymous_count']
pearsonr(filtered_predicted, filtered_mutation_count)

PearsonRResult(statistic=-0.7525039454367222, pvalue=0.0)

In [29]:
# Store the nonsynonymous mutation count as float64; all other columns keep their dtype.
mutation_count_dtype = {'AA_substitutions_nonsynonymous_count': 'float64'}
combined = combined.astype(mutation_count_dtype)