In [1]:
import os
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

In [2]:
# Root output folder read and written by the cells below.
# Set the CONSENSUS_DOCKING_OUTPUT environment variable to point the notebook
# at another location; when it is unset the original absolute path is used.
path_to_output = os.environ.get(
    'CONSENSUS_DOCKING_OUTPUT',
    '/home/bsccns/Documents/PhD/consensus/benchmark/consensus_docking/unbound_high_affinity/output/',
)
# Docking programs to analyse; each name is used to build the
# '<program>_filter.txt' and 'rmsd_<program>.csv' file names.
programs_list = ['frodock', 'ftdock','patchdock','piper','rosetta']

## Violin plot showing filtering efficacy

In [3]:
# Input locations and the RMSD cutoff at or below which a pose is flagged
# as near-native.
path_to_rmsd_folder = os.path.join(path_to_output, 'analysis', 'rmsd')
path_to_filter_folder = os.path.join(path_to_output, 'filter')
near_native_threshold = 5.

# Collect the ids of every pose listed in any per-program filter file.
filtered_ids_list = []
for program in programs_list:
    filter_file = os.path.join(path_to_filter_folder, f'{program}_filter.txt')
    filter_df = pd.read_csv(filter_file, names=['ids','distance'], sep=' ')
    # Keep only the text before the first '.'; n=1 means an id containing
    # several dots no longer breaks the cell.
    filtered_ids_list += filter_df['ids'].str.split('.', n=1).str[0].tolist()

# Read one RMSD table per program and concatenate them in a single call
# instead of growing the frame inside the loop. Each file's own row labels
# are kept, so labels repeat across programs.
rmsd_frames = []
for program in programs_list:
    rmsd_file = os.path.join(path_to_rmsd_folder, f'rmsd_{program}.csv')
    rmsd_frames.append(pd.read_csv(rmsd_file, names=['ids','rmsd']))
rmsd_df = pd.concat(rmsd_frames, axis=0)

# The program name is the part of the id before the first '_'; it is taken
# before the id is trimmed at its first '.'.
rmsd_df['program'] = rmsd_df['ids'].str.split('_', n=1).str[0]
rmsd_df['ids'] = rmsd_df['ids'].str.split('.', n=1).str[0]

# Both comparisons already yield boolean columns, so no np.where is needed.
rmsd_df['isFiltered'] = rmsd_df['ids'].isin(set(filtered_ids_list))
rmsd_df['isNearNative'] = rmsd_df['rmsd'] <= near_native_threshold
rmsd_df

Unnamed: 0,ids,rmsd,program,isFiltered,isNearNative
0,frodock_105483,94.507146,frodock,False,False
1,frodock_42803,56.252583,frodock,False,False
2,frodock_119983,105.194364,frodock,False,False
3,frodock_126759,29.254339,frodock,True,False
4,frodock_98859,65.873063,frodock,False,False
...,...,...,...,...,...
70357,rosetta_61392,36.674709,rosetta,False,False
70358,rosetta_01157,74.863291,rosetta,False,False
70359,rosetta_58035,73.326400,rosetta,False,False
70360,rosetta_65585,73.435560,rosetta,False,False


In [None]:
# Split-violin comparison with one violin per docking program: the left half
# is drawn from the poses flagged as filtered, the right half from every pose
# in rmsd_df.
filtered_poses = rmsd_df[rmsd_df['isFiltered'] == True]

violin_halves = [
    dict(data=filtered_poses, label='Filtered', scale='Yes',
         side='negative', color='lightseagreen'),
    dict(data=rmsd_df, label='Not filtered', scale='No',
         side='positive', color='mediumpurple'),
]

fig = go.Figure()
for half in violin_halves:
    fig.add_trace(
        go.Violin(
            x=half['data']['program'],
            y=half['data']['rmsd'],
            legendgroup=half['label'],
            scalegroup=half['scale'],
            name=half['label'],
            side=half['side'],
            line_color=half['color'],
            points=False,
        )
    )

# Draw the mean line inside every half-violin.
fig.update_traces(meanline_visible=True)

# Overlay the two halves with no gap, fix the axis ranges and set the labels.
fig.update_layout(
    violingap=0,
    violinmode='overlay',
    yaxis_range=[0, 120],
    xaxis_range=[-0.5, 4.5],
    title="Comparison of filtered and non-filtered poses L-RMSD",
    xaxis_title="Program",
    yaxis_title="L-RMSD (Å)",
    font=dict(family="Arial, monospace", size=18, color="Black"),
)

# Static export of the figure.
fig.write_image(f"{path_to_output}/analysis/violin_rmsd_distribution_per_program.png")
# Interactive version
#fig.write_html(f"{path_to_output}/analysis/violin_rmsd_distribution_per_program.html")
fig.show()

In [None]:
# Split violin for the whole dataset pooled into a single category.
# The category label is built locally instead of being written to rmsd_df as
# a temporary column, so the shared frame is untouched even if the cell stops
# part-way (for instance when the image export fails).
system_label = 'Unbound High Affinity'
is_filtered = rmsd_df['isFiltered'] == True
filtered_rmsd = rmsd_df['rmsd'][is_filtered]
all_rmsd = rmsd_df['rmsd']

fig = go.Figure()

# Left half: poses flagged as filtered.
fig.add_trace(go.Violin(x=[system_label] * len(filtered_rmsd),
                        y=filtered_rmsd,
                        legendgroup='Filtered', scalegroup='Yes', name='Filtered',
                        side='negative',
                        line_color='lightseagreen',
                        points=False)
             )
# Right half: every pose in rmsd_df.
fig.add_trace(go.Violin(x=[system_label] * len(all_rmsd),
                        y=all_rmsd,
                        legendgroup='Not filtered', scalegroup='No', name='Not filtered',
                        side='positive',
                        line_color='mediumpurple',
                        points=False)
             )

fig.update_traces(meanline_visible=True)
fig.update_layout(violingap=0, violinmode='overlay')
fig.update_layout(yaxis_range=[0,120],xaxis_range=[-0.5,0.5])

fig.update_layout(
    title="Comparison of filtered and non-filtered poses L-RMSD",
    xaxis_title="System",
    yaxis_title="L-RMSD (Å)",
    font=dict(
        family="Arial, monospace",
        size=18,
        color="Black"
    )
)

fig.write_image(f"{path_to_output}/analysis/violin_rmsd_distribution_full_dataset.png")
# Interactive version
#fig.write_html(f"{path_to_output}/analysis/violin_rmsd_distribution_full_dataset.html")
# fig.show() is the last statement so the cell renders only the figure.
fig.show()

In [6]:
# Pose counts per (program, near-native flag), restricted to filtered poses.
near_native_counts_filt = (
    rmsd_df
    .value_counts(subset=['program', 'isNearNative', 'isFiltered'], ascending=True)
    .to_frame(name='Counts')
    .reset_index()
)
# .copy() makes this an independent frame, so adding the percentage column
# below does not write into a slice (no SettingWithCopyWarning).
near_native_counts_filt = near_native_counts_filt[near_native_counts_filt['isFiltered'] == True].copy()

# Same counts over every pose, regardless of the filter flag.
near_native_counts_nofilt = (
    rmsd_df
    .value_counts(subset=['program', 'isNearNative'], ascending=True)
    .to_frame(name='Counts')
    .reset_index()
)

# Denominators: all filtered poses and all poses, summed over every program.
n_filtered_poses = near_native_counts_filt['Counts'].sum()
n_total_poses = len(rmsd_df)

near_native_counts_filt['Near Native Percentage'] = near_native_counts_filt['Counts'] * 100 / n_filtered_poses
near_native_counts_nofilt['Near Native Percentage'] = near_native_counts_nofilt['Counts'] * 100 / n_total_poses

# Tag the unfiltered rows so both tables have the same columns.
near_native_counts_nofilt['isFiltered'] = False

# No `on=` is given, so the outer merge joins on every shared column; both
# frames have identical column sets, which makes this the union of their rows.
near_native_counts = pd.merge(near_native_counts_filt,near_native_counts_nofilt, how='outer')

# Replace the booleans of both flag columns with text labels. The same mapping
# is applied to 'isNearNative', so a near-native row reads 'Filtered' there;
# the bar-plot cell selects rows with isNearNative == 'Filtered'.
bool_labels = {True: 'Filtered', False: 'Not Filtered'}
for flag_column in ['isNearNative', 'isFiltered']:
    near_native_counts[flag_column] = near_native_counts[flag_column].map(bool_labels)

In [None]:
# Keep only the near-native rows. The cell that builds near_native_counts
# relabels the booleans of 'isNearNative' with the same mapping as
# 'isFiltered', so a near-native row is one whose 'isNearNative' reads 'Filtered'.
near_native_rows = near_native_counts[near_native_counts['isNearNative'] == 'Filtered']

# Grouped bars per program, coloured by the filter label. The colour map is
# keyed on the values actually stored in 'isFiltered' ('Filtered' /
# 'Not Filtered'); the former 'True' / 'False' keys never matched, so the
# chosen colours were not applied.
fig = px.bar(near_native_rows, x='program',
             y='Near Native Percentage', title="Enrichment of Near Native Poses by filtering",
             color='isFiltered', barmode='group',
             color_discrete_map={'Not Filtered': 'mediumpurple', 'Filtered': 'lightseagreen'},
             opacity=0.5)

fig.update_layout(
    xaxis_title="Program",
    font=dict(
        family="Arial, monospace",
        size=18,
        color="Black"
    )
)

fig.update_layout(legend_title_text='')
# Hard-coded y-axis limits; bars above 0.0575 would be clipped.
fig.update_layout(yaxis_range=[0,0.0575])

fig.write_image(f"{path_to_output}/analysis/barplot_near_native_percentage_per_program.png")

fig.show()

In [8]:
# Display the merged per-program table of counts and near-native percentages.
near_native_counts

Unnamed: 0,program,isNearNative,isFiltered,Counts,Near Native Percentage
0,rosetta,Filtered,Filtered,1,0.000578
1,ftdock,Filtered,Filtered,3,0.001734
2,patchdock,Filtered,Filtered,23,0.013292
3,frodock,Filtered,Filtered,44,0.025429
4,piper,Filtered,Filtered,73,0.042188
5,rosetta,Not Filtered,Filtered,16764,9.688269
6,patchdock,Not Filtered,Filtered,27747,16.035577
7,ftdock,Not Filtered,Filtered,30154,17.426633
8,piper,Not Filtered,Filtered,48717,28.154582
9,frodock,Not Filtered,Filtered,49508,28.611718


In [9]:
# List every pose flagged as near-native, lowest RMSD first.
near_native_poses = rmsd_df[rmsd_df['isNearNative'] == True]
near_native_poses.sort_values(by='rmsd')

Unnamed: 0,ids,rmsd,program,isFiltered,isNearNative
117032,frodock_123931,0.936967,frodock,True,True
129095,frodock_123930,1.139134,frodock,True,True
55690,patchdock_21020,1.437627,patchdock,True,True
28281,patchdock_19200,1.468920,patchdock,True,True
42811,piper_21,1.549657,piper,True,True
...,...,...,...,...,...
32805,piper_534,4.918875,piper,True,True
131770,frodock_124567,4.921012,frodock,True,True
65732,piper_7995,4.967187,piper,True,True
49915,patchdock_18162,4.984303,patchdock,True,True
