## Analysis of the Distribution of the Correct Hits ##

In [None]:
import os
import pandas as pd
import numpy as np
import math

%matplotlib inline
import matplotlib.pyplot as plt
from plotly import __version__
print(__version__)

import plotly.express as px
import plotly.graph_objects as go
from plotly.colors import n_colors
from plotly.subplots import make_subplots


from plotly.offline import download_plotlyjs, init_notebook_mode,plot,iplot
init_notebook_mode(connected=True)  #connect with Javascript

import numpy as np
import scipy.special

# Reference listing of the protocol names, kept as one text block: one quoted,
# comma-separated name per line, wrapped in a leading and a trailing newline.
list_protocols = "\n" + ",\n".join(
    repr(protocol_name)
    for protocol_name in (
        'mixed_ali-delim-delim',
        'single_pep-delim-delim',
        'unpaired_ali-delim-delim',
        'mixed_ali-fl-fl',
        'mixed_ali-delim-fl',
        'unpaired_ali-delim-fl',
        'mixed_ali-delim-200',
        'mixed_ali-delim-100',
        'single_pep-delim-100',
        'single_pep-delim-200',
    )
) + "\n"


### Recover the environment variables from the config file ###

In [None]:
import configparser

# Directory that holds config.ini, given relative to the current working
# directory (hard-coded here rather than derived from the script location).
script_dir = "../"
config_path = os.path.join(script_dir, 'config.ini')

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of with
# a KeyError on the first config['DEFAULT'][...] lookup below.
if not config.read(config_path):
    raise FileNotFoundError(
        f"Configuration file not found or unreadable: {os.path.abspath(config_path)}"
    )

# The working directory is taken two levels above the current directory;
# the WORKING_DIR entry of the config file is not used.
WORKING_DIR = os.path.abspath("../..")
DATA_DIR = os.path.abspath(os.path.join(WORKING_DIR, 'data'))

# Every directory below is a config entry resolved relative to DATA_DIR.
CUTMODELS_DIR = os.path.abspath(os.path.join(DATA_DIR, config['DEFAULT']['CUTMODELS_DIR']))
CAPRIEVAL_DIR = os.path.abspath(os.path.join(DATA_DIR, config['DEFAULT']['CAPRIEVAL_DIR']))
REFERENCE_STRUCT_DIR = os.path.abspath(os.path.join(DATA_DIR, config['DEFAULT']['REFERENCE_DIR']))

n_iterations = int(config['DEFAULT']['N_ITERATIONS'])

RESULTS_DIR = os.path.abspath(os.path.join(DATA_DIR, config['DEFAULT']['RESULTS_DIR']))
# OUTPUT_GLOBAL is resolved relative to RESULTS_DIR.
fglobaloutput_path = os.path.abspath(os.path.join(RESULTS_DIR, config['DEFAULT']['OUTPUT_GLOBAL']))


In [None]:
path = RESULTS_DIR
path_figures = os.path.join(path, 'Figures')
# Create the figures directory without going through a shell: this is safe
# for paths containing spaces, creates missing parents, is a no-op when the
# directory already exists, and raises on a real failure instead of hiding it.
os.makedirs(path_figures, exist_ok=True)
# Set to True to also write each figure to disk.
DO_DUMP_FIGURES = False

# Number of header lines at the top of the results file (one per column).
Ncol = 9
result_file = os.path.abspath(os.path.join(RESULTS_DIR, 'Infos_bestModels.out'))
path2file = result_file

# Each header line ends with a whitespace-separated token whose part after
# the first ':' is used as the column name.
list_header = []
with open(path2file) as fin:
    # Read only the first Ncol lines instead of loading the whole file.
    for _, header_line in zip(range(Ncol), fin):
        last_field = header_line.split()[-1]
        list_header.append(last_field.split(':')[1])

# The data rows start after the Ncol header lines and are tab-separated.
df = pd.read_table(path2file, sep="\t", header=None, names=list_header, skiprows=Ncol, low_memory=False)
# 'pdbonly' is the part of index_pdb that follows the leading "<digits>_" prefix.
df['pdbonly'] = df['index_pdb'].str.extract(r'\b\d+_(\w+)$', expand=True)

df

In [None]:
# Unique index_pdb values ordered by their integer prefix (the text before the
# first '_'); ties are broken by the full string.
list_entries_sorted = sorted(
    set(df['index_pdb']),
    key=lambda entry: (int(entry.split('_')[0]), entry),
)
print(list_entries_sorted)
# Unique protocol names present in the table (set order, i.e. unsorted).
list_conditions = list(set(df['protocol']))
print(list_conditions)

In [None]:
# Map every condition name to an (currently empty) extension string.
# The insertion order of the keys defines the order of l_conditions.
d_extension = dict.fromkeys(
    (
        'mixed_ali-delim-delim',
        'single_pep-delim-delim',
        'unpaired_ali-delim-delim',
        'mixed_ali-fl-fl',
        'mixed_ali-delim-fl',
        'unpaired_ali-delim-fl',
        'mixed_ali-delim-100',
        'mixed_ali-delim-200',
        'single_pep-delim-100',
        'single_pep-delim-200',
        '3scores',
        '4scores',
    ),
    '',
)
l_conditions = [condition for condition in d_extension]
print(l_conditions)

In [None]:
# Optional exclusion step: list here the entries to leave out of the analysis,
# each written as <index>_<PDB> (i.e. a value of the index_pdb column).
list_exclude = []

# Drop the listed entries by index label, then turn index_pdb back into a
# regular column.
df_with_index = df.set_index("index_pdb")
df = df_with_index.drop(index=list_exclude).reset_index()

## Analysis of the length distribution of the different sets of models ##

### Analysis of full-length receptors and ligands

In [None]:
# Reload the full results table: `df` is rebound to the plotted subset at the
# end of each plotting cell, so every such cell starts again from the file.
df = pd.read_table(path2file, sep="\t", header=None, names=list_header, skiprows=Ncol)
df['pdbonly'] = df['index_pdb'].str.extract(r'\b\d+_(\w+)$', expand=True)

###### Lines to filter out the excluded PDBs:
df_with_index = df.set_index("index_pdb")
df = df_with_index.drop(list_exclude)
df = df.reset_index()
######

# Rows of the 'mixed_ali-fl-fl' protocol selected by 'best_by_CombinedSCORE',
# ordered by increasing SizeModel (this fixes the bar order on the x axis).
dfbestFL = df.loc[(df['protocol'] == 'mixed_ali-fl-fl') & (df['sorting_method'] == 'best_by_CombinedSCORE')]
dfbestFL = dfbestFL.sort_values(by=['SizeModel'], ascending=[True])

fig = go.Figure()
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(
        family='Arial',
    ),
    yaxis=dict(
        # Axis title size is set through title.font: the former `titlefont`
        # attribute is deprecated and rejected by recent plotly releases.
        title=dict(text='DockQ of best AF2 model', font=dict(size=16)),
        tickfont_size=14,
        showline=True, linewidth=1, linecolor='black', mirror=True,
        showgrid=True, gridwidth=1, gridcolor='LightGrey',
        ticks="outside",
        tickson="boundaries",
        ticklen=5,
    ),
    xaxis=dict(
        title=dict(text='PDB codes sorted by Input Size', font=dict(size=16)),
        tickfont_size=14,
        showline=True, linewidth=1, linecolor='black', mirror=True,
        showgrid=False, gridwidth=1, gridcolor='LightGrey',
        ticks="outside",
        tickson="boundaries",
        ticklen=5,
    ),
)
# Horizontal legend anchored above the top-left corner of the plot area.
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.15,
    xanchor="left",
    x=0.0
))

df = dfbestFL
# Not used in this cell; still bound so the notebook namespace is unchanged.
hovertext = list()

# One bar per PDB code, bar height = DockQ, bar label = SizeModel.
fig.add_trace(go.Bar(
    x=df['pdbonly'],
    y=df['DockQ'],
    name='Full length of Receptor and IDP (IDP FL mixed MSA)',
    marker_color='DarkBlue',
    text=df['SizeModel'],
))

fig.update_traces(textfont_family="Arial", textfont_color="black", textangle=0, textposition="outside", cliponaxis=False)

figname = os.path.join(path, "AnalysesForFigures", "Figures", "DockQVsPDB_CondFL_BarPlot_SizeModelsSorted.svg")
if DO_DUMP_FIGURES:
    # The target directory is not created anywhere else; make sure it exists
    # so that write_image does not fail on a missing folder.
    os.makedirs(os.path.dirname(figname), exist_ok=True)
    fig.write_image(figname)

fig.show()

### Analysis of delimited receptors versus full-length ligands

In [None]:
# Reload the full results table: `df` is rebound to the plotted subset at the
# end of each plotting cell, so every such cell starts again from the file.
df = pd.read_table(path2file, sep="\t", header=None, names=list_header, skiprows=Ncol)
df['pdbonly'] = df['index_pdb'].str.extract(r'\b\d+_(\w+)$', expand=True)

###### Lines to filter out the excluded PDBs:
df_with_index = df.set_index("index_pdb")
df = df_with_index.drop(list_exclude)
df = df.reset_index()
######

# Rows of the 'mixed_ali-delim-fl' protocol selected by 'best_by_CombinedSCORE',
# ordered by increasing SizeModel (this fixes the bar order on the x axis).
# NOTE: the name `dfbestFL` is kept from the full-length cell for
# compatibility, although the protocol plotted here is delim-fl.
dfbestFL = df.loc[(df['protocol'] == 'mixed_ali-delim-fl') & (df['sorting_method'] == 'best_by_CombinedSCORE')]
dfbestFL = dfbestFL.sort_values(by=['SizeModel'], ascending=[True])

fig = go.Figure()
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(
        family='Arial',
    ),
    yaxis=dict(
        # Axis title size is set through title.font: the former `titlefont`
        # attribute is deprecated and rejected by recent plotly releases.
        title=dict(text='DockQ of best AF2 model', font=dict(size=16)),
        tickfont_size=14,
        showline=True, linewidth=1, linecolor='black', mirror=True,
        showgrid=True, gridwidth=1, gridcolor='LightGrey',
        ticks="outside",
        tickson="boundaries",
        ticklen=5,
    ),
    xaxis=dict(
        title=dict(text='PDB codes sorted by Input Size', font=dict(size=16)),
        tickfont_size=14,
        showline=True, linewidth=1, linecolor='black', mirror=True,
        showgrid=False, gridwidth=1, gridcolor='LightGrey',
        ticks="outside",
        tickson="boundaries",
        ticklen=5,
    ),
)
# Horizontal legend anchored above the top-left corner of the plot area.
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.15,
    xanchor="left",
    x=0.0
))

df = dfbestFL
# Not used in this cell; still bound so the notebook namespace is unchanged.
hovertext = list()

# One bar per PDB code, bar height = DockQ, bar label = SizeModel.
# The trace label now matches the protocol plotted in this cell; it used to
# carry the "Full length of Receptor and IDP" text of the full-length cell.
fig.add_trace(go.Bar(
    x=df['pdbonly'],
    y=df['DockQ'],
    name='Delimited Receptor and full-length IDP (IDP FL mixed MSA)',
    marker_color='DarkBlue',
    text=df['SizeModel'],
))

fig.update_traces(textfont_family="Arial", textfont_color="black", textangle=0, textposition="outside", cliponaxis=False)

figname = os.path.join(path, "AnalysesForFigures", "Figures", "DockQVsPDB_ConddelFL_BarPlot_SizeModelsSorted.svg")
if DO_DUMP_FIGURES:
    # The target directory is not created anywhere else; make sure it exists
    # so that write_image does not fail on a missing folder.
    os.makedirs(os.path.dirname(figname), exist_ok=True)
    fig.write_image(figname)

fig.show()

### Analysis of delimited receptor and ligand

In [None]:
# Reload the full results table: `df` is rebound to the plotted subset at the
# end of each plotting cell, so every such cell starts again from the file.
df = pd.read_table(path2file, sep="\t", header=None, names=list_header, skiprows=Ncol)
df['pdbonly'] = df['index_pdb'].str.extract(r'\b\d+_(\w+)$', expand=True)

###### Lines to filter out the excluded PDBs:
df_with_index = df.set_index("index_pdb")
df = df_with_index.drop(list_exclude)
df = df.reset_index()
######

# Rows of the 'mixed_ali-delim-delim' protocol selected by
# 'best_by_CombinedSCORE', ordered by increasing SizeModel (this fixes the bar
# order on the x axis).
# NOTE: the name `dfbestFL` is kept from the full-length cell for
# compatibility, although the protocol plotted here is delim-delim.
dfbestFL = df.loc[(df['protocol'] == 'mixed_ali-delim-delim') & (df['sorting_method'] == 'best_by_CombinedSCORE')]
dfbestFL = dfbestFL.sort_values(by=['SizeModel'], ascending=[True])

fig = go.Figure()
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(
        family='Arial',
    ),
    yaxis=dict(
        # Axis title size is set through title.font: the former `titlefont`
        # attribute is deprecated and rejected by recent plotly releases.
        title=dict(text='DockQ of best AF2 model', font=dict(size=16)),
        tickfont_size=14,
        showline=True, linewidth=1, linecolor='black', mirror=True,
        showgrid=True, gridwidth=1, gridcolor='LightGrey',
        ticks="outside",
        tickson="boundaries",
        ticklen=5,
    ),
    xaxis=dict(
        title=dict(text='PDB codes sorted by Input Size', font=dict(size=16)),
        tickfont_size=14,
        showline=True, linewidth=1, linecolor='black', mirror=True,
        showgrid=False, gridwidth=1, gridcolor='LightGrey',
        ticks="outside",
        tickson="boundaries",
        ticklen=5,
    ),
)
# Horizontal legend anchored above the top-left corner of the plot area.
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.15,
    xanchor="left",
    x=0.0
))

df = dfbestFL
# Not used in this cell; still bound so the notebook namespace is unchanged.
hovertext = list()

# One bar per PDB code, bar height = DockQ, bar label = SizeModel.
# The trace label now matches the protocol plotted in this cell; it used to
# carry the "Full length of Receptor and IDP" text of the full-length cell.
fig.add_trace(go.Bar(
    x=df['pdbonly'],
    y=df['DockQ'],
    name='Delimited Receptor and delimited IDP (mixed MSA)',
    marker_color='DarkBlue',
    text=df['SizeModel'],
))

fig.update_traces(textfont_family="Arial", textfont_color="black", textangle=0, textposition="outside", cliponaxis=False)

figname = os.path.join(path, "AnalysesForFigures", "Figures", "DockQVsPDB_CondDELIM_BarPlot_SizeModelsSorted.svg")
if DO_DUMP_FIGURES:
    # The target directory is not created anywhere else; make sure it exists
    # so that write_image does not fail on a missing folder.
    os.makedirs(os.path.dirname(figname), exist_ok=True)
    fig.write_image(figname)

fig.show()