# FXa protein: Conformations Metadata
## Data load and preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%run ../modules/run_or_load_decorator.py
%run ../modules/plotting_metrics.py

In [2]:
%run ./1_Helper_functions.ipynb

## Load the data
### Ensemble docking scores

In [3]:
# Load the merged docking-score table and split off the `activity` column
protein_name = 'fxa'
file_name = (
    '../../../FXa/ANALISIS/6_Machine_Learning_Models/'
    'df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
)
X_merged_dksc = pd.read_pickle(file_name)
y_true_merged = X_merged_dksc['activity']
X_merged_dksc = X_merged_dksc.drop(columns='activity')
X_merged_dksc.shape

# Plain arrays under shorter names
X = X_merged_dksc.values
y = y_true_merged.values

# Ratio of the summed `activity` values to the number of rows, two decimals
n_actives = y.sum()
R_a = round(n_actives / len(y), 2)
print(X.shape)
print(n_actives)
print('Ra =', R_a)

(6233, 136)
300
Ra = 0.05


### Protein conformations metadata

In [4]:
# Load the protein conformations metadata table from its JSON file
path_to_file = (
    '../../../FXa/ANALISIS/1_Fetching_and_generating_data/'
    'TABLA_MTDATA_FXA_136_crys_LIGS_INFO.json'
)
df_prot = pd.read_json(path_to_file)

#### Protein volume

In [5]:
# Pocket volume taken from the POVME output.
# The file is parsed by hand so that the SASA column is kept as well.
povme_output = '../../../FXa/FILES/CRYSTALS/TRAJ_CRYSTALS_PDB/povme/FXA_POCKET_RES_align/res_output.txt'
povme_columns = ['pdb_id', 'volumen', 'surf_area']
df_pocket = pd.read_csv(
    povme_output,
    sep='|',
    header=None,
    names=povme_columns,
    index_col=0,
    skiprows=312,
    skipfooter=3,
    engine='python',
)

# Re-label the POVME rows with the metadata index (assigned positionally,
# so both tables must have the same number of rows)
df_pocket.index = df_prot.index
# Inner-join the metadata with the pocket descriptors on that shared index
df_pocket = df_prot.merge(df_pocket, how='inner', left_index=True, right_index=True)

FileNotFoundError: [Errno 2] No such file or directory: '../../../FXa/FILES/CRYSTALS/TRAJ_CRYSTALS_PDB/povme/FXA_POCKET_RES_align/res_output.txt'

#### Apo or holo conformation
Only with respect to the defined pocket

In [None]:
# `NumLigs` > 0 -> 'holo', anything else -> 'apo'
df_pocket['Apo'] = np.where(df_pocket['NumLigs'] > 0, 'holo', 'apo')

# Tally of each label
df_pocket['Apo'].value_counts()

#### Whether the protein is bound to other protein entities

In [None]:
# `Entities` > 1 -> 'bound', anything else -> 'single'
df_pocket['Single Entity'] = np.where(df_pocket['Entities'] > 1, 'bound', 'single')

#### Fill Inhibitor mass column with 0s

In [None]:
# Coerce the inhibitor mass to a numeric dtype, then replace missing values with 0
inhib_mass = pd.to_numeric(df_pocket['Inhib_mass'])
df_pocket['Inhib_mass'] = inhib_mass.fillna(0)

#### Fill missing Resolution values with 0, if needed

In [None]:
# Coerce the resolution to a numeric dtype, then replace missing values with 0
# NOTE(review): a filled 0 cannot be told apart from a real value in the
# correlations computed later -- confirm this is intended
resolution = pd.to_numeric(df_pocket['Resolution'])
df_pocket['Resolution'] = resolution.fillna(0)

#### Get the performance measures and append them

In [None]:
# ROC-AUC computed from the docking-score table
metric_params = dict(metric_name='roc_auc')
roc_metric = PlotMetric(y, X_merged_dksc.to_dict('list'), decreasing=True)
roc_auc = roc_metric.format_metric_results(rounded=5, **metric_params)

# Normalized enrichment factor at fraction 0.12
metric_params = dict(metric_name='ef', fraction=0.12, method='normalized')
nef_metric = PlotMetric(y, X_merged_dksc.to_dict('list'), decreasing=True)
nef_012 = nef_metric.format_metric_results(rounded=5, **metric_params)

# Inner-join the pocket/metadata table with both metric tables
df_merged = df_pocket.join([roc_auc, nef_012], how='inner')

### Keep and rename important columns

In [None]:
# Columns to keep (keys, in order) and the label each one gets (values)
column_labels = {
    'Resolution': 'Resolution',
    'Inhib_mass': 'Inhib. MW',
    'volumen': 'Pk. Volume',
    'surf_area': 'Pk. Surf. Area',
    'Apo': 'Apo',
    'Single Entity': 'Single Entity',
    'ROC AUC': 'AUC-ROC',
    'EF': 'NEF',
}
last_vars = list(column_labels)
new_names = list(column_labels.values())
df_final = df_merged[last_vars].rename(columns=column_labels)
df_final.columns

In [None]:
# Save the dataframe
# NOTE(review): `run_or_load` comes from ../modules/run_or_load_decorator.py,
# which is not visible here; presumably it stores the returned object under
# `filename` (or reloads it when that file already exists) -- confirm there.
@run_or_load
def save_df(filename, df):
    """Return `df` unchanged; the function body itself performs no I/O."""
    return df

# Save it
df_final = save_df(f'./TABLE_Confs_Features_and_performances_{protein_name}.pkl', df_final)

# Correlations

In [None]:
from scipy.stats import pearsonr, spearmanr
import pandas as pd

def calculate_pvalues(df, func=spearmanr):
    """Return the table of pairwise p-values between the numeric columns of `df`.

    Rows with any missing value are dropped first; only numeric (and boolean)
    columns are then kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; non-numeric columns are ignored.
    func : callable, default scipy.stats.spearmanr
        Called as ``func(x, y)``; element ``[1]`` of its result is used as
        the p-value.

    Returns
    -------
    pandas.DataFrame
        Square float table whose index and columns are the numeric column
        names, with every p-value rounded to 4 decimals.
    """
    numeric = df.dropna().select_dtypes(include=['number', 'bool'])
    cols = numeric.columns
    pvalues = pd.DataFrame(index=cols, columns=cols, dtype=float)
    for r in cols:
        for c in cols:
            # One .loc write: a chained `pvalues[r][c] = ...` assignment is
            # silently lost when pandas copy-on-write is active.
            pvalues.loc[c, r] = round(func(numeric[r], numeric[c])[1], 4)
    return pvalues

def corrfunc(x, y, func=spearmanr, **kws):
    """Write the correlation coefficient of `x` and `y` onto the current axes.

    `func(x, y)` must return a pair whose first item is the coefficient.
    Extra keyword arguments are accepted and ignored.
    """
    coefficient, _pvalue = func(x, y)
    axes = plt.gca()
    label = r"$\rho$ = {:.2f}".format(coefficient)
    # Position is given in axes-fraction coordinates (upper-left area)
    axes.annotate(label, xy=(.1, .9), xycoords=axes.transAxes, fontsize=14)

In [None]:
# Show the column labels of the final table
df_final.columns

In [None]:
# Spearman correlation matrix, restricted to the numeric columns: df_final also
# holds the string-valued 'Apo' and 'Single Entity' columns, and DataFrame.corr
# raises on non-numeric data since pandas 2.0 (numeric_only defaults to False).
display(df_final.select_dtypes(include=['number', 'bool']).corr(method='spearman'))
# Matching table of p-values (calculate_pvalues keeps only numeric columns itself)
display(calculate_pvalues(df_final))

In [None]:
# Lower-triangle pair plot with histograms on the diagonal
scatter_style = {'alpha': 0.4, 's': 30, 'edgecolor': 'k', 'linewidth': 0}
g = sns.pairplot(df_final, corner=True, diag_kind="hist", plot_kws=scatter_style)
# Density contours on the off-diagonal panels, KDE curve on the diagonal
g.map_lower(sns.kdeplot, levels=8, color=".5", linewidths=1)
g.map_diag(sns.kdeplot, color='.5')
# Annotate every lower-triangle panel with its correlation coefficient
g.map_lower(corrfunc)
plt.show()

In [None]:
# Same pair plot, with points coloured by the 'Single Entity' label
scatter_style = {'alpha': 0.4, 's': 30, 'edgecolor': 'k', 'linewidth': 0}
g = sns.pairplot(df_final, hue='Single Entity', corner=True, diag_kind="hist",
                 plot_kws=scatter_style)
# Density contours on the off-diagonal panels, KDE curves on the diagonal
g.map_lower(sns.kdeplot, levels=8, color=".5", linewidths=1)
g.map_diag(sns.kdeplot)
# The correlation annotation (g.map_lower(corrfunc)) is left out of this figure
plt.show()