In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import glob
import seaborn as sns
import matplotlib.pylab as plt
import scipy

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools

import oddt
from oddt.scoring.descriptors import *


# Read in all the structures and create autodock vina descirptors:

In [4]:
filenames_ligand = glob.glob('Data/MPro_2021_03/aligned/Mpro-*/*.sdf')
filenames_protein = glob.glob('Data/MPro_2021_03/aligned/Mpro-*/*_apo-desolv.pdb')

allcompounds = pd.read_excel('Data/MPro_2021_03/Mpro_compound_tracker_excel.xlsx')
print(allcompounds.shape)

#Filter out covalent binders:
ignore = ['Ugi', 'piperazine-chloroacetamide']
allcompounds = allcompounds[~allcompounds['Postera series'].isin(ignore)]
print(allcompounds.shape)

#Filter downto only these that have a structure determined:
compounds = pd.Series([f.split('/')[-1].split('_')[0] for f in filenames_protein])
allcompounds['structure'] = allcompounds['Dataset'].isin(compounds)
allcompounds_final = allcompounds[allcompounds['structure'] == True]
allcompounds_final['filename_start'] = ['MPro_2021_03/aligned/' + str(x) for x in allcompounds_final['Dataset']]
allcompounds_final = allcompounds_final.reset_index(drop = True)
allcompounds_final['protein_filename'] = [str(x) + '_0A/' + x.split('/')[-1] +  '_0A_apo-desolv.pdb' for x in allcompounds_final['filename_start']]
allcompounds_final['ligand_filename'] = [str(x) + '_0A/' + x.split('/')[-1] +  '_0A.sdf' for x in allcompounds_final['filename_start']]
allcompounds_final.shape

(3180, 22)
(2943, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  allcompounds_final['filename_start'] = ['MPro_2021_03/aligned/' + str(x) for x in allcompounds_final['Dataset']]


(330, 26)

In [None]:
AV = autodock_vina_descriptor()
AVDs_all = np.zeros((allcompounds_final.shape[0], 6))
for i in range(0, allcompounds_final['ligand_filename'].shape[0]):
    ligand_sdf = next(oddt.toolkit.readfile('sdf', allcompounds_final['ligand_filename'].iloc[i]))
    protein = next(oddt.toolkit.readfile('pdb', allcompounds_final['protein_filename'].iloc[i]))
    protein.protein = True
    AVDs_all[i,:] = AV.build(protein = protein, ligands = ligand_sdf)
AVD_df = pd.DataFrame(AVDs_all)
full_df_AV = pd.concat([allcompounds_final, AVD_df], axis = 1)
full_df_AV.to_csv('autodock_vina_structures.csv', sep = '\t')

# Create also Autodock vina descriptors for all the docked structures:

In [None]:


'''
OV = oddt_vina_descriptor()
OVDs_all = np.zeros((allcompounds_final.shape[0], 12))
for i in range(0, allcompounds_final['ligand_filename'].shape[0]):
    ligand_sdf = next(oddt.toolkit.readfile('sdf', allcompounds_final['ligand_filename'].iloc[i]))
    protein = next(oddt.toolkit.readfile('pdb', allcompounds_final['protein_filename'].iloc[i]))
    protein.protein = True
    OVDs_all[i,:] = OV.build(protein = protein, ligands = ligand_sdf)
OVD_df = pd.DataFrame(OVDs_all)
full_df_OV = pd.concat([allcompounds_final, OVD_df], axis = 1)
full_df_OV.to_csv('oddt_vina_descriptors.csv', sep = '\t')
'''