In [87]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import table
from collections import Counter
import numpy as np
import scipy as sp
%matplotlib inline

In [166]:
data = pd.read_csv("data_eda.csv", usecols = ["Molecule ChEMBL ID", "Smiles", "Standard Value", 
                                                                "Standard Units", "Target ChEMBL ID", 
                                                                "Standard Relation", "Target Name"])

In [167]:
data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Relation,Standard Value,Standard Units,Target ChEMBL ID,Target Name
0,CHEMBL77814,N#CC(C#N)=CNc1ccc(O)cc1,'=',350000.0,nM,CHEMBL203,Epidermal growth factor receptor erbB1
1,CHEMBL113356,Cn1c(=O)c(-c2c(Cl)cccc2Cl)cc2cnc(NCc3ccccc3)nc21,'>',50000.0,nM,CHEMBL203,Epidermal growth factor receptor erbB1
2,CHEMBL60200,Cc1ccc2c(c1)c(C(=O)Nc1ccccc1)c(SSc1c(C(=O)Nc3c...,'>',100000.0,nM,CHEMBL203,Epidermal growth factor receptor erbB1
3,CHEMBL540096,CCOC(=O)c1ccc(OC(=O)c2ccc(NC(=N)N)cc2)cc1.Cl,'=',1.6,nM,CHEMBL209,Trypsin I
4,CHEMBL53463,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)...,'=',316.23,nM,CHEMBL614725,MIA PaCa-2


In [168]:
data.shape

(26266, 7)

### Drop NA rows

In [169]:
data.dropna(inplace=True)

In [170]:
# Na values
print("Columns which have empty values: \n\n{} ".format(data.isna().sum()))

Columns which have empty values: 

Molecule ChEMBL ID    0
Smiles                0
Standard Relation     0
Standard Value        0
Standard Units        0
Target ChEMBL ID      0
Target Name           0
dtype: int64 


In [171]:
data.shape

(24763, 7)

## Clean by Standard Value and Standard Units

In [172]:
def print_units(df):
    # List Standard units used
    sv_units = set(df["Standard Units"].tolist())
    print("All standard value units, {}".format(str(sv_units)))



In [173]:
print_units(data)

All standard value units, {'/uM', 'ug/g', 'ug.mL-1', 'nM', "10'7nM"}


In [174]:
sv_data =  data[data['Standard Units'].isin(["/uM", "nM"])]


# Check units again
print_units(sv_data)

All standard value units, {'/uM', 'nM'}


In [175]:
def convert_to_nm(r):
    unit = r["Standard Units"]
    factor = conversion[unit]
    r["Standard Value"] = r["Standard Value"] * factor
    r["Standard Units"] = "nM"

    return r


In [176]:
conversion = {"/uM": 1000, "nM": 1}
sv_data = sv_data.apply(convert_to_nm, axis=1)

In [177]:
print_units(sv_data)

All standard value units, {'nM'}


In [178]:
sv_data.shape

(24559, 7)

## Clean by Compounds

In [179]:
## Keep compounds with atleast 5 data points
vc_mol = sv_data['Molecule ChEMBL ID'].value_counts().rename_axis('unique_values').reset_index(name='counts')

vc_mol = vc_mol[vc_mol["counts"] >= 5]
compound_ids = vc_mol["unique_values"].tolist()
compound_ids

['CHEMBL939',
 'CHEMBL553',
 'CHEMBL3353410',
 'CHEMBL554',
 'CHEMBL388978',
 'CHEMBL29197',
 'CHEMBL3545308',
 'CHEMBL1229592',
 'CHEMBL285063',
 'CHEMBL1173655',
 'CHEMBL53463',
 'CHEMBL2011291',
 'CHEMBL607707',
 'CHEMBL1879463',
 'CHEMBL573339',
 'CHEMBL24828',
 'CHEMBL52765',
 'CHEMBL535',
 'CHEMBL1094195',
 'CHEMBL31965',
 'CHEMBL1098120',
 'CHEMBL1091644',
 'CHEMBL1873475',
 'CHEMBL401930',
 'CHEMBL276711',
 'CHEMBL301018',
 'CHEMBL296407',
 'CHEMBL201511',
 'CHEMBL180022',
 'CHEMBL343352',
 'CHEMBL98350',
 'CHEMBL56543',
 'CHEMBL2178352',
 'CHEMBL56393',
 'CHEMBL1762178',
 'CHEMBL3126485',
 'CHEMBL1336',
 'CHEMBL601719',
 'CHEMBL53753',
 'CHEMBL1614712',
 'CHEMBL53711',
 'CHEMBL1421',
 'CHEMBL502835',
 'CHEMBL2172463',
 'CHEMBL1201179',
 'CHEMBL328216',
 'CHEMBL1762148',
 'CHEMBL30973',
 'CHEMBL439259',
 'CHEMBL148674',
 'CHEMBL413',
 'CHEMBL521851',
 'CHEMBL3699588',
 'CHEMBL545315',
 'CHEMBL39337',
 'CHEMBL281300',
 'CHEMBL91867',
 'CHEMBL283682',
 'CHEMBL98',
 'CHEMBL1230609

In [180]:
mol_data =  sv_data[sv_data['Molecule ChEMBL ID'].isin(compound_ids)]
mol_data.shape

(1460, 7)

## Clean by Targets

In [181]:
# Select targets with more than 30 values
vc_targets = mol_data['Target ChEMBL ID'].value_counts().rename_axis('unique_values').reset_index(name='counts')
vc_targets = vc_targets[vc_targets["counts"] >= 30]
target_ids = vc_targets["unique_values"].tolist()
target_ids

['CHEMBL203', 'CHEMBL1957', 'CHEMBL2842', 'CHEMBL614725', 'CHEMBL1955']

In [182]:
target_data =  mol_data[mol_data['Target ChEMBL ID'].isin(target_ids)]
target_data.shape

(1419, 7)

## Clean by Targets and Compounds interactions

In [183]:
target_data['COUNTER_INT'] =1 
group_data = target_data.groupby(['Molecule ChEMBL ID','Target ChEMBL ID'])['COUNTER_INT'].sum()
interaction_count = group_data.value_counts().sort_index().index.tolist()
interaction_vals = group_data.value_counts().sort_index().values.tolist()

interaction_vals

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


[44,
 23,
 11,
 29,
 47,
 21,
 17,
 4,
 3,
 4,
 3,
 2,
 3,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [184]:
target_mol_inter = group_data.to_frame()
counter_values = target_mol_inter[target_mol_inter['COUNTER_INT'] >= 10]
list_interactions = list(set(counter_values.index)) 
a_list = [a_tuple[1:] for a_tuple in list_interactions]

targets = []

for t in set(a_list):
    targets.append(t[0])

print('Targets with greater than 10 interactions are \n\n{} '.format(targets))

Targets with greater than 10 interactions are 

['CHEMBL203', 'CHEMBL614725', 'CHEMBL2842', 'CHEMBL1957'] 


In [185]:
target_mol_data =  target_data[target_data['Target ChEMBL ID'].isin(targets)]
target_mol_data.shape

(1381, 8)