In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [29]:
# using rdkit convert smiles to fingerprint 
def fingerprints_from_smiles(smiles, radius=2, nbits=1024):
    """Convert a Series of SMILES strings into Morgan fingerprint bit columns.

    Parameters
    ----------
    smiles : pandas.Series
        SMILES strings. Index labels are carried over to the result and do
        not have to be unique.
    radius : int, default 2
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    nbits : int, default 1024
        Length of the bit vector; also the number of ``Bit_*`` columns.

    Returns
    -------
    fingerprints : pandas.DataFrame
        One row per successfully converted entry, in input order, with
        columns ``Bit_0`` .. ``Bit_{nbits-1}`` and the entry's original
        index label.
    fails : list
        The SMILES values that could not be converted.
    """
    all_fingerprints = []
    all_indexes = []
    fails = []

    # Walk (label, value) pairs instead of looking each value up by label:
    # ``smiles[label]`` returns a whole Series when a label occurs more than
    # once, which made every such row fail in the previous implementation.
    for label, mol_smile in smiles.items():
        try:
            mol = Chem.MolFromSmiles(mol_smile)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                raise ValueError(f'could not parse SMILES: {mol_smile!r}')
            fp = np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nbits))
        except Exception:
            # Best-effort conversion: record the failure and carry on, but
            # let KeyboardInterrupt / SystemExit propagate.
            fails.append(mol_smile)
            print(label)
            continue
        all_fingerprints.append(fp)
        all_indexes.append(label)

    col_name = [f'Bit_{i}' for i in range(nbits)]
    # The reshape keeps the (rows, nbits) layout even when nothing converted.
    bit_matrix = np.array(all_fingerprints).reshape(len(all_fingerprints), nbits)
    fingerprints = pd.DataFrame(bit_matrix, columns=col_name, index=all_indexes)
    return fingerprints, fails

In [30]:
def split_and_save(dataframe, section_n, name, random_state=None):
    """Shuffle ``dataframe``, fingerprint it in sections and write one CSV per section.

    Each output row holds the ``Bit_*`` columns returned by
    ``fingerprints_from_smiles`` for one molecule, followed by ``Mean_value``
    (mean of that row's non-missing measurement columns) and ``Num_values``
    (how many measurements went into the mean). Rows whose SMILES cannot be
    converted, or whose measurements cannot be cast to float, are left out.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``PUBCHEM_EXT_DATASOURCE_SMILES`` column; every other
        column is treated as a measurement.
    section_n : int
        Number of sections (and therefore output files).
    name : str
        Output prefix; section ``k`` is written to
        ``new_data/{name}part{k}.csv``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.
    """
    smiles_col = 'PUBCHEM_EXT_DATASOURCE_SMILES'
    shuffled_data = dataframe.sample(frac=1, random_state=random_state)

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes.
    position_chunks = np.array_split(np.arange(len(shuffled_data)), section_n)

    for XX, positions in enumerate(position_chunks):
        if XX % 25 == 0:
            print('section', XX)

        # A fresh 0..n-1 index gives every row a unique label, so the
        # fingerprint rows can be matched back to their source rows by label.
        section = shuffled_data.iloc[positions].reset_index(drop=True)
        measurements = section.drop(columns=[smiles_col])
        fingerprints, _ = fingerprints_from_smiles(section[smiles_col])

        # ``fingerprints`` only contains the SMILES that converted. Pairing
        # it with ``section`` by position (as before) shifted every row after
        # the first failure onto another molecule's measurements.
        kept, means, counts = [], [], []
        for label in fingerprints.index:
            try:
                values = measurements.loc[label].dropna().astype(float)
            except (TypeError, ValueError):
                print(f'missed on {XX} {label}')
                continue
            kept.append(label)
            means.append(values.mean())
            counts.append(len(values))

        # Build the section's table once instead of concatenating per row.
        res = fingerprints.loc[kept].copy()
        res['Mean_value'] = means
        res['Num_values'] = counts
        # The earlier per-row Series construction wrote every column as
        # float; keep that so existing readers see the same dtypes.
        res = res.astype(float)

        out_path = f'new_data/{name}part{XX}.csv'
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        # A section with no usable rows yields a header-only file rather
        # than crashing.
        res.to_csv(out_path, index=False)


In [31]:
def make_double_data(f1, f2, name):
    """Write fingerprint CSVs for molecules shared by two tables and for the rest.

    Parameters
    ----------
    f1, f2 : pandas.DataFrame
        Tables with a ``PUBCHEM_EXT_DATASOURCE_SMILES`` column plus
        measurement columns.
    name : str
        Output prefix of the form ``'<folder>/<file prefix>'``. The folder
        is created under ``new_data/`` if needed. Files are written by
        ``split_and_save`` with ``good_`` (SMILES present in both tables,
        10 sections) or ``regular_`` (SMILES occurring exactly once across
        both tables, 500 sections) appended to the prefix.
    """
    smiles_col = 'PUBCHEM_EXT_DATASOURCE_SMILES'

    folder_name = name.split('/')[0]
    # makedirs(exist_ok=True) lets the cell be re-run and also creates
    # 'new_data' itself when it does not exist yet.
    os.makedirs('new_data/' + folder_name, exist_ok=True)

    # Rows whose SMILES appears in both tables.
    merged_data = pd.merge(f1, f2, on=smiles_col, how='inner')

    # Rows whose SMILES appears exactly once across both tables. Note that
    # keep=False also removes SMILES repeated within a single table.
    # ignore_index avoids the overlapping index labels the two inputs would
    # otherwise contribute, which break label-based lookups downstream.
    not_in_merged = pd.concat([f1, f2], ignore_index=True).drop_duplicates(
        subset=smiles_col, keep=False
    )

    split_and_save(merged_data, 10, name+'good_')
    split_and_save(not_in_merged, 500, name+'regular_')

In [1]:
def get_noise_est(f1, f2):
    """Return the size of the SMILES inner join relative to the larger input.

    The two tables are inner-joined on ``PUBCHEM_EXT_DATASOURCE_SMILES`` and
    the number of joined rows is divided by ``max(len(f1), len(f2))``.

    Parameters
    ----------
    f1, f2 : pandas.DataFrame
        Tables containing a ``PUBCHEM_EXT_DATASOURCE_SMILES`` column.

    Returns
    -------
    float
        Joined row count over the larger input's row count; ``0.0`` when
        both inputs are empty. Because the join is many-to-many, SMILES
        repeated in both tables can push the value above 1.
    """
    largest = max(len(f1), len(f2))
    if largest == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0

    merged_data = pd.merge(f1, f2, on='PUBCHEM_EXT_DATASOURCE_SMILES', how='inner')
    return len(merged_data) / largest

In [15]:
# AID 652039 (one value column) against AID 686949 (three value columns).
# Rows labelled 0-3 are removed from each table before the columns are
# selected -- presumably non-data header rows of the export; confirm.
data1 = (
    pd.read_csv('raw/AID_652039_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', 'Inhibition at 7 uM']]

data2 = (
    pd.read_csv('raw/AID_686949_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data2 = data2.loc[:, [
    'PUBCHEM_EXT_DATASOURCE_SMILES',
    'Activation at 7 uM [1]',
    'Activation at 7 uM [2]',
    'Activation at 7 uM [3]',
]]

name = '652039/652039_and_686949_'

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)


(0.008771953580955734, 0.2506997387890302)

In [16]:
# AID 720582 (one value column) against AID 743254 (three value columns).
data1 = (
    pd.read_csv('raw/AID_720582_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', 'Inhibition at 7.0 uM']]

# NOTE(review): this table drops rows 0-4, one more than every other cell --
# confirm that the extra row is intentional.
data2 = (
    pd.read_csv('raw/AID_743254_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3, 4])
)
new_data2 = data2.loc[:, [
    'PUBCHEM_EXT_DATASOURCE_SMILES',
    'Inhibition at 7.7 uM [1]',
    'Inhibition at 7.7 uM [2]',
    'Inhibition at 7.7 uM [3]',
]]

name = '720582/720582_and_743254_'

# Earlier manual variants kept for reference; the first pair is what
# make_double_data does, the second shuffles an outer merge instead.
#merged_data = pd.merge(new_data, new_data2, on='PUBCHEM_EXT_DATASOURCE_SMILES', how='inner')
#not_in_merged = pd.concat([new_data, new_data2]).drop_duplicates(subset='PUBCHEM_EXT_DATASOURCE_SMILES', keep=False)
#split_and_save(merged_data, 10, name+'good_')
#split_and_save(not_in_merged, 500, name+'regular_')
#
#merged_data = pd.merge(new_data, new_data2, on='PUBCHEM_EXT_DATASOURCE_SMILES', how='outer')
#shuffled_data = merged_data.sample(frac=1)
#split_and_save(shuffled_data, 500, name)

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)

(0.0006346617118041677, 0.46185287213095794)

In [17]:
# AID 720704 (one value column) against AID 743261 (three value columns).
# Rows labelled 0-3 are removed from each table before column selection.
data1 = (
    pd.read_csv('raw/AID_720704_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', 'Inhibition at 12.2 uM']]

data2 = (
    pd.read_csv('raw/AID_743261_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data2 = data2.loc[:, [
    'PUBCHEM_EXT_DATASOURCE_SMILES',
    'Inhibition at 12.3 uM [1]',
    'Inhibition at 12.3 uM [2]',
    'Inhibition at 12.3 uM [3]',
]]

# Manual equivalent of make_double_data's two selections, kept for reference.
#merged_data = pd.merge(new_data, new_data2, on='PUBCHEM_EXT_DATASOURCE_SMILES', how='inner')
#not_in_merged = pd.concat([new_data, new_data2]).drop_duplicates(subset='PUBCHEM_EXT_DATASOURCE_SMILES', keep=False)

name = '720704/720704_and_743261_'

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)

(0.008825848826280937, 1.9752556824463614)

In [18]:
# scp1: AID 540281 against AID 493091, one value column each.
# Rows labelled 0-3 are removed from each table before column selection.
data1 = (
    pd.read_csv('raw/AID_540281_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', '%Inhibition at 20 uM']]

data2 = (
    pd.read_csv('raw/AID_493091_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data2 = data2.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', '%Inhibition at 20 uM']]

name = 'scp1/scp1_'

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)

(0.007215578610209164, 1.4272904122983125)

In [26]:
# gli_sufu: AID 602428 (three value columns) against AID 588413 (one).
# Rows labelled 0-3 are removed from each table before column selection.
data1 = (
    pd.read_csv('raw/AID_602428_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, [
    'PUBCHEM_EXT_DATASOURCE_SMILES',
    '%Activity at 5 uM_1',
    '%Activity at 5 uM_2',
    '%Activity at 5 uM_3',
]]

data2 = (
    pd.read_csv('raw/AID_588413_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data2 = data2.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', '%Activity at 5 uM']]

name = 'gli_sufu/gli_sufu_'

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)

(0.007304465973534972, 1.1039386486456777)

In [37]:
# Disabled cell: the whole body sits inside a string literal, so none of it
# runs; evaluating the cell only echoes the string back as its output.
# NOTE(review): the paths below use 'new_data/raw/' while the active cells
# read from 'raw/' -- reconcile before re-enabling.
"""
data1 = pd.read_csv('new_data/raw/AID_504383_datatable.csv', low_memory=False)
data1.drop([0,1,2,3], inplace=True)
new_data = data1[['PUBCHEM_EXT_DATASOURCE_SMILES', 'Activation at 7.2 uM']]

data2 = pd.read_csv('new_data/raw/AID_493008_datatable.csv', low_memory=False)
data2.drop([0,1,2,3], inplace=True)
new_data2 =data2[['PUBCHEM_EXT_DATASOURCE_SMILES', 'Activation at 7.2 uM']]

name = 'rtf/rtf_'

make_double_data(new_data, new_data2, name)
"""

"\ndata1 = pd.read_csv('new_data/raw/AID_504383_datatable.csv', low_memory=False)\ndata1.drop([0,1,2,3], inplace=True)\nnew_data = data1[['PUBCHEM_EXT_DATASOURCE_SMILES', 'Activation at 7.2 uM']]\n\ndata2 = pd.read_csv('new_data/raw/AID_493008_datatable.csv', low_memory=False)\ndata2.drop([0,1,2,3], inplace=True)\nnew_data2 =data2[['PUBCHEM_EXT_DATASOURCE_SMILES', 'Activation at 7.2 uM']]\n\nname = 'rtf/rtf_'\n\nmake_double_data(new_data, new_data2, name)\n"

In [27]:
# TARR1: AID 651783 (three value columns) against AID 624467 (one).
# Rows labelled 0-3 are removed from each table before column selection.
data1 = (
    pd.read_csv('raw/AID_651783_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, [
    'PUBCHEM_EXT_DATASOURCE_SMILES',
    'Activation at 3 uM [1]',
    'Activation at 3 uM [2]',
    'Activation at 3 uM [3]',
]]

data2 = (
    pd.read_csv('raw/AID_624467_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data2 = data2.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', 'Activation at 3 uM']]

name = 'TARR1/TARR1_'

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)

(0.006630345819657901, 0.34419904397697654)

In [28]:
# CRF-R2: AID 588473 (one value column) against AID 602473 (four).
# Rows labelled 0-3 are removed from each table before column selection.
data1 = (
    pd.read_csv('raw/AID_588473_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', '%Activity at 13.25 uM']]

data2 = (
    pd.read_csv('raw/AID_602473_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data2 = data2.loc[:, [
    'PUBCHEM_EXT_DATASOURCE_SMILES',
    '% Activity at 13.25 uM_first_point',
    '% Activity at 13.25 uM_second_point',
    '% Activity at 13.25 uM_third_point',
    '% Activity at 13.25 uM_fourth_point',
]]

name = 'CRF-R2/CRF-R2_'

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)

(0.006933625074462723, 0.3744111388714163)

In [29]:
# PAFAH1B2: AID 493034 against AID 492953, one value column each.
# Rows labelled 0-3 are removed from each table before column selection.
data1 = (
    pd.read_csv('raw/AID_493034_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', 'Inhibition at 3.39 uM']]

data2 = (
    pd.read_csv('raw/AID_492953_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data2 = data2.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', 'Inhibition at 3.39 uM']]

name = 'PAFAH1B2/PAFAH1B2_'

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)

(0.007225877225668558, 0.17265084974216424)

In [30]:
# PFG: AID 504690 (one value column) against AID 504753 (three).
# Rows labelled 0-3 are removed from each table before column selection.
data1 = (
    pd.read_csv('raw/AID_504690_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', '%Activity at 20 uM']]

data2 = (
    pd.read_csv('raw/AID_504753_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data2 = data2.loc[:, [
    'PUBCHEM_EXT_DATASOURCE_SMILES',
    '%Activity at 20 uM_1',
    '%Activity at 20 uM_2',
    '%Activity at 20 uM_3',
]]

name = 'PFG/PFG_'

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)

(0.007010951748155615, 0.8083632360326435)

In [31]:
# EBI2: AID 651636 (one value column) against AID 651997 (three).
# Rows labelled 0-3 are removed from each table before column selection.
data1 = (
    pd.read_csv('raw/AID_651636_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', '%Activity_Normalized at 5 uM']]

data2 = (
    pd.read_csv('raw/AID_651997_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data2 = data2.loc[:, [
    'PUBCHEM_EXT_DATASOURCE_SMILES',
    '%Activity at 5 uM_1',
    '%Activity at 5 uM_2',
    '%Activity at 5 uM_3',
]]

name = 'EBI2/EBI2_'

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)

(0.006755124008699281, 0.6593460841531141)

In [32]:
# DAX1: AID 652134 (three value columns) against AID 652010 (one).
# Rows labelled 0-3 are removed from each table before column selection.
data1 = (
    pd.read_csv('raw/AID_652134_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data = data1.loc[:, [
    'PUBCHEM_EXT_DATASOURCE_SMILES',
    'Inhibition at 6.8 uM [1]',
    'Inhibition at 6.8 uM [2]',
    'Inhibition at 6.8 uM [3]',
]]

data2 = (
    pd.read_csv('raw/AID_652010_datatable.csv', low_memory=False)
    .drop(index=[0, 1, 2, 3])
)
new_data2 = data2.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', 'Inhibition at 6.8 uM']]

# NOTE(review): unlike the other cells' prefixes this one has no trailing
# underscore -- confirm whether 'DAX1/DAX1_' was intended.
name = 'DAX1/DAX1'

# Uncomment to regenerate the fingerprint CSVs for this pair.
#make_double_data(new_data, new_data2, name)
get_noise_est(new_data, new_data2)

(0.006624328020690869, 0.6007273938346401)