In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
import pandas as pd
import numpy as np

import torch
from Smiles_Vector_Dataset import Smiles_Vector_Dataset

def save_dataset(df, dir, filename):
    smiles_list = df['SMILES'].tolist()
    vector_df = df.drop(['SMILES'], axis=1)
    vector_list = vector_df.values.tolist()
    vector_tensor = torch.tensor(vector_list)
    dataset = Smiles_Vector_Dataset(smiles_list, vector_tensor)
    torch.save(dataset, f"{dir}/{filename}.pt")

  from .autonotebook import tqdm as notebook_tqdm


## make smi

In [7]:
df_akt1 = pd.read_csv('./AKT1/AKT1_P31749.csv', header=0, sep=';')
df_akt1 = df_akt1.dropna(subset=['Smiles'])
df_aa2ar = pd.read_csv('./AA2AR/AA2AR_P29274.csv', header=0, sep=';')
df_aa2ar = df_aa2ar.dropna(subset=['Smiles'])

akt1_smiles = df_akt1['Smiles'].tolist()
print(f"len(akt1_smiles): {len(akt1_smiles)}")
aa2ar_smiles = df_aa2ar['Smiles'].tolist()
print(f"len(aa2ar_smiles): {len(aa2ar_smiles)}")

len(akt1_smiles): 5041
len(aa2ar_smiles): 9705


In [8]:
def extract_higher_MW(s_list: list) -> list:
    out = []
    for s in s_list:
        smis = s.split('.')
        mols = []
        for smi in smis:
            mol = Chem.MolFromSmiles(smi)
            if mol is not None:
                mols.append(mol)
        Mws = [Descriptors.MolWt(mol) for mol in mols]
        max_Mw_idx = Mws.index(max(Mws))
        out.append(Chem.MolToSmiles(mols[max_Mw_idx]))
    return out

def remove_dup(smis: list) -> list:
    smis = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smis]
    smis = list(set(smis))
    return smis

In [9]:
akt1_unique = remove_dup(extract_higher_MW(akt1_smiles))
print(f"len(akt1_smiles): {len(akt1_unique)}")

aa2ar_unique = remove_dup(extract_higher_MW(aa2ar_smiles))
print(f"len(aa2ar_smiles): {len(aa2ar_unique)}")

len(akt1_smiles): 4224
len(aa2ar_smiles): 7235


In [18]:
with open("./AKT1/AKT1_ligands.smi", 'w') as f:
    for smi in akt1_unique:
        f.write(smi + '\n')
with open("./AA2AR/AA2AR_ligands.smi", 'w') as f:
    for smi in aa2ar_unique:
        f.write(smi + '\n')

## make csv to calculate IEV

In [10]:
with open("./AKT1/AKT1_ligands.smi", 'r') as f:
    AKT1_ligands = f.read().splitlines()
with open("./AA2AR/AA2AR_ligands.smi", 'r') as f:
    AA2AR_ligands = f.read().splitlines()

In [11]:
print(f"len(AKT1_ligands): {len(AKT1_ligands)}")
print(f"len(AA2AR_ligands): {len(AA2AR_ligands)}")

len(AKT1_ligands): 4224
len(AA2AR_ligands): 7235


In [12]:
AKT1_names = ["AKT1_" + str(i) for i in range(len(AKT1_ligands))]
AA2AR_names = ["AA2AR_" + str(i) for i in range(len(AA2AR_ligands))]

df_AKT1 = pd.DataFrame({"SMILES": AKT1_ligands, "Name": AKT1_names})
df_AA2AR = pd.DataFrame({"SMILES": AA2AR_ligands, "Name": AA2AR_names})

In [13]:
import pandas as pd
import math

def split_dataframe(df):
    out = []
    
    # DataFrameの行数を取得
    total_rows = len(df)
    
    # 各分割の行数を計算（切り上げ）
    rows_per_split = math.ceil(total_rows / 3)
    
    for i in range(3):
        # 開始行と終了行を計算
        start_row = i * rows_per_split
        end_row = min((i + 1) * rows_per_split, total_rows)
        
        # DataFrameを分割
        split_df = df.iloc[start_row:end_row]
        
        # 分割したDataFrameをリストに追加
        out.append(split_df)
        
        print(f"Split {i+1} contains {len(split_df)} rows")
    
    return out

# DataFrameを分割
split_dfs_AKT1 = split_dataframe(df_AKT1)
split_df_AA2AR = split_dataframe(df_AA2AR)
print(f"len(split_dfs_AKT1): {len(split_dfs_AKT1)}")
print(f"len(split_df_AA2AR): {len(split_df_AA2AR)}")

Split 1 contains 1408 rows
Split 2 contains 1408 rows
Split 3 contains 1408 rows
Split 1 contains 2412 rows
Split 2 contains 2412 rows
Split 3 contains 2411 rows
len(split_dfs_AKT1): 3
len(split_df_AA2AR): 3


In [14]:
for i, split_df in enumerate(split_dfs_AKT1):
    split_df.to_csv(f"./AKT1/AKT1_ligands_{i}.csv", index=False, header=None)

for i, split_df in enumerate(split_df_AA2AR):
    split_df.to_csv(f"./AA2AR/AA2AR_ligands_{i}.csv", index=False, header=None)

## IEV

In [87]:
df_akt1_0 = pd.read_csv('./AKT1/AKT1_glide_HTVS_0_pv.interaction', header=None)
df_akt1_1 = pd.read_csv('./AKT1/AKT1_glide_HTVS_1_pv.interaction', header=None)
df_akt1_2 = pd.read_csv('./AKT1/AKT1_glide_HTVS_2_pv.interaction', header=None)
df_aa2ar_0 = pd.read_csv('./AA2AR/AA2AR_glide_HTVS_0_pv.interaction', header=None)
df_aa2ar_1 = pd.read_csv('./AA2AR/AA2AR_glide_HTVS_1_pv.interaction', header=None)
df_aa2ar_2 = pd.read_csv('./AA2AR/AA2AR_glide_HTVS_2_pv.interaction', header=None)
df_akt1 = pd.concat([df_akt1_0, df_akt1_1, df_akt1_2], axis=0)
df_aa2ar = pd.concat([df_aa2ar_0, df_aa2ar_1, df_aa2ar_2], axis=0)
print(f"len(df_akt1): {len(df_akt1)}")
print(f"len(df_aa2ar): {len(df_aa2ar)}")
df_akt1.to_csv('./AKT1/AKT1.interaction', index=False, header=None)
df_aa2ar.to_csv('./AA2AR/AA2AR.interaction', index=False, header=None)

len(df_akt1): 3993
len(df_aa2ar): 6789


## Make csv

In [2]:
df_akt1 = pd.read_csv('./AKT1/AKT1.interaction', header=0)
df_aa2ar = pd.read_csv('./AA2AR/AA2AR.interaction', header=0)

In [3]:
print(df_akt1.columns)
print(df_aa2ar.columns)

Index(['# title', 'ishit', 'A1018_vdw', 'A1018_coul', 'A1018_hbond',
       'A1037_vdw', 'A1037_coul', 'A1037_hbond', 'A1125_vdw', 'A1125_coul',
       ...
       'A442_vdw', 'A442_coul', 'A442_hbond', 'A443_vdw', 'A443_coul',
       'A443_hbond', 'C4_vdw', 'C4_coul', 'C4_hbond', 'docking_score'],
      dtype='object', length=210)
Index(['# title', 'ishit', 'A13_vdw', 'A13_coul', 'A13_hbond', 'A166_vdw',
       'A166_coul', 'A166_hbond', 'A167_vdw', 'A167_coul',
       ...
       'A84_vdw', 'A84_coul', 'A84_hbond', 'A85_vdw', 'A85_coul', 'A85_hbond',
       'A9_vdw', 'A9_coul', 'A9_hbond', 'docking_score'],
      dtype='object', length=180)


In [4]:
df_akt1.head()

Unnamed: 0,# title,ishit,A1018_vdw,A1018_coul,A1018_hbond,A1037_vdw,A1037_coul,A1037_hbond,A1125_vdw,A1125_coul,...,A442_vdw,A442_coul,A442_hbond,A443_vdw,A443_coul,A443_hbond,C4_vdw,C4_coul,C4_hbond,docking_score
0,AKT1_297,0,-0.13728450192277,0.0322838574273127,0.0,-0.0189094745673179,0.0955596191647923,0.0,-0.0084480101599724,-0.0176341707926451,...,0.621945652875025,0.569113123755522,0.0,-0.0658816645849666,-0.178330784320368,0.0,-1.27329841866409,2.06700402386047,0.0,-9.61026643467343
1,AKT1_314,0,-0.129914908574693,-0.146396155858314,0.0,-0.0214403182138179,-0.0264354793776174,0.0,-0.0068402100182805,0.0140130338754866,...,-1.86567944324785,-0.133552725775469,0.0,-0.0533777561205294,0.0082226462431368,0.0,2.66619781014919,-3.42213306883725,0.0,-9.50375531019234
2,AKT1_1371,0,-0.170075161500198,-0.610497524933297,0.0,-0.0266595736496788,0.125572595497559,0.0,-0.0059255729246144,-0.0480748153356219,...,-0.236998583222084,0.0608721985723467,0.0,-0.0347315024813036,0.134401652451795,0.0,-0.483372914778023,5.05065430498266,0.0,-9.26730455712028
3,AKT1_244,0,-0.0368625690508159,0.026848881375273,0.0,-0.0117804192539943,0.0184802476467823,0.0,-0.0064556491701388,0.0065144103289358,...,-1.28612661944228,-0.0292849459682985,0.0,-0.0435041683639845,-0.0164487492669711,0.0,-2.10108937230945,0.0108763495783885,0.0,-9.14862271207176
4,AKT1_1340,0,-0.0867239608453964,0.0898913254306153,0.0,-0.0222173704935967,-0.03991105014605,0.0,-0.0063821746551673,-0.0013650529829838,...,-0.435044870788673,0.0321951819899347,0.0,-0.0425843109942698,-0.0962949591133874,0.0,-1.25032489900673,0.17750218492442,0.0,-9.07829806967908


In [115]:
df_akt1_name_smiles = pd.read_csv('./AKT1/AKT1_ligands.csv', header=None, names=['SMILES', 'Name'])
df_aa2ar_name_smiles = pd.read_csv('./AA2AR/AA2AR_ligands.csv', header=None, names=['SMILES', 'Name'])
df_akt1_name_smiles.head()

Unnamed: 0,SMILES,Name
0,COCc1nnn(-c2nonc2N)c1C(=O)NN=C(C)c1ccc(O)cc1O,AKT1_0
1,COc1ccc(-c2ccc(NC(=O)Nc3ccc(F)c(C)c3)cc2)c2c(N...,AKT1_1
2,Cc1n[nH]c2cccc(-c3ccc(NC(=O)Nc4cccc(C(F)(F)F)c...,AKT1_2
3,CC1Cc2c(ccc(F)c2Cl)N1C(=O)Cc1nc(N2CCOCC2)cc(=O...,AKT1_3
4,CCCCCCCCCCCCc1ccc(S(=O)(=O)Nc2nnc(C(=O)OCC)s2)cc1,AKT1_4


In [116]:
for i, row in df_akt1.iterrows():
    name = row['# title']
    df_ = df_akt1_name_smiles[df_akt1_name_smiles['Name'] == name]    
    if len(df_) > 0:
        df_akt1.at[i, 'SMILES'] = df_.iloc[0]['SMILES']
df_akt1.head()

Unnamed: 0,# title,ishit,A1018_vdw,A1018_coul,A1018_hbond,A1037_vdw,A1037_coul,A1037_hbond,A1125_vdw,A1125_coul,...,A442_coul,A442_hbond,A443_vdw,A443_coul,A443_hbond,C4_vdw,C4_coul,C4_hbond,docking_score,SMILES
0,AKT1_297,0,-0.13728450192277,0.0322838574273127,0.0,-0.0189094745673179,0.0955596191647923,0.0,-0.0084480101599724,-0.0176341707926451,...,0.569113123755522,0.0,-0.0658816645849666,-0.178330784320368,0.0,-1.27329841866409,2.06700402386047,0.0,-9.61026643467343,CN(C)C(=O)CC(NC(=O)C1(N)CCN(c2ncnc3[nH]ccc23)C...
1,AKT1_314,0,-0.129914908574693,-0.146396155858314,0.0,-0.0214403182138179,-0.0264354793776174,0.0,-0.0068402100182805,0.0140130338754866,...,-0.133552725775469,0.0,-0.0533777561205294,0.0082226462431368,0.0,2.66619781014919,-3.42213306883725,0.0,-9.50375531019234,CN1CCN(C(C(=O)N2CCN(c3ncnc4[nH]cc(Br)c34)CC2)c...
2,AKT1_1371,0,-0.170075161500198,-0.610497524933297,0.0,-0.0266595736496788,0.125572595497559,0.0,-0.0059255729246144,-0.0480748153356219,...,0.0608721985723467,0.0,-0.0347315024813036,0.134401652451795,0.0,-0.483372914778023,5.05065430498266,0.0,-9.26730455712028,Cc1cc(C(=O)NC(CN)c2ccccc2)sc1-c1ccnc2[nH]ccc12
3,AKT1_244,0,-0.0368625690508159,0.026848881375273,0.0,-0.0117804192539943,0.0184802476467823,0.0,-0.0064556491701388,0.0065144103289358,...,-0.0292849459682985,0.0,-0.0435041683639845,-0.0164487492669711,0.0,-2.10108937230945,0.0108763495783885,0.0,-9.14862271207176,N[C@]1(CNC(=O)c2ccc(F)cc2)CCN(c2ncnc3[nH]cc(Cl...
4,AKT1_1340,0,-0.0867239608453964,0.0898913254306153,0.0,-0.0222173704935967,-0.03991105014605,0.0,-0.0063821746551673,-0.0013650529829838,...,0.0321951819899347,0.0,-0.0425843109942698,-0.0962949591133874,0.0,-1.25032489900673,0.17750218492442,0.0,-9.07829806967908,CC(C)c1ccc(C[C@@H](CN)C(=O)N2CCN(c3ncnc4[nH]cc...


In [117]:
for i, row in df_aa2ar.iterrows():
    name = row['# title']
    df_ = df_aa2ar_name_smiles[df_aa2ar_name_smiles['Name'] == name]    
    if len(df_) > 0:
        df_aa2ar.at[i, 'SMILES'] = df_.iloc[0]['SMILES']
df_aa2ar.head()

Unnamed: 0,# title,ishit,A13_vdw,A13_coul,A13_hbond,A166_vdw,A166_coul,A166_hbond,A167_vdw,A167_coul,...,A84_coul,A84_hbond,A85_vdw,A85_coul,A85_hbond,A9_vdw,A9_coul,A9_hbond,docking_score,SMILES
0,AA2AR_530,0,-0.0435500679066205,1.33029542456646,0.0,-0.21679676064461,0.222229310302139,0.0,-1.49425212149579,-0.305202166829867,...,-0.194030379756894,0.0,-0.356032736044833,-0.0585184441140454,0.0,-0.130243120900351,0.0525348037184132,0.0,-11.0660644312767,COc1cc(O)ccc1-c1cn2c(=O)n(-c3ccccc3)nc2c(N)n1
1,AA2AR_1601,0,-0.0433599490383122,1.09189356416712,0.0,-0.219351822402548,-0.0507082393278326,0.0,-1.34242202906572,-0.459084474723644,...,-0.202900809426255,0.0,0.602484486287843,-0.0485865807098461,0.0,-0.131435554142422,0.0366099811342904,0.0,-10.9341145105649,Nc1ccc(-c2cn3c(=O)n(-c4ccccc4)nc3c(N)n2)cc1
2,AA2AR_1159,0,-0.047898220214112,1.25253897959972,0.0,-0.278654666312648,0.0568238245918767,0.0,-0.885333808064984,-0.522604289354508,...,-0.0759710205250027,0.0,-0.217641232426314,-0.0689483775816858,0.0,-0.147860229347515,0.100559504229446,0.0,-10.9088592319435,Nc1ccc(-n2nc3c(N)nc(-c4ccccc4)cn3c2=O)cc1
3,AA2AR_1982,0,-0.140958755312277,0.645056564350114,0.0,-0.101560598740034,0.0812022728512635,0.0,-0.353088366462701,-0.0742654789510829,...,-0.228566330387575,0.0,-0.777063702767705,-0.0552144517016571,0.0,-0.177916191143004,0.123277907992996,0.0,-10.9058198835224,Nc1nc2cnn(CCc3ccccc3)c2c2nc(-c3ccco3)nn12
4,AA2AR_1596,0,-0.0478374677468686,1.4135561936587,0.0,-0.252406373447571,0.191062877895034,0.0,-1.83590448670065,-0.327011444098417,...,-0.0752634709149053,0.0,0.0122406182083924,-0.0753203820914461,0.0,-0.147128026104262,0.13733381580426,0.0,-10.8899171929138,COc1ccc(-n2nc3c(N)nc(-c4ccccc4)cn3c2=O)cc1


In [118]:
# drop if df['SMILES'] is NaN
len_akt1 = len(df_akt1)
len_aa2ar = len(df_aa2ar)
df_akt1 = df_akt1.dropna(subset=['SMILES'])
df_aa2ar = df_aa2ar.dropna(subset=['SMILES'])
print(f"len_akt1: {len_akt1} -> {len(df_akt1)}")
print(f"len_aa2ar: {len_aa2ar} -> {len(df_aa2ar)}")

df_akt1 = df_akt1.drop(['# title','ishit', 'docking_score'], axis=1)
df_aa2ar = df_aa2ar.drop(['# title','ishit', 'docking_score'], axis=1)

df_akt1.to_csv('./AKT1/AKT1_data.csv', index=False)
df_aa2ar.to_csv('./AA2AR/AA2AR_data.csv', index=False)

len_akt1: 3992 -> 3990
len_aa2ar: 6788 -> 6786


## Make Dataset

In [51]:
df_akt1 = pd.read_csv('./AKT1/AKT1_data.csv', header=0)
df_aa2ar = pd.read_csv('./AA2AR/AA2AR_data.csv', header=0)

In [52]:
# drop if df['SMILES'] duplicates
len1 = len(df_akt1)
df_akt1 = df_akt1.drop_duplicates(subset=['SMILES'])
len2 = len(df_akt1)
print(f'AKT1: {len1} -> {len2}')

len1 = len(df_aa2ar)
df_aa2ar = df_aa2ar.drop_duplicates(subset=['SMILES'])
len2 = len(df_aa2ar)
print(f'AA2AR: {len1} -> {len2}')


AKT1: 3990 -> 3576
AA2AR: 6786 -> 6640


### k-means

In [53]:
# AKT1

fps = []
for smi in df_akt1['SMILES']:
    mol = Chem.MolFromSmiles(smi)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024)
    fps.append(fp)
df_akt1['fingerprint'] = fps
X = np.array(fps)

# k-means clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=100, random_state=0).fit(X)
df_akt1['cluster_id'] = kmeans.labels_

# それぞれのクラスタから、中心となる構造を1つずつ抽出
cluster_centers = kmeans.cluster_centers_
cluster_centers = [list(x) for x in cluster_centers]

# 各クラスタから、cluster_centers に最も近いrowを抽出
closest_points = []
for i, center in enumerate(cluster_centers):
    data = df_akt1[df_akt1['cluster_id'] == i]
    fingerprints = np.array(list(data['fingerprint']))
    distances = np.linalg.norm(fingerprints - center, axis=1)
    closest_point_idx = np.argmin(distances)
    closest_point_smi = data.iloc[closest_point_idx]['SMILES']
    closest_points.append(closest_point_smi)

# remove smiles in closest points
df_akt1_train = df_akt1[~df_akt1['SMILES'].isin(closest_points)]
print(f"train data: {len(df_akt1_train)}")
df_akt1_test = df_akt1[df_akt1['SMILES'].isin(closest_points)]
print(f"test data: {len(df_akt1_test)}")

train data: 3476
test data: 100


In [47]:
# AA2AR

fps = []
for smi in df_aa2ar['SMILES']:
    mol = Chem.MolFromSmiles(smi)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024)
    fps.append(fp)
df_aa2ar['fingerprint'] = fps
X = np.array(fps)

# k-means clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=100, random_state=0).fit(X)
df_aa2ar['cluster_id'] = kmeans.labels_

# それぞれのクラスタから、中心となる構造を1つずつ抽出
cluster_centers = kmeans.cluster_centers_
cluster_centers = [list(x) for x in cluster_centers]

# 各クラスタから、cluster_centers に最も近いrowを抽出
closest_points = []
for i, center in enumerate(cluster_centers):
    data = df_aa2ar[df_aa2ar['cluster_id'] == i]
    fingerprints = np.array(list(data['fingerprint']))
    distances = np.linalg.norm(fingerprints - center, axis=1)
    closest_point_idx = np.argmin(distances)
    closest_point_smi = data.iloc[closest_point_idx]['SMILES']
    closest_points.append(closest_point_smi)

# remove smiles in closest points
df_aa2ar_train = df_aa2ar[~df_aa2ar['SMILES'].isin(closest_points)]
print(f"train data: {len(df_aa2ar_train)}")
df_aa2ar_test = df_aa2ar[df_aa2ar['SMILES'].isin(closest_points)]
print(f"test data: {len(df_aa2ar_test)}")

train data: 6540
test data: 100


In [55]:
df_akt1_train = df_akt1_train.drop(['fingerprint', 'cluster_id'], axis=1)
df_aa2ar_train = df_aa2ar_train.drop(['fingerprint', 'cluster_id'], axis=1)
df_akt1_test = df_akt1_test.drop(['fingerprint', 'cluster_id'], axis=1)
df_aa2ar_test = df_aa2ar_test.drop(['fingerprint', 'cluster_id'], axis=1)
df_akt1_train.head()

Unnamed: 0,A1018_vdw,A1018_coul,A1018_hbond,A1037_vdw,A1037_coul,A1037_hbond,A1125_vdw,A1125_coul,A1125_hbond,A154_vdw,...,A442_vdw,A442_coul,A442_hbond,A443_vdw,A443_coul,A443_hbond,C4_vdw,C4_coul,C4_hbond,SMILES
0,-0.137285,0.032284,0.0,-0.018909,0.09556,0.0,-0.008448,-0.017634,0.0,-0.069575,...,0.621946,0.569113,0.0,-0.065882,-0.178331,0.0,-1.273298,2.067004,0.0,CN(C)C(=O)CC(NC(=O)C1(N)CCN(c2ncnc3[nH]ccc23)C...
1,-0.129915,-0.146396,0.0,-0.02144,-0.026435,0.0,-0.00684,0.014013,0.0,-0.063079,...,-1.865679,-0.133553,0.0,-0.053378,0.008223,0.0,2.666198,-3.422133,0.0,CN1CCN(C(C(=O)N2CCN(c3ncnc4[nH]cc(Br)c34)CC2)c...
2,-0.170075,-0.610498,0.0,-0.02666,0.125573,0.0,-0.005926,-0.048075,0.0,-0.050247,...,-0.236999,0.060872,0.0,-0.034732,0.134402,0.0,-0.483373,5.050654,0.0,Cc1cc(C(=O)NC(CN)c2ccccc2)sc1-c1ccnc2[nH]ccc12
3,-0.036863,0.026849,0.0,-0.01178,0.01848,0.0,-0.006456,0.006514,0.0,-0.057571,...,-1.286127,-0.029285,0.0,-0.043504,-0.016449,0.0,-2.101089,0.010876,0.0,N[C@]1(CNC(=O)c2ccc(F)cc2)CCN(c2ncnc3[nH]cc(Cl...
4,-0.086724,0.089891,0.0,-0.022217,-0.039911,0.0,-0.006382,-0.001365,0.0,-0.055296,...,-0.435045,0.032195,0.0,-0.042584,-0.096295,0.0,-1.250325,0.177502,0.0,CC(C)c1ccc(C[C@@H](CN)C(=O)N2CCN(c3ncnc4[nH]cc...


In [57]:
save_dataset(df_akt1_train, './AKT1', 'AKT1_train')
save_dataset(df_akt1_test, './AKT1', 'AKT1_test')
save_dataset(df_aa2ar_train, './AA2AR', 'AA2AR_train')
save_dataset(df_aa2ar_test, './AA2AR', 'AA2AR_test')

### DRD2

In [70]:
drd2_train = torch.load('../data/drd2_train_dataset_no_dot.pt')
drd2_test = torch.load('../data/drd2_test_dataset_no_dot.pt')
smis, ievs = [], []
for smi, iev in drd2_train:
    smis.append(smi)
    ievs.append(iev.tolist())
for smi, iev in drd2_test:
    smis.append(smi)
    ievs.append(iev.tolist())
df_drd2 = pd.DataFrame({"SMILES": smis, "IEV": ievs})
print(f'len(df_drd2): {len(df_drd2)}')
df_drd2.head()

len(df_drd2): 8350


Unnamed: 0,SMILES,IEV
0,NC(=O)c1ccc2c(c1)CCOC2CCN1CC=C(c2c[nH]c3cc(F)c...,"[-2.9402904767574, 0.23517850742004, 0.0, -0.0..."
1,Cn1c(=O)c2c(nc3n(CCCCN4CCN(c5ccccc5O)CC4)c(-c4...,"[-3.32895210342961, 0.175525895566323, 0.0, -0..."
2,COc1ccc2cc(S(=O)(=O)N[C@@H]3CCN(CCCc4noc5ccccc...,"[-2.76044333354232, -0.221698177198241, 0.0, -..."
3,NC(=O)c1ccc(S(=O)(=O)C2CCN(CCc3ccc(F)cc3F)CC2)cc1,"[-1.70916884179555, -0.0399866746911523, 0.0, ..."
4,O=C1CCc2cc(F)ccc2N1CCCN1CCC(n2c(=O)[nH]c3ccccc...,"[-3.17040813384477, 0.360624468886975, 0.0, -0..."


In [71]:
# DRD2

fps = []
for smi in df_drd2['SMILES']:
    mol = Chem.MolFromSmiles(smi)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024)
    fps.append(fp)
df_drd2['fingerprint'] = fps
X = np.array(fps)

# k-means clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=100, random_state=0).fit(X)
df_drd2['cluster_id'] = kmeans.labels_

# それぞれのクラスタから、中心となる構造を1つずつ抽出
cluster_centers = kmeans.cluster_centers_
cluster_centers = [list(x) for x in cluster_centers]

# 各クラスタから、cluster_centers に最も近いrowを抽出
closest_points = []
for i, center in enumerate(cluster_centers):
    data = df_drd2[df_drd2['cluster_id'] == i]
    fingerprints = np.array(list(data['fingerprint']))
    distances = np.linalg.norm(fingerprints - center, axis=1)
    closest_point_idx = np.argmin(distances)
    closest_point_smi = data.iloc[closest_point_idx]['SMILES']
    closest_points.append(closest_point_smi)

# remove smiles in closest points
df_drd2_train = df_drd2[~df_drd2['SMILES'].isin(closest_points)]
print(f"train data: {len(df_drd2_train)}")
df_drd2_test = df_drd2[df_drd2['SMILES'].isin(closest_points)]
print(f"test data: {len(df_drd2_test)}")

train data: 8250
test data: 100


In [72]:
df_drd2_train = df_drd2_train.drop(['fingerprint', 'cluster_id'], axis=1)
df_drd2_test = df_drd2_test.drop(['fingerprint', 'cluster_id'], axis=1)
df_drd2_test.head()

Unnamed: 0,SMILES,IEV
53,O=C1CCc2ccccc2N1CCCCN1CCN(c2ccccc2)CC1,"[-2.48645256988325, -0.14970634049982, 0.0, -0..."
55,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,"[-1.08370736426092, 0.168907235016713, 0.0, -0..."
243,CCCN(CCCCNC(=O)c1ccc2ccccc2c1)CC1CC1c1ccc(F)cc1,"[-2.15803026328789, -0.0489480768486949, 0.0, ..."
269,O=c1[nH]c2cccc(N3CCN(CC4CCc5[nH]c6ccc(F)cc6c5C...,"[-2.08320944068261, -0.170275148167311, 0.0, -..."
396,NC1CCc2c(O)cccc2C1,"[-0.013795854924049, 0.0213154603645741, 0.0, ..."


In [80]:
smiles_list = df_drd2_train['SMILES'].tolist()
vector_df = df_drd2_train['IEV']
vector_list = vector_df.tolist()
vector_tensor = torch.tensor(vector_list)
dataset = Smiles_Vector_Dataset(smiles_list, vector_tensor)
torch.save(dataset, './DRD2/DRD2_train.pt')

smiles_list = df_drd2_test['SMILES'].tolist()
vector_df = df_drd2_test['IEV']
vector_list = vector_df.tolist()
vector_tensor = torch.tensor(vector_list)
dataset = Smiles_Vector_Dataset(smiles_list, vector_tensor)
torch.save(dataset, './DRD2/DRD2_test.pt')

In [83]:
drd2_train = torch.load('./DRD2/DRD2_train.pt')
akt1_train = torch.load('./AKT1/AKT1_train.pt')
aa2ar_train = torch.load('./AA2AR/AA2AR_train.pt')

drd2_test = torch.load('./DRD2/DRD2_test.pt')
akt1_test = torch.load('./AKT1/AKT1_test.pt')
aa2ar_test = torch.load('./AA2AR/AA2AR_test.pt')

print(f'len(drd2_train): {len(drd2_train)}')
print(f'len(akt1_train): {len(akt1_train)}')
print(f'len(aa2ar_train): {len(aa2ar_train)}')
print(f'len(drd2_test): {len(drd2_test)}')
print(f'len(akt1_test): {len(akt1_test)}')
print(f'len(aa2ar_test): {len(aa2ar_test)}')

len(drd2_train): 8250
len(akt1_train): 3476
len(aa2ar_train): 6540
len(drd2_test): 100
len(akt1_test): 100
len(aa2ar_test): 100


In [9]:
# random split data into train, validation, and test
import numpy as np

np.random.seed(0)
idx = np.random.permutation(len(df_akt1))
df_akt1 = df_akt1.iloc[idx]
df_akt1_train = df_akt1.iloc[:int(0.8*len(df_akt1))]
df_akt1_valid = df_akt1.iloc[int(0.8*len(df_akt1)):int(0.9*len(df_akt1))]
df_akt1_test = df_akt1.iloc[int(0.9*len(df_akt1)):]

idx = np.random.permutation(len(df_aa2ar))
df_aa2ar = df_aa2ar.iloc[idx]
df_aa2ar_train = df_aa2ar.iloc[:int(0.8*len(df_aa2ar))]
df_aa2ar_valid = df_aa2ar.iloc[int(0.8*len(df_aa2ar)):int(0.9*len(df_aa2ar))]
df_aa2ar_test = df_aa2ar.iloc[int(0.9*len(df_aa2ar)):]

In [15]:
import numpy as np

np.random.seed(0)
idx = np.random.permutation(len(df_akt1))
df_akt1 = df_akt1.iloc[idx]
df_akt1_train_extend = df_akt1.iloc[:int(len(df_akt1)-100)]
df_akt1_test_extend = df_akt1.iloc[int(len(df_akt1)-100):]
print(f'len(df_akt1_train_extend): {len(df_akt1_train_extend)}')
print(f'len(df_akt1_test_extend): {len(df_akt1_test_extend)}')

idx = np.random.permutation(len(df_aa2ar))
df_aa2ar = df_aa2ar.iloc[idx]
df_aa2ar_train_extend = df_aa2ar.iloc[:int(len(df_aa2ar)-100)]
df_aa2ar_test_extend = df_aa2ar.iloc[int(len(df_aa2ar)-100):]
print(f'len(df_aa2ar_train_extend): {len(df_aa2ar_train_extend)}')
print(f'len(df_aa2ar_test_extend): {len(df_aa2ar_test_extend)}')



len(df_akt1_train_extend): 3890
len(df_akt1_test_extend): 100
len(df_aa2ar_train_extend): 6686
len(df_aa2ar_test_extend): 100


In [17]:
save_dataset(df_akt1_train, './AKT1', 'AKT1_train')
save_dataset(df_akt1_valid, './AKT1', 'AKT1_valid')
save_dataset(df_akt1_test, './AKT1', 'AKT1_test')
save_dataset(df_aa2ar_train, './AA2AR', 'AA2AR_train')
save_dataset(df_aa2ar_valid, './AA2AR', 'AA2AR_valid')
save_dataset(df_aa2ar_test, './AA2AR', 'AA2AR_test')

save_dataset(df_akt1_train_extend, './AKT1', 'AKT1_train_extend')
save_dataset(df_akt1_test_extend, './AKT1', 'AKT1_test_extend')
save_dataset(df_aa2ar_train_extend, './AA2AR', 'AA2AR_train_extend')
save_dataset(df_aa2ar_test_extend, './AA2AR', 'AA2AR_test_extend')

In [13]:
df = df_akt1_train
smiles_list = df['SMILES'].tolist()

In [18]:
vector_df = df.drop(['SMILES'], axis=1)
vector_list = vector_df.values.tolist()
vector_tensor = torch.tensor(vector_list)
vector_tensor.size()

torch.Size([3192, 207])

In [5]:
train = torch.load('./DRD2/DRD2_train.pt')
test = torch.load('./DRD2/DRD2_test.pt')

train_smi = []
for smi, _ in train:
    train_smi.append(smi)
test_smi = []
for smi, _ in test:
    test_smi.append(smi)

with open('./DRD2/DRD2_train.smi', 'w') as f:
    for smi in train_smi:
        f.write(smi + '\n')
with open('./DRD2/DRD2_test.smi', 'w') as f:
    for smi in test_smi:
        f.write(smi + '\n')

In [6]:
train = torch.load('./AKT1/AKT1_train.pt')
test = torch.load('./AKT1/AKT1_test.pt')

train_smi = []
for smi, _ in train:
    train_smi.append(smi)
test_smi = []
for smi, _ in test:
    test_smi.append(smi)

with open('./AKT1/AKT1_train.smi', 'w') as f:
    for smi in train_smi:
        f.write(smi + '\n')
with open('./AKT1/AKT1_test.smi', 'w') as f:
    for smi in test_smi:
        f.write(smi + '\n')

train = torch.load('./AA2AR/AA2AR_train.pt')
test = torch.load('./AA2AR/AA2AR_test.pt')

train_smi = []
for smi, _ in train:
    train_smi.append(smi)
test_smi = []
for smi, _ in test:
    test_smi.append(smi)

with open('./AA2AR/AA2AR_train.smi', 'w') as f:
    for smi in train_smi:
        f.write(smi + '\n')

with open('./AA2AR/AA2AR_test.smi', 'w') as f:
    for smi in test_smi:
        f.write(smi + '\n')


## Make sdf

In [3]:
def make_csv(train, test, protein):    
    len_train = len(train)
    len_test = len(test)
    names_train = [f'{protein}_{i}' for i in range(len_train)]
    df_train = pd.DataFrame({"SMILES": train, "Name": names_train})
    names_test = [f'{protein}_{i}' for i in range(len_test)]
    df_test = pd.DataFrame({"SMILES": test, "Name": names_test})
    df_train.to_csv(f'./{protein}/{protein}_train.csv', index=False)
    df_test.to_csv(f'./{protein}/{protein}_test.csv', index=False)

with open('./AKT1/AKT1_train.smi', 'r') as f:
    train_smi = f.read().splitlines()
with open('./AKT1/AKT1_test.smi', 'r') as f:
    test_smi = f.read().splitlines()
make_csv(train_smi, test_smi, 'AKT1')

with open('./AA2AR/AA2AR_train.smi', 'r') as f:
    train_smi = f.read().splitlines()
with open('./AA2AR/AA2AR_test.smi', 'r') as f:
    test_smi = f.read().splitlines()
make_csv(train_smi, test_smi, 'AA2AR')

with open('./DRD2/DRD2_train.smi', 'r') as f:
    train_smi = f.read().splitlines()
with open('./DRD2/DRD2_test.smi', 'r') as f:
    test_smi = f.read().splitlines()
make_csv(train_smi, test_smi, 'DRD2')

In [5]:
test_DRD2 = torch.load('./DRD2/DRD2_test.pt')
smis = []
for smi, _ in test_DRD2:
    smis.append(smi)
smis_set = list(set(smis))
print(len(smis_set))

100


## extract test names

In [4]:
df_akt1_name_smiles = pd.read_csv('./AKT1/AKT1_ligands.csv', header=None, names=['SMILES', 'Name'])
df_aa2ar_name_smiles = pd.read_csv('./AA2AR/AA2AR_ligands.csv', header=None, names=['SMILES', 'Name'])

In [5]:
with open('./AKT1/AKT1_test.smi', 'r') as f:
    AKT1_test = f.read().splitlines()

with open('./AA2AR/AA2AR_test.smi', 'r') as f:
    AA2AR_test = f.read().splitlines()

In [6]:
akt1_names, aa2ar_names = [], []

for smi in AKT1_test:
    df_ = df_akt1_name_smiles[df_akt1_name_smiles['SMILES'] == smi]
    if len(df_) > 0:
        akt1_names.append(df_.iloc[0]['Name'])
print(len(akt1_names))

for smi in AA2AR_test:
    df_ = df_aa2ar_name_smiles[df_aa2ar_name_smiles['SMILES'] == smi]
    if len(df_) > 0:
        aa2ar_names.append(df_.iloc[0]['Name'])
print(len(aa2ar_names))

100
100


In [8]:
with open('./AKT1/AKT1_test_name.smi', 'w') as f:
    for name in akt1_names:
        f.write(name + '\n')
with open('./AA2AR/AA2AR_test_name.smi', 'w') as f:
    for name in aa2ar_names:
        f.write(name + '\n')

In [12]:
for i in range(1, 500):
    if 7377 % i == 1:
        max = i
print(f"njobs:{max}")

njobs:461


In [13]:
# Load sdf file
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw

suppl = Chem.SDMolSupplier('./AKT1/glide_AKT1_pv.sdf')

#count mols
mols = [mol for mol in suppl]
print(len(mols))

4609
