<a href="https://colab.research.google.com/github/keinerfan/CIAP1/blob/CODES/screening_iap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install rdkit
# https://www.rdkit.org/
#https://github.com/rdkit/rdkit
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

# https://pandas.pydata.org
import pandas as pd

# https://numpy.org/doc/stable/release.html
import numpy as np

#https://github.com/mordred-descriptor/mordred
#from mordred import Calculator, descriptors

Collecting rdkit
  Downloading rdkit-2024.3.3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.3-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.3


# **reading txt file **

In [5]:
def read_smiles_from_file(file_path):

    # Read the file into a DataFrame with no header
    df = pd.read_csv(file_path, header=None, names=['line'])
    # Remove the leading index (assumed to be followed by a tab) from each line
    df['smiles'] = df['line'].str.split('\t', n=1).str[1]
    smiles_list = df['smiles'].tolist()
    return smiles_list
# Example usage
file_path = 'test_smiles_prediction.txt'  # Path to your text file containing SMILES notations
smiles_list = read_smiles_from_file(file_path)

df_smiles=pd.Series(smiles_list, name = 'canonical_smiles')
df_smiles


Unnamed: 0,canonical_smiles
0,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,CC(=O)OC(CC(=O)O)C[N+](C)(C)C
2,C1=CC(C(C(=C1)C(=O)O)O)O
3,CC(CN)O
4,C(C(=O)COP(=O)(O)O)N
5,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
6,CCN1C=NC2=C(N=CN=C21)N
7,CCC(C)(C(C(=O)O)O)O
8,C1(C(C(C(C(C1O)O)OP(=O)(O)O)O)O)O
9,C(CCl)Cl


In [6]:


smiles2 = []

for i in df_smiles.tolist():
  cpd = str(i).split('.')
  cpd_longest = max(cpd, key = len)
  smiles2.append(cpd_longest)

df_clean_smiles = pd.Series(smiles2, name = 'canonical_smiles')

df_clean_smiles



Unnamed: 0,canonical_smiles
0,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,CC(=O)OC(CC(=O)O)C[N+](C)(C)C
2,C1=CC(C(C(=C1)C(=O)O)O)O
3,CC(CN)O
4,C(C(=O)COP(=O)(O)O)N
5,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
6,CCN1C=NC2=C(N=CN=C21)N
7,CCC(C)(C(C(=O)O)O)O
8,C1(C(C(C(C(C1O)O)OP(=O)(O)O)O)O)O
9,C(CCl)Cl


In [7]:
def morgan_fpts(data):
    Morgan_fpts = []
    for i in data:
        mol = Chem.MolFromSmiles(i)
        fpts =  AllChem.GetMorganFingerprintAsBitVect(mol,2,2048)
        mfpts = np.array(fpts)
        Morgan_fpts.append(mfpts)
    return np.array(Morgan_fpts)


In [8]:

# Generate Morgan fingerprints
morgan_fingerprints_f = morgan_fpts(df_clean_smiles)
morgan_fingerprints_f



array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
#Morgan_fpts = morgan_fpts(data['canonical_smiles'])

Morgan_fingerprints = pd.DataFrame(morgan_fingerprints_f)
Morgan_fingerprints

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
import joblib
pca=joblib.load('pca_ann_75c.pkl')
data = pca.transform(Morgan_fingerprints)
data.shape



(20, 75)

In [17]:
import torch
import torch.nn as nn

In [23]:

class Net(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)  # Added fourth layer
        #self.fc5 = nn.Linear(hidden_size, hidden_size)  #Added fifth layer
        self.fc_out = nn.Linear(hidden_size, out_size)
        self.ln1 = nn.LayerNorm(hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.ln3 = nn.LayerNorm(hidden_size)
        self.ln4 = nn.LayerNorm(hidden_size)
        #self.ln5 = nn.LayerNorm(hidden_size)  # Added layer norm for fifth layer
        self.activation = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        out = self.fc1(x)
        out = self.ln1(out)
        out = self.activation(out)
        out = self.dropout(out)

        out = self.fc2(out)
        out = self.ln2(out)
        out = self.activation(out)
        out = self.dropout(out)

        out = self.fc3(out)
        out = self.ln3(out)
        out = self.activation(out)
        out = self.dropout(out)

        out = self.fc4(out)
        out = self.ln4(out)  # Apply layer normalization
        out = self.activation(out)
        out = self.dropout(out)

        #out = self.fc5(out)
        #out = self.ln5(out)  # Apply layer normalization
        #out = self.activation(out)
        #out = self.dropout(out)

        out = self.fc_out(out)
        return out
# Define hyperparameters
input_size = 75
hidden_size = 1024
dropout_rate = 0.80
output_size = 1
learning_rate = 0.01
# Initialize the model
model = Net(input_size, hidden_size, dropout_rate, output_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the saved model parameters
model.load_state_dict(torch.load('model_parameters.pth', map_location=device))
X_new = torch.tensor(data, dtype=torch.float32).to(device)  # Replace `data` with your actual data


# Make predictions
with torch.no_grad():
    predictions = model(X_new)

# Convert predictions to numpy array if needed
predictions = pd.DataFrame(predictions.numpy())
output=pd.concat([df_clean_smiles, predictions], axis = 1)
output.to_csv('output.csv', index = False)
output

Unnamed: 0,canonical_smiles,0
0,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,4.663539
1,CC(=O)OC(CC(=O)O)C[N+](C)(C)C,4.779421
2,C1=CC(C(C(=C1)C(=O)O)O)O,5.429611
3,CC(CN)O,4.583893
4,C(C(=O)COP(=O)(O)O)N,4.89045
5,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl,4.393801
6,CCN1C=NC2=C(N=CN=C21)N,5.045292
7,CCC(C)(C(C(=O)O)O)O,4.409818
8,C1(C(C(C(C(C1O)O)OP(=O)(O)O)O)O)O,5.310087
9,C(CCl)Cl,4.677412
