<a href="https://colab.research.google.com/github/kaanwk/inviandmasters/blob/main/InviandMasters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports/Using Google Colab 


In [None]:
# Install RDKit.
%%capture
!pip install rdkit-pypi
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdChemReactions

from transformers import AutoTokenizer, RobertaModel
import torch

In [None]:
# Install the Hugging Face transformers package and import its tokenizer loader.
# NOTE(review): the previous cell already imports from transformers, so on a fresh
# runtime this install comes too late for that import unless the package is
# installed earlier in the notebook.
!pip install transformers
from transformers import AutoTokenizer

# Number of Training Samples

In [None]:
# Number of training samples. Later cells use it as the number of feature files
# to load, as the upper bound of the random file index, and as the number of
# labels taken from the training table.
amount = 1_000

# Data Cleaning and Reading

In [None]:
# Read the training and test reaction tables into DataFrames.
train = pd.read_csv(filepath_or_buffer='/reaction_train_dataset.csv')
test = pd.read_csv(filepath_or_buffer='/reaction_test_dataset.csv')

In [None]:
# A notebook cell only renders its last expression, so a bare `train.head()`
# followed by `test.head()` would silently drop the first preview.
# display() renders both tables.
display(train.head(), test.head())

Unnamed: 0,reaction_id,reaction_smiles,mapped_reaction_smiles,ec_level_1,ec_level_2,ec_level_3,ec_level_4
0,48567,CCCCC[C@H](O)c1c(O)cc2c(c1O)C(=O)c1c(O)cc(O)cc...,[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][C@@H:12]([...,1,1.14,1.14.14,1.14.14.116
1,68073,C=CC(CC#N)OCc1ccccc1.O>>C=CC(CC(N)=O)OCc1ccccc1,[CH2:1]=[CH:2][CH:12]([CH2:8][C:9]#[N:13])[O:1...,4,4.2,4.2.1,4.2.1.84
2,55632,CSCC[C@H](NC(C)=O)C(=O)Nc1ccc2c(C)cc(=O)oc2c1....,[CH3:1][c:10]1[cH:8][c:16](=[O:21])[o:23][c:15...,3,3.4,3.4.19,3.4.19.1
3,65201,CN[C@@H]1[C@H](O[C@H]2[C@H](O[C@@H]3[C@@H](NC(...,[CH2:44]([C@@H:47]1[C@@H:49]([OH:59])[C@@H:50]...,2,2.7,2.7.1,2.7.1.72
4,27225,O.O=C1CCC(C(=O)O)=CN1>>N.N/C(=C\CC(=O)O)C(=O)O,[CH2:1]1[CH2:2][C:5](=[O:8])[NH:7][CH:3]=[C:4]...,3,3.5,3.5.2,3.5.2.18


# Training with Support Vector Machines


In [None]:
import json
from google.colab import drive

# Mount Google Drive at /content/drive so the feature files stored there can be
# read; force_remount=True remounts even if the drive is already mounted.
drive.mount(
    "/content/drive",
    force_remount=True,
)


Mounted at /content/drive


In [None]:
# Build the training matrix from per-reaction JSON feature files stored on Drive.
import os
import random

# Directory holding the per-reaction JSON files ('<number>.json').
feature_dir = '/content/drive/MyDrive/reaction_chemberta_features.zip (Unzipped Files)/data/'
if not os.path.isdir(feature_dir):
  # Fail fast: without this directory every open() below fails and the retry
  # loop could never succeed.
  raise FileNotFoundError('Feature directory not found: ' + feature_dir)

# Dedicated, seeded generator so the same files are sampled on every run.
file_rng = random.Random(0)
# Upper bound on failed reads per sample, so the cell terminates even when no
# file in the sampled id range is readable.
max_attempts = 100 * amount

x_train = []
for _ in range(amount):

  for _attempt in range(max_attempts):
    num = str(file_rng.randint(1, amount))
    try:
      with open(feature_dir + num + '.json') as json_data:
        data = json.load(json_data)
    except (OSError, ValueError):
      # Missing/unreadable file (OSError) or malformed JSON (ValueError):
      # draw another file number. KeyboardInterrupt is deliberately not
      # caught, so the cell can still be stopped by hand.
      continue
    break
  else:
    raise RuntimeError('No readable feature file found in ' + feature_dir)

  # One feature row = reactant embedding followed by product embedding.
  data_inside = np.concatenate((data['reactant_embedding'],data['product_embedding']),axis=0)

  x_train.append(data_inside)

x_train = np.array(x_train)
print(x_train.shape)

# Y Train
# NOTE(review): the rows of x_train come from randomly chosen files, while
# y_train takes the first `amount` labels of `train` in row order; nothing in
# this cell pairs a feature row with the label of the same reaction.
# TODO: confirm how the JSON file numbers map to rows of `train` and build
# y_train from that mapping.
y_train = train['ec_level_1'][0:amount]

print(y_train.shape)


(1000, 1536)
(1000,)


In [None]:
# Build the classifier: features are standardised, then passed to an SVM.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Standalone default SVC; it is not part of the pipeline fitted below.
svc = SVC()

clf = make_pipeline(
    StandardScaler(),
    SVC(gamma='auto'),
)

# Fit on the training features/labels; the fitted pipeline is the cell output.
clf.fit(x_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

# Exporting to Drive

In [None]:
# Save the training feature matrix to Drive as CSV (one row per sample).
pd.DataFrame(x_train).to_csv(
    path_or_buf="/content/drive/MyDrive/Copy of reaction_chemberta_features.zip (Unzipped Files)/PATH/export1.csv"
)

# Creating Prediction Function and Embedding Function


In [None]:
# Load the pretrained ChemBERTa tokenizer and encoder used to embed SMILES strings.
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = RobertaModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

def get_molecule_embedding(smiles):
    """Embed one SMILES string with the module-level ChemBERTa model.

    Args:
        smiles: SMILES text to tokenize and encode.

    Returns:
        The model's pooled output for the input as a list of floats, each
        rounded to 3 decimal places.
    """
    # Calling the tokenizer directly returns batched tensors (batch size 1).
    encoded = tokenizer(smiles, return_tensors="pt")
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        embedding = model(input_ids=encoded['input_ids'],
                          attention_mask=encoded['attention_mask'])
    pooled_output = [round(i, 3) for i in embedding['pooler_output'][0].tolist()]
    return pooled_output

def get_reaction_embeddings(rxn_smiles):
    """Embed both sides of a reaction SMILES of the form 'reactants>>products'.

    Args:
        rxn_smiles: reaction SMILES containing exactly one '>>' separator.
            Leading and trailing whitespace is ignored.

    Returns:
        A dict with keys 'rxn_smiles' (the stripped input),
        'reactant_embedding' and 'product_embedding'.

    Raises:
        ValueError: if the input does not contain exactly one '>>' separator.
    """
    # Pasted or typed input often carries stray whitespace or a newline.
    rxn_smiles = rxn_smiles.strip()
    parts = rxn_smiles.split('>>')
    if len(parts) != 2:
        raise ValueError(
            "Expected a reaction SMILES with exactly one '>>' separator "
            "(reactants>>products), got: " + repr(rxn_smiles)
        )
    reactant_smiles, product_smiles = parts
    reactant_embedding = get_molecule_embedding(reactant_smiles)
    product_embedding = get_molecule_embedding(product_smiles)
    return {'rxn_smiles': rxn_smiles,
            'reactant_embedding': reactant_embedding,
            'product_embedding': product_embedding}

def predicting(rxn):
    """Predict the class of one reaction SMILES with the fitted `clf` pipeline.

    Args:
        rxn: reaction SMILES, passed to get_reaction_embeddings.

    Returns:
        The result of `clf.predict` for the single reaction.
    """
    embeddings = get_reaction_embeddings(rxn)
    # Feature vector = reactant embedding followed by product embedding.
    features = np.concatenate(
        (embeddings['reactant_embedding'], embeddings['product_embedding']),
        axis=0,
    )
    # Wrap the vector in a one-row 2-D array before handing it to the classifier.
    single_row_batch = np.array([features])
    return clf.predict(single_row_batch)



Some weights of the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Prediction Model! 

Enter the reaction in SMILES format, written as `reactants>>products`.

In [None]:
# Read a reaction SMILES from the user and print the predicted class.
# The text is kept in its own variable: `In` is IPython's input-history list and
# should not be overwritten. strip() drops stray whitespace from pasted input.
rxn_input = input("Enter the chemical formula in SMILES format.").strip()


print("-------------------------")
print()
print('EC Num: ',predicting(rxn_input))
print()
print("-------------------------")

Enter the chemical formula in SMILES format.*C1=C(*)C(=O)C(*)=C(*)C1=O.*[C@@H](N)C(=O)O.O>>*CCC(=O)C(=O)O.*c1c(*)c(O)c(*)c(*)c1O.[NH4+]
-------------------------

EC Num:  [2]

-------------------------


# Samples

In [None]:
# Sample reaction SMILES that can be pasted into the prediction prompt.
# Stored as a raw string because SMILES uses backslashes for bond stereochemistry.
sample_rxn = r"C/C(C)=C/CC/C(C)=C/CC/C(C)=C/COP(=O)(O)OP(=O)(O)O.CC(C)[C@H](NC(=O)[C@H](CS)NC(=O)CNS(=O)(=O)c1cccc2c(N(C)C)cccc12)C(=O)N[C@H](C(=O)N[C@@H](C)C(=O)O)[C@@H](C)O>>C/C(C)=C\CC/C(C)=C/CC/C(C)=C/CSC[C@H](NC(=O)CNS(=O)(=O)c1cccc2c(N(C)C)cccc12)C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](C)C(=O)O)[C@@H](C)O)C(C)C.O=P(O)(O)OP(=O)(O)O"
# Answer: 2
sample_answer = 2




