# Basic concepts of Ring2vec pretraining

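This notebook covers the basic concepts of ring2vec: molecules are converted into ring "sentences", which are then used to pretrain a Word2Vec embedding model.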



In [None]:
! pip install joblib
import os
import re
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
import matplotlib.pyplot as plt
import seaborn as sns
from Ringfeature import *
from helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg
from gensim.models import Word2Vec
import timeit
# Load the CSV file containing the SMILES strings.
df = pd.read_csv('data/CEPDB.csv')

# Convert each SMILES string to an RDKit molecule object.
# Chem.MolFromSmiles returns None (it does not raise) when a SMILES cannot be parsed.
df['Mol'] = df['smiles'].apply(Chem.MolFromSmiles)

# Report unparsable entries so silent data loss is visible to the reader.
n_invalid = int(df['Mol'].isna().sum())
if n_invalid:
    print(f'Warning: {n_invalid} SMILES could not be parsed and will be skipped')

# Generate a sentence representation for each molecule's rings.
# Unparsable molecules get None, which the filter below removes.
# Other sentence builders named in the original note (presumably exported by
# Ringfeature): singlering2sentence, combring2sentence, ring2alt_sentence.
df['Sentence'] = df['Mol'].apply(lambda mol: ring2sentence(mol) if mol is not None else None)
print(df['Sentence'].head())

# Keep only non-empty sentences as Word2Vec input.
sentences = [sentence for sentence in df['Sentence'] if sentence]
print("Sentences prepared for Word2Vec:")
print(sentences[:5])  # Print the first 5 sentences

# Set the parameters of the model.
vector_size = 100  # Vector dimension.
window = 10  # Window size
min_count = 3  # Minimum occurrence count
n_jobs = 4  # Number of threads used

# Training method: 'cbow' (sg=0); any other value (e.g. 'skip-gram') selects sg=1.
method = 'cbow'
sg = 0 if method == 'cbow' else 1

# Make sure the output directory exists *before* the long training run, so the
# run cannot fail at the very last step because of a missing folder.
model_path = "models/pretmodelcbow3_100dim.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Record the start time of model training.
start = timeit.default_timer()

# Train the model using Word2Vec.
model = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, workers=n_jobs, sg=sg)

# Record and print the time taken for model training.
stop = timeit.default_timer()

print('Runtime: ', round((stop - start)/60, 2), ' minutes')

# View information about the trained model.
print(model)

# Save the trained model to disk.
model.save(model_path)


In [None]:
# Test: generate single-ring sentences for the NFA dataset
! pip install joblib
import re
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
import matplotlib.pyplot as plt
import seaborn as sns
from Ringfeature import *
from helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg
# Load the CSV file containing the SMILES strings.
df = pd.read_csv('data/NFAs_cleaned.csv')
print('DataFrame loaded, number of rows:', len(df))

# Convert each SMILES string to an RDKit molecule object.
# Chem.MolFromSmiles returns None (it does not raise) when a SMILES cannot be parsed.
df['Mol'] = df['SMILES'].apply(Chem.MolFromSmiles)

# Report unparsable entries so silent data loss is visible to the reader.
n_invalid = int(df['Mol'].isna().sum())
if n_invalid:
    print(f'Warning: {n_invalid} SMILES could not be parsed and will be skipped')

# Build one sentence per molecule with singlering2sentence.
# Unparsable molecules get None, which the filter below removes.
df['Sentence'] = df['Mol'].apply(lambda mol: singlering2sentence(mol) if mol is not None else None)
print("Checking the format of generated sentences:")
print(df['Sentence'].head())

# Prepare sentences for Word2Vec input: keep only the non-empty ones.
sentences = [sentence for sentence in df['Sentence'] if sentence]

print("Sentences prepared for Word2Vec:")
print(sentences[:5])  # Print the first 5 sentences.


In [None]:
from gensim.models import Word2Vec

# Load a previously trained Word2Vec model from disk.
# NOTE(review): this file name differs from the one written by the training
# cell ('models/pretmodelcbow3_100dim.pkl') - confirm this is the intended model.
model = Word2Vec.load('models/pretmodelskip4_100dim.pkl')

# Collect every token known to the model (the keys of the key -> index mapping).
vocab = list(model.wv.key_to_index)

# Show the vocabulary size, followed by the tokens themselves.
print(len(vocab))
print(vocab)

# Print the embedding vector stored for each token in the vocabulary.
for word in vocab:
    print(f"Word: {word} \tVector: {model.wv[word]}")


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

def get_ring_fingerprints(mol):
    """Compute one Morgan fingerprint per ring of a molecule.

    Rings are taken from the symmetrized SSSR of ``mol``. For every ring a
    2048-bit Morgan fingerprint (radius 2, feature invariants, chirality
    aware) is generated from the atom environments centered on that ring's
    atoms only.

    Args:
        mol: RDKit molecule, e.g. the result of ``Chem.MolFromSmiles``.

    Returns:
        A list with one entry per ring, in the order returned by
        ``Chem.GetSymmSSSR``. Each entry is the ring's bit vector converted to
        a plain Python list. The list is empty when the molecule has no rings.
    """
    ring_fps = []

    for ring in Chem.GetSymmSSSR(mol):
        ring_atoms = list(ring)  # atom indices belonging to this ring
        # `fromAtoms` restricts the fingerprint to environments centered on the
        # ring atoms. (`atomsToUse` is not a keyword of this function and makes
        # the call fail with an argument error.)
        ring_fp = AllChem.GetMorganFingerprintAsBitVect(
            mol,
            2,
            nBits=2048,
            useFeatures=True,
            useChirality=True,
            fromAtoms=ring_atoms,
        )
        ring_fps.append(list(ring_fp))  # Convert RDKit's ExplicitBitVect to a list.

    return ring_fps

# Example usage: parse one SMILES string into an RDKit molecule and compute
# a fingerprint for each of its rings with get_ring_fingerprints.
mol = Chem.MolFromSmiles('CCCCCCCCCCCc1c(/C=C2/C(=O)c3cc(F)c(F)cc3C2=C(C#N)C#N)sc2c1sc1c3c4nsnc4c4c5sc6c(CCCCCCCCCCC)c(/C=C7/C(=O)c8cc(F)c(F)cc8C7=C(C#N)C#N)sc6c5n(CC(CC)CCCC)c4c3n(CC(CC)CCCC)c21')
ring_fps = get_ring_fingerprints(mol)

# Output the fingerprint of the first ring only (index 0 of the returned list).
print(ring_fps[0])
