This notebook illustrates how Ring2vec can be used to featurize molecules.
* [Featurize the molecules using pre-trained Ring2vec model](#Featurize-the-molecules-using-pre-trained-ring2vec-model)


In [None]:
import numpy as np
import pandas as pd

def clean_smiles_and_save(input_csv, output_csv):
    """Drop rows whose SMILES cannot be parsed by RDKit and save the rest as CSV.

    Parameters
    ----------
    input_csv : str or path-like
        CSV file to read; it must contain a ``SMILES`` column.
    output_csv : str or path-like
        Destination CSV file. It receives the original columns of the rows
        that were kept, written without the index.

    Raises
    ------
    KeyError
        If the input file has no ``SMILES`` column.
    """
    # Imported here so the function does not rely on a later notebook cell
    # having already put ``Chem`` into the kernel namespace.
    from rdkit import Chem

    df = pd.read_csv(input_csv)

    def _is_parsable(smiles):
        # Empty CSV fields are read back as NaN (a float), which
        # Chem.MolFromSmiles does not accept; count them as invalid.
        return isinstance(smiles, str) and Chem.MolFromSmiles(smiles) is not None

    # Filter with a boolean mask instead of storing molecule objects in a
    # temporary column that would have to be dropped again before saving.
    keep = df['SMILES'].map(_is_parsable).astype(bool)
    df[keep].to_csv(output_csv, index=False)

    print(f"The cleaned data has been saved to {output_csv}")



In [None]:
# Remove duplicate data


In [9]:
# % matplotlib inline
import numpy as np
import pandas as pd
import sys, os
# Add the ring2vec directory to the system path.
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd()))+'/ring2vec')
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from Ringfeature import *
from gensim.models import word2vec

# Load the dataset and strip leading/trailing whitespace from the column headers.
df = pd.read_csv('data/NFAs_cleaned.csv')
df.columns = df.columns.str.strip()

# Load the Word2Vec model from disk.
model = word2vec.Word2Vec.load('models/pretmodelskip4_100dim.pkl')

# Parse the SMILES strings and keep only the rows RDKit could parse.
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the frame that was read from disk.
df['Mol'] = df['SMILES'].apply(Chem.MolFromSmiles)
df = df.dropna(subset=['Mol']).copy()

df['Sentence'] = df['Mol'].apply(ring2alt_sentence)

# Drop rows with an empty sentence from the frame itself. Filtering only the
# list of sentences (as before) left fewer vectors than rows, so the
# assignment to df['ring2vec'] failed with a length mismatch.
df = df[df['Sentence'].map(bool).astype(bool)].copy()
sentences = df['Sentence'].tolist()

# One vector per remaining row, in row order.
df['ring2vec'] = [DfVec(x) for x in sentences2vec(sentences, model)]

# Serialize each vector as a comma-separated string so it fits in one CSV cell.
df['Features'] = df['ring2vec'].apply(lambda x: ','.join(map(str, x.vec)))

df_to_save = df[['SMILES', 'Features', 'HOMO', 'LUMO', 'bandgap']]
df_to_save.to_csv('data/NFAs_prering2alt.csv', index=False)


In [None]:
# test
# % matplotlib inline
import numpy as np
import pandas as pd
import sys, os
# Add the ring2vec directory to the system path.
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd()))+'/ring2vec')
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from Ringfeature import ring2alt_sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec

df = pd.read_csv('data/NFAs_cleaned.csv')

# Load the Word2Vec model from disk.
model = word2vec.Word2Vec.load('models/modelskip4_100dim.pkl')

# Adds an 'ROMol' column of RDKit molecules to df in place.
PandasTools.AddMoleculeColumnToFrame(df, 'SMILES', 'ROMol')

# Parse the SMILES strings and keep only the rows RDKit could parse.
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the frame that was read from disk.
df['Mol'] = df['SMILES'].apply(Chem.MolFromSmiles)
df = df.dropna(subset=['Mol']).copy()

df['Sentence'] = df['Mol'].apply(ring2alt_sentence)

print(df['Sentence'].head())

# Drop rows with an empty sentence from the frame itself. Filtering only the
# list of sentences (as before) left fewer vectors than rows, so the
# assignment to df['ring2vec'] failed with a length mismatch.
df = df[df['Sentence'].map(bool).astype(bool)].copy()

# Kept from the original cell: MolSentence-wrapped sentences built from the
# 'ROMol' column. This column is not part of the export below.
df['sentence'] = df.apply(lambda x: MolSentence(ring2alt_sentence(x['ROMol'])), axis=1)

sentences = df['Sentence'].tolist()

print(sentences[:5])

# One vector per remaining row, in row order.
df['ring2vec'] = [DfVec(x) for x in sentences2vec(sentences, model)]

# Serialize each vector as a comma-separated string so it fits in one CSV cell.
df['vector_string'] = df['ring2vec'].apply(lambda x: ','.join(map(str, x.vec)))
# NOTE(review): these column names differ from the ones another cell selects
# from the same 'data/NFAs_cleaned.csv' ('HOMO', 'LUMO', 'bandgap') - confirm
# which schema the file actually has.
df_to_save = df[['SMILES','vector_string','PCE_max(%)','PCE_ave(%)','Jsc(mA/cm2)','FF','Voc(V)','HOMO(eV)','LUMO(eV)','Eg(eV)']]

df_to_save.to_csv('data/ring2alt_100.csv', index=False)