In [2]:
import os
import json
#import deepsmiles

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit import RDLogger
from rdkit.rdBase import DisableLog

# Silence RDKit logging by disabling every level listed in RDLogger._levels.
# NOTE(review): _levels is a private attribute of RDLogger — confirm it is
# still present in the installed RDKit version.
for level in RDLogger._levels:
    DisableLog(level)
    
# Switch off the SVG flag of the notebook console integration
# (presumably so inline molecule renders are raster images — TODO confirm).
IPythonConsole.ipython_useSVG=False

from typing import NoReturn

In [3]:
def create_dir_for_images(folder_name: str) -> None:
    """Make sure the directory ``folder_name`` exists.

    Missing parent directories are created as well, and a directory that
    already exists is left untouched.

    Args:
        folder_name: Path of the directory that will hold the images.

    Raises:
        FileExistsError: If ``folder_name`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists() + os.mkdir() and also handles nested paths.
    os.makedirs(folder_name, exist_ok=True)

def create_filename(num: int) -> str:
    """Return the PNG file name for index ``num``; numbering is shifted by one."""
    image_number = num + 1
    return f'{image_number}.png'

# Whitelist of characters a SMILES string may contain to be accepted:
# upper-case letters plus 'l' and 'r' (presumably for Cl / Br), the ten
# digits, and the punctuation # % ( ) = . / \ @
OK_CHARS = set('COSNBPHFIlr' '0123456789' '#%()=./\\@')


def check_chars(mol_str: str) -> bool:
    """Return True when every character of ``mol_str`` is in ``OK_CHARS``.

    An empty string is accepted.
    """
    return all(symbol in OK_CHARS for symbol in mol_str)

def mol_is_okay(mol, mol_str: str, max_len: int = 40) -> bool:
    """Decide whether a parsed molecule and its SMILES text should be kept.

    Args:
        mol: Parsed molecule object; any falsy value (e.g. ``None``) is
            rejected.
        mol_str: The SMILES string that ``mol`` was built from.
        max_len: Longest accepted ``mol_str`` (inclusive). Defaults to 40.

    Returns:
        True if ``mol`` is truthy, ``mol_str`` has at most ``max_len``
        characters, and all of them pass ``check_chars``.
    """
    if not mol:
        return False
    # Cheap length test first; the character scan only runs on short strings.
    if len(mol_str) > max_len:
        return False
    return check_chars(mol_str)

In [6]:
# Render up to `all_cnt` accepted molecules from INPUT_FILENAME as PNG files
# in FOLDER_NAME and record which image number belongs to which SMILES in
# OUTPUT_FILENAME.
INPUT_FILENAME = 'raw_smiles.json'
FOLDER_NAME = '../images'
OUTPUT_FILENAME = 'num_of_images.json'
IMAGE_SIZE = (512, 512)  # (width, height) passed to the Cairo canvas

# read json from file
with open(INPUT_FILENAME, 'r') as json_file:
    data = json.load(json_file)
print('FINISH JSON DOWNLOADING')

# create folder for images
create_dir_for_images(FOLDER_NAME)

# create converter from smiles to deepsmiles
#converter = deepsmiles.Converter(rings=True, branches=True)

all_cnt = 1000  # maximum number of images to render
cnt = 0         # number of images rendered so far

# draw images and save information about number and SMILES in json
json_output = []
for num, mol_datum in enumerate(data):
    # Stop before parsing anything once the quota is reached.
    if cnt >= all_cnt:
        break
    smiles = mol_datum['smiles']
    mol = Chem.MolFromSmiles(smiles)
    if not mol_is_okay(mol, smiles):
        continue

    canvas = Draw.rdMolDraw2D.MolDraw2DCairo(*IMAGE_SIZE)
    # Single palette entry under key -1 with colour (0, 0, 0) — presumably the
    # default entry, so every atom is drawn black; confirm against RDKit docs.
    canvas.drawOptions().setAtomPalette({-1: (0, 0, 0)})
    Draw.rdMolDraw2D.PrepareAndDrawMolecule(canvas, mol)
    # Finalise the drawing before serialising it, as the RDKit drawing
    # examples do.
    canvas.FinishDrawing()
    canvas.WriteDrawingText(os.path.join(FOLDER_NAME, create_filename(num)))

    mol_json = {
        #'deepsmiles': converter.encode(mol_datum['smiles']),
        'smiles': smiles,
        # Same number that create_filename(num) puts in the file name.
        'image_num': num + 1,
        #'fingerprint': Chem.AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024).ToList()
    }
    json_output.append(mol_json)
    cnt += 1
print(cnt)
print('FINISH MAKING JSON')

# save json to file
with open(OUTPUT_FILENAME, 'w') as outfile:
    json.dump(json_output, outfile, indent='\t')
print('FINISH SAVING JSON')

FINISH JSON DOWNLOADING
1000
FINISH MAKING JSON
FINISH SAVING JSON


In [18]:
# Rebuild 'raw_smiles.json' as a list of {'smiles': ...} records taken from
# the file named by INPUT_FILENAME.
with open(INPUT_FILENAME, 'r') as json_file:
    # Insert a comma between adjacent objects ('}\n\t{' -> '},\n\t{');
    # presumably the raw export omits them and is not valid JSON otherwise.
    text = json_file.read().replace('}\n\t{', '},\n\t{')
# The repaired text is also saved to tmp.json; parsing uses the in-memory
# string, so the file does not have to be read back.
with open('tmp.json', 'w') as tmp_file:
    tmp_file.write(text)
data = json.loads(text)
print('FINISH JSON DOWNLOADING')
# When INPUT_FILENAME is 'raw_smiles.json' this cell overwrites its own input.
# Falling back to an existing 'smiles' key keeps a re-run from failing with
# KeyError on records that were already converted.
json_tmp = [
    {'smiles': it['isosmiles'] if 'isosmiles' in it else it['smiles']}
    for it in data
]
with open('raw_smiles.json', 'w') as outfile:
    json.dump(json_tmp, outfile, indent='\t')

FINISH JSON DOWNLOADING
