In [2]:
from tqdm import tqdm
from collections import Counter, defaultdict
from rdkit import Chem
from MACCSkeys import smartsPatts

##### Counting the number of reactions in which each MACCS key occurs

In [2]:
# Count, for every MACCS key that is not skipped, the number of training
# reactions in which the key's SMARTS pattern matches the left-hand side or the
# right-hand side, then pickle the resulting Counter.
# The cell is disabled with `if False:` so it does not run on "Run All";
# change the guard to regenerate `rxncounts_per_key.pickle`.
if False:
    import pickle

    train_path = "../data/raw/train.txt"

    # Keys excluded from the analysis.
    # NOTE(review): presumably these keys have no usable SMARTS in
    # `smartsPatts` -- confirm against MACCSkeys.
    maccs_keys_to_skip = {1, 44, 125, 166}

    # Compile every SMARTS once, up front. The compiled pattern does not depend
    # on the reaction, so rebuilding it inside the per-reaction loop only
    # repeats identical work for every line of the dataset.
    maccs_patterns = {
        maccs_key: Chem.MolFromSmarts(smartsPatts[maccs_key][0])
        for maccs_key in smartsPatts
        if maccs_key not in maccs_keys_to_skip
    }

    # First pass: count the lines so tqdm can display a total. The `with` block
    # closes the handle instead of leaving it open until garbage collection.
    with open(train_path, "r") as train_dataset:
        num_rxns = sum(1 for _ in train_dataset)

    rxncounts_per_key = Counter()

    with open(train_path, "r") as train_dataset:
        for reaction in tqdm(train_dataset, total=num_rxns):

            # Each line is split into its two sides around ">>".
            lhs, rhs = reaction.split(">>")
            lhs_mol = Chem.MolFromSmiles(lhs)
            rhs_mol = Chem.MolFromSmiles(rhs)

            for maccs_key, maccs_pattern in maccs_patterns.items():
                # A reaction adds at most 1 to a key, no matter how many times
                # or on which side the pattern matches.
                if (lhs_mol.HasSubstructMatch(maccs_pattern)
                        or rhs_mol.HasSubstructMatch(maccs_pattern)):
                    rxncounts_per_key[maccs_key] += 1

    # Store data (serialize)
    with open('rxncounts_per_key.pickle', 'wb') as handle:
        pickle.dump(rxncounts_per_key, handle, protocol=pickle.HIGHEST_PROTOCOL)


##### Checking that both sides of every reaction match at least one MACCS key

In [3]:
# Count the training reactions in which BOTH sides match at least one MACCS key
# that is not skipped, pickle the counts, and print how many reactions fail the
# check.
# The cell is disabled with `if False:` so it does not run on "Run All";
# change the guard to regenerate `valid_rxn_stats.pickle`.
if False:
    import pickle

    train_path = "../data/raw/train.txt"

    # Keys excluded from the analysis.
    # NOTE(review): presumably these keys have no usable SMARTS in
    # `smartsPatts` -- confirm against MACCSkeys.
    maccs_keys_to_skip = {1, 44, 125, 166}

    # Compile every SMARTS once, up front; the patterns are identical for every
    # reaction, so compiling them inside the loop is pure repeated work.
    maccs_patterns = [
        Chem.MolFromSmarts(smartsPatts[maccs_key][0])
        for maccs_key in smartsPatts
        if maccs_key not in maccs_keys_to_skip
    ]

    # First pass: count the lines so tqdm can display a total. The `with` block
    # closes the handle instead of leaving it open until garbage collection.
    with open(train_path, "r") as train_dataset:
        num_rxns = sum(1 for _ in train_dataset)

    n_valid_reactions = 0

    with open(train_path, "r") as train_dataset:
        for reaction in tqdm(train_dataset, total=num_rxns):

            # Each line is split into its two sides around ">>".
            lhs, rhs = reaction.split(">>")
            lhs_mol = Chem.MolFromSmiles(lhs)
            rhs_mol = Chem.MolFromSmiles(rhs)

            # `any` stops at the first matching pattern, and the right-hand
            # side is only examined when the left-hand side already matched.
            lhs_found = any(
                lhs_mol.HasSubstructMatch(maccs_pattern)
                for maccs_pattern in maccs_patterns
            )
            rhs_found = lhs_found and any(
                rhs_mol.HasSubstructMatch(maccs_pattern)
                for maccs_pattern in maccs_patterns
            )

            if lhs_found and rhs_found:
                n_valid_reactions += 1

    valid_rxn_stats = {
        'n_valid_reactions': n_valid_reactions,
        'total_reactions': num_rxns,
    }

    # Store data (serialize)
    with open('valid_rxn_stats.pickle', 'wb') as handle:
        pickle.dump(valid_rxn_stats, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Number of reactions where at least one side matched no key.
    print(num_rxns - n_valid_reactions) # 0


100%|██████████| 409035/409035 [08:49<00:00, 772.95it/s]


##### Checking if every reaction is a MACCS transformation: every atom whose neighbours change must be matched by a MACCS key (only LHS is matched)

In [4]:
# Count the training reactions for which every left-hand-side atom whose set of
# neighbours differs between the two sides is covered by at least one MACCS
# substructure match on the left-hand side. The counts are pickled and the
# number of reactions satisfying the assumption is printed.
# The cell is disabled with `if False:` so it does not run on "Run All";
# change the guard to regenerate `rxn_assumptions_stats.pickle`.
if False:
    import pickle

    train_path = "../data/raw/train.txt"

    # Keys excluded from the analysis.
    # NOTE(review): presumably these keys have no usable SMARTS in
    # `smartsPatts` -- confirm against MACCSkeys.
    maccs_keys_to_skip = {1, 44, 125, 166}

    # Compile every SMARTS once, up front; the patterns are identical for every
    # reaction, so compiling them inside the loop is pure repeated work.
    maccs_patterns = [
        Chem.MolFromSmarts(smartsPatts[maccs_key][0])
        for maccs_key in smartsPatts
        if maccs_key not in maccs_keys_to_skip
    ]

    # First pass: count the lines so tqdm can display a total. The `with` block
    # closes the handle instead of leaving it open until garbage collection.
    with open(train_path, "r") as train_dataset:
        num_rxns = sum(1 for _ in train_dataset)

    n_assumption_true_rxns = 0

    with open(train_path, "r") as train_dataset:
        for reaction in tqdm(train_dataset, total=num_rxns):

            # Each line is split into its two sides around ">>".
            lhs, rhs = reaction.split(">>")
            lhs_mol = Chem.MolFromSmiles(lhs)
            rhs_mol = Chem.MolFromSmiles(rhs)

            # Atom-map numbers of every LHS atom that takes part in at least
            # one MACCS substructure match.
            all_maccs_atom_maps = set()
            for maccs_pattern in maccs_patterns:
                for match_tuple in lhs_mol.GetSubstructMatches(maccs_pattern):
                    for atom_id in match_tuple:
                        atom = lhs_mol.GetAtomWithIdx(atom_id)
                        all_maccs_atom_maps.add(atom.GetAtomMapNum())

            # For each side: atom-map number -> atom-map numbers of its
            # neighbours.
            # NOTE(review): atoms without a map number presumably all report 0
            # from GetAtomMapNum and would then share a single entry here --
            # confirm that this is intended.
            lhs_amap_to_nbr_amaps = defaultdict(set)
            rhs_amap_to_nbr_amaps = defaultdict(set)
            for mol, amap_to_nbr_amaps in (
                (lhs_mol, lhs_amap_to_nbr_amaps),
                (rhs_mol, rhs_amap_to_nbr_amaps),
            ):
                for atom in mol.GetAtoms():
                    atom_map = atom.GetAtomMapNum()
                    for nbr_atom in atom.GetNeighbors():
                        amap_to_nbr_amaps[atom_map].add(nbr_atom.GetAtomMapNum())

            # The reaction satisfies the assumption when every LHS atom whose
            # neighbour set changed is among the MACCS-matched atoms. `.get`
            # is used so the lookup does not insert keys into the defaultdict.
            rxn_valid = all(
                atom_map in all_maccs_atom_maps
                for atom_map, lhs_nbr_amaps in lhs_amap_to_nbr_amaps.items()
                if lhs_nbr_amaps != rhs_amap_to_nbr_amaps.get(atom_map, set())
            )

            if rxn_valid:
                n_assumption_true_rxns += 1

    rxn_assumptions_stats = {
        "n_assumption_true_rxns": n_assumption_true_rxns,
        "num_rxns": num_rxns,
    }

    # Store data (serialize)
    with open('rxn_assumptions_stats.pickle', 'wb') as handle:
        pickle.dump(rxn_assumptions_stats, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # A bare expression nested inside the `if` body is not displayed by
    # Jupyter, so the result is printed explicitly.
    print(n_assumption_true_rxns) # 402536


100%|██████████| 409035/409035 [41:57<00:00, 162.50it/s] 
