# Finding Molecular Combinations

In [45]:
from itertools import combinations_with_replacement, product
import numpy as np
import pandas as pd
import os
import time

### Import dataset

In [46]:
mypath = os.path.dirname(os.path.abspath('num_molecules.db'))
molecules = {}
with open(mypath+'/data/num_molecules.db', 'r') as f:
    for line in f:
        keys = line.split(':')[0]
        keys = tuple(map(int, keys.split(',')))
        values = line.split(':')[1].strip('\n')
        values = list(values.split(','))
        molecules[keys] = values

In [47]:
# Need to add CO2, CO, O2, H2, H2O
#molecules[(1, 2, 0)] = molecules[(1, 2, 0)].append('C(=O)=O')
molecules[(1, 2, 0)] = ['C(=O)=O'] # CO2
molecules[(0, 1, 2)] = ['O'] # H2O
molecules[(1, 1, 0)] = ['[C-]#[O+]'] # CO
molecules[(0, 0, 2)] = ['[HH]'] # H2
molecules[(0, 2, 0)] = ['O=O'] # O2
molecules

{(0, 0, 0): [''],
 (0, 1, 0): [''],
 (0, 2, 0): ['O=O'],
 (1, 0, 1): [''],
 (1, 0, 2): [''],
 (1, 0, 3): [''],
 (1, 0, 4): ['C'],
 (1, 1, 1): [''],
 (1, 1, 2): ['C=O'],
 (1, 1, 3): [''],
 (1, 1, 4): ['CO'],
 (1, 2, 1): [''],
 (1, 2, 2): [''],
 (1, 2, 3): [''],
 (1, 2, 4): [''],
 (2, 0, 2): ['C#C'],
 (2, 0, 3): [''],
 (2, 0, 4): [''],
 (2, 0, 5): [''],
 (2, 0, 6): ['CC'],
 (2, 0, 7): [''],
 (2, 0, 8): [''],
 (2, 1, 2): [''],
 (2, 1, 3): [''],
 (2, 1, 4): ['CC=O', 'C1CO1'],
 (2, 1, 5): [''],
 (2, 1, 6): ['CCO', 'COC'],
 (2, 1, 7): [''],
 (2, 1, 8): [''],
 (2, 2, 2): ['O=CC=O'],
 (2, 2, 3): [''],
 (2, 2, 4): ['COC=O', 'O=CCO'],
 (2, 2, 5): [''],
 (2, 2, 6): ['OCCO'],
 (2, 2, 7): [''],
 (2, 2, 8): [''],
 (3, 0, 3): [''],
 (3, 0, 4): ['C#CC'],
 (3, 0, 5): [''],
 (3, 0, 6): ['C1CC1'],
 (3, 0, 7): [''],
 (3, 0, 8): ['CCC'],
 (3, 0, 9): [''],
 (3, 0, 10): [''],
 (3, 0, 11): [''],
 (3, 1, 3): [''],
 (3, 1, 4): ['C#CCO'],
 (3, 1, 5): [''],
 (3, 1, 6): ['OC1CC1', 'C1COC1', 'CC1CO1', 'CC(C)=O', 'C

### Function

In [48]:
# Given a molecule, find all sets of molecules that could make up the molecule
# Pre: Input allSet, a dictionary containing all molecules to make combination, and molecule, a target to
# make combination of.
# Post: Return a 2d array of all the combinatory sets
def findCombinations(allSet, molecule):
    
    # Finds the tuple of the given molecule
    # Post: Return the molecule's tuple, return 'Not found' if tuple is not found in given set
    def findTuple():
        for key in allSet:
            arr = allSet.get(key)
            for curr in arr:
                if curr == molecule:
                    return key
        return 'Not found'
    
    # Find all combinations that makes up a given tuple
    # Pre: Input a target tuple, and a list of all possible tuples
    # Post: Return a 2D list of all tuple combinations that sums to the target's tuple
    def findTupleCombinations(target, possibleMolecules):
        
        # Checks if the mass is conserved by summing each atom count
        # Pre: Given a tuple combination and a target tuple
        # Post: Return true if mass is conserved, return false otherwise
        def checkMassConserve(comb, target):
            # make a sum of the comb in tuple
            check = [0, 0, 0]
            for curr in comb:
                for i in range(len(curr)):
                    check[i] += curr[i]
            return check == list(target)
        
        result = []
        targetSum = sum(target)
        # find all combinations with the given pieces for each size of combination (min = 1, max = sum(tuple values))
        for size in range(2, targetSum):
            combSet = list(combinations_with_replacement(possibleMolecules, size))
            for comb in combSet:
                comb = list(comb)
                # Check if mass is conserved
                if checkMassConserve(comb, target):
                    result.append(comb)
        return result
    
    # Convert all tuples into smile strings
    # Pre: Input a 2d list of tuple combinations
    # Post: Output a 2d list of smile combinations
    def tupleCombToSmileComb(combSet):
        result = []
        for tupleComb in combSet:
            smileComb = []
            for i in range(len(tupleComb)):
                smileArr = allSet[tupleComb[i]]
                smileComb.append(smileArr)
            smileComb = product(*smileComb)
            comb = []
            for curr in smileComb:
                result.append(list(curr))
            result.append(comb)
        return result
    
    # Search molecule's tuple
    atomCount = findTuple()
    if atomCount == 'Not found':
        raise Exception('Molecule not found in given database')
    possibleTuples = list(allSet.keys())
    
    # Remove tuples with empty value
    remove = []
    for curr in possibleTuples:
        if allSet[curr] == ['']:
            remove.append(curr)
            
    # Put all tuple that have C, O or H exceeding the given tuple in remove list
    for key in possibleTuples:
        for i in range(len(key)):
            if key[i] > atomCount[i]:
                remove.append(key)
                
    # Put isomers remove list, no need to make combinations with them. Add them back in later in result
    remove.append(atomCount)
    
    # Actual removing step
    for curr in remove:
        if curr in possibleTuples:
            possibleTuples.remove(curr)
            
    # CHECKING POSSIBLE TUPLES: FOR TESTING
    print('Possible Tuples:')
    print(possibleTuples)
    print()
    
    
    
    # Find all combination in tuple
    combSet = findTupleCombinations(atomCount, possibleTuples)
    
    # Convert tuple to smile strings
    smileCombSet = tupleCombToSmileComb(combSet)
    smileCombSet = list(filter(lambda a: a != [], smileCombSet)) # Filter out werid empty list made in product()
    
    # Add isomers into smileCombSet
    for smile in allSet[atomCount]:
        smileCombSet.append([smile])
    
    return smileCombSet
    

### Example calls

In [49]:
start_time = time.clock()

possibleTuples = findCombinations(molecules, 'O=CCO')
print(possibleTuples) # (2, 2, 4)

print()
print('Number of combinations:', len(possibleTuples))

print()
print('Took', time.clock() - start_time, "seconds")

Possible Tuples:
[(0, 2, 0), (1, 0, 4), (1, 1, 2), (1, 1, 4), (2, 0, 2), (2, 1, 4), (2, 2, 2), (1, 2, 0), (0, 1, 2), (1, 1, 0), (0, 0, 2)]

[['C', 'C(=O)=O'], ['C=O', 'C=O'], ['CO', '[C-]#[O+]'], ['O=CC=O', '[HH]'], ['O=O', 'C#C', '[HH]'], ['C=O', '[C-]#[O+]', '[HH]'], ['[C-]#[O+]', '[C-]#[O+]', '[HH]', '[HH]'], ['COC=O'], ['O=CCO']]

Number of combinations: 9

Took 0.1186879999999988 seconds


In [50]:
start_time = time.clock()

possibleTuples = findCombinations(molecules, 'CCC')
print(possibleTuples) # (3, 0, 8)

print()
print('Number of combinations:', len(possibleTuples))

print()
print(time.clock() - start_time, "seconds")

Possible Tuples:
[(1, 0, 4), (2, 0, 2), (2, 0, 6), (3, 0, 4), (3, 0, 6), (0, 0, 2)]

[['C1CC1', '[HH]'], ['C', 'C#C', '[HH]'], ['C#CC', '[HH]', '[HH]'], ['CCC']]

Number of combinations: 4

0.048496000000000095 seconds


In [51]:
start_time = time.clock()

possibleTuples = findCombinations(molecules, 'C#CCC')
print(possibleTuples) #  (4, 0, 6)

print()
print('Number of combinations:', len(possibleTuples))

print()
print(time.clock() - start_time, "seconds")

Possible Tuples:
[(1, 0, 4), (2, 0, 2), (2, 0, 6), (3, 0, 4), (3, 0, 6), (0, 0, 2)]

[['C#C', 'C#C', '[HH]'], ['C#CCC'], ['CC#CC']]

Number of combinations: 3

0.024675000000002 seconds


In [52]:
start_time = time.clock()

possibleTuples = findCombinations(molecules, 'CC#CC=O')
print(possibleTuples) #  (4, 1, 4)

print()
print('Number of combinations:', len(possibleTuples))

print()
print(time.clock() - start_time, "seconds")

Possible Tuples:
[(1, 0, 4), (1, 1, 2), (1, 1, 4), (2, 0, 2), (2, 1, 4), (3, 0, 4), (3, 1, 4), (0, 1, 2), (1, 1, 0), (0, 0, 2)]

[['C#CC', '[C-]#[O+]'], ['CC#CC=O'], ['c1ccoc1'], ['C#CCC=O'], ['C#CC(C)=O'], ['C#CC1CO1']]

Number of combinations: 6

0.17190199999999578 seconds


In [27]:
start_time = time.clock()

possibleTuples = findCombinations(molecules, 'C#CCC')
print(possibleTuples) #  (4, 0, 6)

print()
print('Number of combinations:', len(possibleTuples))

print()
print(time.clock() - start_time, "seconds")

Possible Tuples:
[(1, 0, 4), (2, 0, 2), (2, 0, 6), (3, 0, 4), (3, 0, 6), (0, 0, 2)]

[['C#C', 'C#C', '[HH]'], ['C#CCC'], ['CC#CC']]

Number of combinations: 3

0.02632600000000007 seconds


In [28]:
# Takes too long

start_time = time.clock()

possibleTuples = findCombinations(molecules, 'CC1COC1')
print(possibleTuples) #  (4, 1, 8)

print()
print(time.clock() - start_time, "seconds")

Possible Tuples:
[(1, 0, 4), (1, 1, 2), (1, 1, 4), (2, 0, 2), (2, 0, 6), (2, 1, 4), (2, 1, 6), (3, 0, 4), (3, 0, 6), (3, 0, 8), (3, 1, 4), (3, 1, 6), (3, 1, 8), (4, 0, 6), (4, 0, 8), (4, 1, 4), (4, 1, 6), (0, 1, 2), (1, 1, 0), (0, 0, 2)]



KeyboardInterrupt: 

In [None]:
# Takes too long

start_time = time.clock()

combinations = findCombinations(molecules, 'O=CC1(O)CC1')
print(combinations) #  (4, 2, 6)

print()
print('Number of combinations:', len(combinations))

print()
print(time.clock() - start_time, "seconds")

In [18]:
# Takes too long

start_time = time.clock()

combinations = findCombinations(molecules, '[C-]#[O+]')
print(combinations) #  (4, 2, 6)

print()
print('Number of combinations:', len(combinations))

print()
print(time.clock() - start_time, "seconds")

Possible Tuples:
[]

[['[C-]#[O+]']]

Number of combinations: 1

0.0007280000000000619 seconds
