In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pprint import pprint

from dataclasses import dataclass
import ase
import io
from multiprocessing import Pool

In [3]:
structures = pd.read_feather('../data/structures.feather')
labelled = pd.read_feather('../data/train.feather')
unlabelled = pd.read_feather('../data/test.feather')

In [4]:
methane_name = 'dsgdb9nsd_000001'
methane = structures[structures.molecule_name == methane_name]

ethanol_name = 'dsgdb9nsd_000014'
ethanol = structures[structures.molecule_name == ethanol_name]

In [5]:
from chemistry import Bond, Atom, Molecule

In [6]:
def make_molecule(name, molecule_df):
    atoms = []
    for i, atom in molecule_df.iterrows():
        atoms.append(Atom(atom.atom, (atom.x, atom.y, atom.z)))
        
    return Molecule(name, atoms)

molecule_name = 'dsgdb9nsd_005157'
molecule = make_molecule(molecule_name, structures[structures.molecule_name == molecule_name])
pprint(molecule)
print(compute_path(molecule, 8, 0))

Name: dsgdb9nsd_005157
Atoms:
  N 0: [-0.04380067  0.91265517 -0.92835289]
  C 1: [-0.02025951  0.016512   -0.02907792]
  N 2: [-1.16610885 -0.49266312  0.5532214 ]
  C 3: [-2.44006681  0.01167247  0.31056502]
  C 4: [-3.47235942 -0.43425712 -0.45213547]
  C 5: [-4.53647614  0.50771636 -0.28167379]
  C 6: [-4.06397057  1.45375311  0.57209116]
  O 7: [-2.79333162  1.17002368  0.94911319]
  H 8: [ 0.90126568  1.12271273 -1.2403121 ]
  H 9: [ 0.87793076 -0.46782482  0.37502316]
  H 10: [-1.03735054 -1.09128964  1.35376251]
  H 11: [-3.4642067  -1.31988931 -1.06688321]
  H 12: [-5.51368046  0.4836219  -0.7374481 ]
  H 13: [-4.48651171  2.35242915  0.99058348]
Bonds:
  C(1) - N(0)
  N(0) - H(8)
  C(1) - H(9)
  N(2) - H(10)
  C(3) - C(4)
  C(4) - C(5)
  C(4) - H(11)
  C(5) - C(6)
  C(5) - H(12)
  C(6) - H(13)




NameError: name 'compute_path' is not defined

In [7]:
from chemistry import compute_path
molecule = make_molecule('ethanol', ethanol)
compute_path(molecule, 0, 8)

[0, 1, 2, 8]

In [8]:
def read_molecule(molecule_name):
    global structures
    molecule_df = structures[structures.molecule_name == molecule_name]
    molecule = make_molecule(molecule_name, molecule_df)
    
#     molecule_df = labelled[labelled.molecule_name == molecule.name]
#     for _, row in molecule_df.iterrows():
#         if compute_path(molecule, row.atom_index_0, row.atom_index_1) is None:
#             raise Exception(f'{molecule.name} {row.atom_index_0} {row.atom_index_1} failed')
            
    return molecule

def read_molecules(data):
    molecule_names = data.molecule_name.unique()
    with Pool(8) as pool:
        molecules = pool.imap_unordered(read_molecule, molecule_names, 1000)
        molecule_map = {molecule.name:molecule for molecule in molecules}
     
    return molecule_map
        
molecules = read_molecules(structures)

In [9]:
len(molecules), len(structures.molecule_name.unique())

(130775, 130775)

In [10]:
import pickle

with open('../data/molecules.pickle', 'wb') as f:
    pickle.dump(molecules, f)