This is a copy of my public kaggle kernel: https://www.kaggle.com/joatom/nearest-molecular-neighbors
# kNN features
I'd like to share some of the features of my current kernel with you.
I'd be happy to read your comments or suggestions (this is still Python/pandas beginner's code).

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

import os
import warnings  
print(os.listdir("../input"))

['dipole_moments.csv', 'potential_energy.csv', 'magnetic_shielding_tensors.csv', 'structures.csv', 'scalar_coupling_contributions.csv', 'structures', 'sample_submission.csv', 'mulliken_charges.csv', 'test.csv', 'train.csv']


# Structures data

In [2]:
structures = pd.read_csv('../input/structures.csv')

# Debugging aid: only the first DEBUG_ROWS rows are kept so the following
# cells run quickly. Set DEBUG_ROWS = None to process the whole dataset.
DEBUG_ROWS = 100
if DEBUG_ROWS is not None:
    # .copy() gives an independent frame, so the columns added in later cells
    # are not written into a slice of the original DataFrame
    structures = structures.head(n=DEBUG_ROWS).copy()

structures.head(n=10)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602


The nn_features() function extracts the atom names, distances and coordinates of the k nearest neighbors. I'm using 4 neighbors.
Because my primary skills are SQL and I'm fairly new to Python and pandas, I've tried to build the function so that it can be used in a SQL-like "..nn_features() over(partition by molecule_name)" manner. I therefore used pandas' groupby(...).transform to pass the row indexes of each molecule to the function, which then looks up the molecule's atoms. If you know of a better or faster way please let me know.

In [3]:

def nn_features(l, n_neighbors=4):
    ''' Nearest-neighbor and PCA features for every atom of one molecule.

    Meant to be used as
    ``structures.groupby('molecule_name')[col].transform(nn_features)``.
    Only the index of ``l`` is used: it selects the molecule's rows in the
    module-level ``structures`` DataFrame (columns 'atom', 'x', 'y', 'z').

    l: indexed pd.Series of a molecule (its values are ignored)
    n_neighbors: number of nearest neighbors described per atom (default 4)

    Returns a list with one 1-D array per atom. With n = n_neighbors the
    layout of each array is
        [0   : n  ]   atom names of the n nearest neighbors
        [n   : 2n ]   euclidean distances to these neighbors
        [2n  : 3n ]   x coordinates of these neighbors
        [3n  : 4n ]   y coordinates of these neighbors
        [4n  : 5n ]   z coordinates of these neighbors
        [5n  : 5n+2]  2-component PCA projection of the atom's own position
    A neighbor that does not exist (molecule has fewer than n+1 atoms) is
    encoded as name 'N/A', distance 0 and coordinates 0 -- also for a
    single-atom molecule. For a single-atom molecule the PCA is skipped and
    both PCA entries are set to 999.
    '''
    n_atoms = l.size

    # number of nearest neighbors +1: column 0 of the kneighbors() result is
    # treated as the atom itself and dropped below
    k = n_neighbors + 1

    # lookup atom names and coordinates of the molecule in a single pass
    mol = structures.loc[l.index, ['atom', 'x', 'y', 'z']]
    coord = mol[['x', 'y', 'z']].values

    # NN calculations
    nbrs = NearestNeighbors(n_neighbors=min(n_atoms, k), algorithm='ball_tree').fit(coord)
    distances, indices = nbrs.kneighbors(coord)

    # NN id and NN distance, always n_neighbors columns wide.
    # Slots without a real neighbor keep the sentinel index n_atoms / distance 0.
    found = indices.shape[1] - 1
    nn_idx = np.full((n_atoms, n_neighbors), n_atoms, dtype=indices.dtype)
    nn_dst = np.zeros((n_atoms, n_neighbors))
    nn_idx[:, :found] = indices[:, 1:]
    nn_dst[:, :found] = distances[:, 1:]

    # LookUps for atom name and x,y,z; the extra last entry (position n_atoms)
    # holds the default value N/A or 0 used for missing neighbors
    lu_atom = np.append(np.array(mol['atom']), 'N/A')
    lu_coord = np.vstack([coord, np.zeros((1, 3))])

    # for each nn look up atom name and coordinates
    atm = lu_atom[nn_idx]
    nn_xyz = lu_coord[nn_idx]  # shape (n_atoms, n_neighbors, 3)

    if n_atoms > 1:
        # PCA - not relevant for nn, but nice feature anyway
        p = PCA(n_components=2).fit_transform(coord)
    else:
        # molecule contains only 1 atom (e.g. while debugging a small dataset)
        p = np.full((1, 2), 999.0)

    # put together atom names, distances, coordinates of nearest neighbors and pca
    out = np.concatenate(
        [atm, nn_dst, nn_xyz[:, :, 0], nn_xyz[:, :, 1], nn_xyz[:, :, 2], p],
        axis=1)

    return list(out)



For the whole *structures* dataset it takes about 12 minutes to calculate the features of 4 nearest neighbors.

In [4]:
%%time

warnings.filterwarnings('ignore')

structures['nearestn'] = structures.groupby('molecule_name')['x'].transform(nn_features)

structures.head(n=10)
#11mi 12s

CPU times: user 224 ms, sys: 12 ms, total: 236 ms
Wall time: 346 ms


Unnamed: 0,molecule_name,atom_index,atom,x,y,z,nearestn
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,"[H, H, H, H, 1.0919463791331034, 1.09194754111..."
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,"[C, H, H, H, 1.0919530596119005, 1.78311975603..."
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,"[C, H, H, H, 1.0919516185813627, 1.78311975603..."
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,"[C, H, H, H, 1.0919463791331034, 1.78314749640..."
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,"[C, H, H, H, 1.0919475411120265, 1.78314787222..."
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,"[H, H, H, N/A, 1.0171871876583656, 1.017190026..."
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,"[N, H, H, N/A, 1.0171900266331495, 1.618522750..."
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,"[N, H, H, N/A, 1.0171871876583656, 1.618522750..."
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,"[N, H, H, N/A, 1.0172079061723844, 1.618705586..."
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,"[H, H, N/A, N/A, 0.9621068124142939, 0.9621068..."


Split the list of nn features. (30 sec)

In [5]:
%%time

# atom name of nn
structures['nn_1'] = structures['nearestn'].apply(lambda x: x[0])
structures['nn_2'] = structures['nearestn'].apply(lambda x: x[1])
structures['nn_3'] = structures['nearestn'].apply(lambda x: x[2])
structures['nn_4'] = structures['nearestn'].apply(lambda x: x[3])

# eucledian distances to nn
structures['nn_1_dist'] = structures['nearestn'].apply(lambda x: x[4])
structures['nn_2_dist'] = structures['nearestn'].apply(lambda x: x[5])
structures['nn_3_dist'] = structures['nearestn'].apply(lambda x: x[6])
structures['nn_4_dist'] = structures['nearestn'].apply(lambda x: x[7])

# x,y,z distances to nn
structures['nn_dx_1'] = structures['nearestn'].apply(lambda x: x[8])  - structures['x']
structures['nn_dx_2'] = structures['nearestn'].apply(lambda x: x[9])  - structures['x']
structures['nn_dx_3'] = structures['nearestn'].apply(lambda x: x[10])  - structures['x']
structures['nn_dx_4'] = structures['nearestn'].apply(lambda x: x[11])  - structures['x']

structures['nn_dy_1'] = structures['nearestn'].apply(lambda x: x[12])  - structures['y']
structures['nn_dy_2'] = structures['nearestn'].apply(lambda x: x[13])  - structures['y']
structures['nn_dy_3'] = structures['nearestn'].apply(lambda x: x[14])  - structures['y']
structures['nn_dy_4'] = structures['nearestn'].apply(lambda x: x[15])  - structures['y']

structures['nn_dz_1'] = structures['nearestn'].apply(lambda x: x[16])  - structures['z']
structures['nn_dz_2'] = structures['nearestn'].apply(lambda x: x[17])  - structures['z']
structures['nn_dz_3'] = structures['nearestn'].apply(lambda x: x[18])  - structures['z']
structures['nn_dz_4'] = structures['nearestn'].apply(lambda x: x[19])  - structures['z']

# 2 dim pca
structures['pca_x'] = structures['nearestn'].apply(lambda x: x[20])
structures['pca_y'] = structures['nearestn'].apply(lambda x: x[21])

structures = structures.drop(columns='nearestn',axis=0)
structures.head(n=10)

CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 33.5 ms


Unnamed: 0,molecule_name,atom_index,atom,x,y,z,nn_1,nn_2,nn_3,nn_4,nn_1_dist,nn_2_dist,nn_3_dist,nn_4_dist,nn_dx_1,nn_dx_2,nn_dx_3,nn_dx_4,nn_dy_1,nn_dy_2,nn_dy_3,nn_dy_4,nn_dz_1,nn_dz_2,nn_dz_3,nn_dz_4,pca_x,pca_y
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,H,H,H,H,1.091946,1.091948,1.091952,1.091953,-0.528117,-0.511115,1.024429,0.014849,0.361722,0.352128,0.377947,-1.091835,-0.884645,0.898396,-0.007724,-0.006025,-1.166623e-05,-1.292509e-07
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,C,H,H,H,1.091953,1.78312,1.783147,1.783157,-0.014849,1.00958,-0.542965,-0.525964,1.091835,1.469782,1.453558,1.443964,0.006025,-0.0017,-0.87862,0.904421,0.6178603,-0.2616935
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,C,H,H,H,1.091952,1.78312,1.783148,1.783158,-1.024429,-1.00958,-1.535544,-1.552546,-0.377947,-1.469782,-0.025819,-0.016225,0.007724,0.0017,0.906121,-0.87692,0.6428857,0.2485599
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,C,H,H,H,1.091946,1.783147,1.783148,1.783158,0.528117,0.542965,0.017001,1.552546,-0.361722,-1.453558,-0.009594,0.016225,0.884645,0.87862,1.783041,0.87692,-0.6363219,-0.8476758
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,C,H,H,H,1.091948,1.783148,1.783148,1.783157,0.511115,-0.017001,1.535544,0.525964,-0.352128,0.009594,0.025819,-1.443964,-0.898396,-1.783041,-0.906121,-0.904421,-0.6244125,0.8608096
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,H,H,H,,1.017187,1.01719,1.017208,0.0,0.956215,0.057684,-0.479852,0.040426,0.334637,-1.011563,0.319424,-1.024108,-0.091322,-0.089941,-0.838106,-0.062564,4.726442e-05,-2.000778e-07
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,N,H,H,,1.01719,1.618523,1.61871,0.0,-0.057684,0.898532,-0.537535,-0.017257,1.011563,1.3462,1.330987,-0.012545,0.089941,-0.001381,-0.748165,0.027377,-0.4755047,-0.8044777
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,N,H,H,,1.017187,1.618523,1.618706,0.0,-0.956215,-0.898532,-1.436067,-0.915789,-0.334637,-1.3462,-0.015213,-1.358745,0.091322,0.001381,-0.746785,0.028758,-0.4590756,0.8139617
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,N,H,H,,1.017208,1.618706,1.61871,0.0,0.479852,1.436067,0.537535,0.520278,-0.319424,0.015213,-1.330987,-1.343532,0.838106,0.746785,0.748165,0.775543,0.9345331,-0.009483811
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,H,H,,,0.962107,0.962107,0.0,0.0,0.099127,0.906151,0.03436,0.03436,-0.956967,0.323253,-0.97754,-0.97754,-0.006067,-0.006908,-0.007602,-0.007602,-4.240387e-11,0.3961406
