### Data loading and overview
This notebook loads the data and builds an initial understanding of it. It only considers data from the following CSV files: train, test and structures.


In [3]:
import os

# Folder containing the input data files, given relative to the working directory.
input_folder = './input'

# os.listdir returns entries in arbitrary order; sorting makes the displayed
# listing stable across runs, file systems and platforms.
sorted(os.listdir(input_folder))

['dipole_moments.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'sample_submission.csv',
 'scalar_coupling_contributions.csv',
 'structures.csv',
 'structures.zip',
 'test.csv',
 'train.csv']

In [5]:
import pandas as pd

# Read the three tables used in this notebook, in the order train, test, structures.
train, test, structures = (
    pd.read_csv(csv_path)
    for csv_path in (
        f'{input_folder}/train.csv',
        f'{input_folder}/test.csv',
        f'{input_folder}/structures.csv',
    )
)

In [8]:
# Dimensions of the training table as a (rows, columns) tuple; the bare
# f-string is the cell's last expression, so it is shown as the cell output.
f'Train shape: {train.shape}'

'Train shape: (4658147, 6)'

In [13]:
# Preview the first five rows of the training table (head() defaults to n=5).
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [9]:
# Dimensions of the test table as a (rows, columns) tuple; the bare
# f-string is the cell's last expression, so it is shown as the cell output.
f'Test shape: {test.shape}'

'Test shape: (2505542, 5)'

In [12]:
# Preview the first five rows of the test table (head() defaults to n=5).
test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type
0,4658147,dsgdb9nsd_000004,2,0,2JHC
1,4658148,dsgdb9nsd_000004,2,1,1JHC
2,4658149,dsgdb9nsd_000004,2,3,3JHH
3,4658150,dsgdb9nsd_000004,3,0,1JHC
4,4658151,dsgdb9nsd_000004,3,1,2JHC


In [10]:
# Dimensions of the structures table as a (rows, columns) tuple; the bare
# f-string is the cell's last expression, so it is shown as the cell output.
f'Structures shape: {structures.shape}'

'Structures shape: (2358657, 6)'

In [14]:
# Preview the first five rows of the structures table (head() defaults to n=5).
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [23]:
# Headline counts for the train/test tables.
print(f"There are {train['molecule_name'].nunique()} distinct molecules in train data.")
print(f"There are {test['molecule_name'].nunique()} distinct molecules in test data.")

# An atom index may occur in either column of a pair, so the number of distinct
# index values is the size of the union of both columns. Taking the larger of
# the two per-column counts only gives a lower bound on that number.
# dropna() keeps the same "ignore missing values" behaviour that nunique() has.
# NOTE(review): the indices appear to be positions within a molecule, so this
# counts distinct index values rather than distinct physical atoms; the message
# text is left unchanged so the recorded output still matches.
distinct_atom_indices = (
    set(train['atom_index_0'].dropna().unique())
    | set(train['atom_index_1'].dropna().unique())
)
print(f"There are {len(distinct_atom_indices)} unique atoms.")
print(f"There are {train['type'].nunique()} unique types.")

There are 85003 distinct molecules in train data.
There are 45772 distinct molecules in test data.
There are 29 unique atoms.
There are 8 unique types.


In [36]:
# Distinct molecule names found in each of the three tables.
train_molecule_set, test_molecule_set, struct_molecule_set = (
    set(frame['molecule_name'].unique())
    for frame in (train, test, structures)
)

# Molecules shared between train and test, and the molecules of each split
# that also occur in the structures table.
molecule_overlap_test_train = train_molecule_set & test_molecule_set
molecule_overlap_train_in_struct = train_molecule_set & struct_molecule_set
molecule_overlap_test_in_struct = test_molecule_set & struct_molecule_set

# Report the train/test overlap as a count and the structures coverage as a percentage.
print('Molecule overlap bewteen training and test data sets: {}'.format(
    len(molecule_overlap_test_train)
))
print('Train molecules described in structures file: {}%'.format(
    100 * len(molecule_overlap_train_in_struct) / len(train_molecule_set)
))
print('Test molecules described in structures file: {}%'.format(
    100 * len(molecule_overlap_test_in_struct) / len(test_molecule_set)
))

Molecule overlap bewteen training and test data sets: 0
Train molecules described in structures file: 100.0%
Test molecules described in structures file: 100.0%
