In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from IPython.display import display

data_path = '../Molecular_Properties'

In [3]:
files_names = !ls $data_path/*.csv
files_names

['../Molecular_Properties/dipole_moments.csv',
 '../Molecular_Properties/magnetic_shielding_tensors.csv',
 '../Molecular_Properties/mulliken_charges.csv',
 '../Molecular_Properties/potential_energy.csv',
 '../Molecular_Properties/sample_submission.csv',
 '../Molecular_Properties/scalar_coupling_contributions.csv',
 '../Molecular_Properties/structures.csv',
 '../Molecular_Properties/test.csv',
 '../Molecular_Properties/train.csv']

In [4]:
data_dict = {}

for name in files_names:
    data_dict[name.split('/')[-1][:-4]] = pd.read_csv(name)

In [5]:
df_complete = data_dict['train'].copy()
df_complete = df_complete.join(data_dict['potential_energy'].set_index('molecule_name'), on='molecule_name')
df_complete = df_complete.join(data_dict['dipole_moments'].set_index('molecule_name'), on='molecule_name', lsuffix='dipole_moments_')
df_complete = df_complete.join(data_dict['magnetic_shielding_tensors'].set_index(['molecule_name', 'atom_index']), on=['molecule_name', 'atom_index_0'], lsuffix='_atom0')
df_complete = df_complete.join(data_dict['magnetic_shielding_tensors'].set_index(['molecule_name', 'atom_index']), on=['molecule_name', 'atom_index_1'], lsuffix='_atom1')
df_complete = df_complete.join(data_dict['mulliken_charges'].set_index(['molecule_name', 'atom_index']), on=['molecule_name', 'atom_index_0'], lsuffix='_atom0')
df_complete = df_complete.join(data_dict['mulliken_charges'].set_index(['molecule_name', 'atom_index']), on=['molecule_name', 'atom_index_1'], lsuffix='_atom1')
df_complete = df_complete.join(data_dict['scalar_coupling_contributions'].set_index(['molecule_name', 'atom_index_0', 'atom_index_1']), on=['molecule_name', 'atom_index_0', 'atom_index_1'], rsuffix='_scc')
df_complete = df_complete.join(data_dict['structures'].set_index(['molecule_name', 'atom_index']), on=['molecule_name', 'atom_index_0'], lsuffix='_atom0_structure')
df_complete = df_complete.join(data_dict['structures'].set_index(['molecule_name', 'atom_index']), on=['molecule_name', 'atom_index_1'], lsuffix='_atom1_structure')

In [8]:
df= df_complete.drop(['id'], axis=1)
len(df)

4658147

In [7]:
df_subsample=  df.sample(frac=0.1, random_state=1)
len(df_subsample)

465815

In [13]:
features= df_subsample[['fc', 'pso', 'sd', 'dso', 'mulliken_charge_atom1', 'YY', 'XX_atom1', 'XX', 'ZZ', 'YY_atom1',
                       'mulliken_charge', 'potential_energy', 'ZZ_atom1', 'Z']]

In [14]:
features

Unnamed: 0,fc,pso,sd,dso,mulliken_charge_atom1,YY,XX_atom1,XX,ZZ,YY_atom1,mulliken_charge,potential_energy,ZZ_atom1,Z
2557043,2.348940,-0.033082,-0.034241,0.092030,0.232601,180.3960,29.5177,181.4930,182.7930,33.5564,-0.508960,-435.3620,25.1699,1.5065
1571490,-9.521030,1.903460,0.374256,-2.072480,0.116264,30.6437,26.8083,28.6370,28.7841,26.9144,0.124500,-421.9332,33.7409,1.1443
3815279,2.822460,0.307475,-0.062896,-0.553075,0.114498,102.3440,28.5014,94.3560,120.7070,30.7883,0.052715,-459.0387,22.9905,2.3221
3327074,0.365051,0.194738,-0.024773,-0.268798,0.096038,97.0485,30.2074,104.8590,105.2960,23.2356,0.270692,-460.3333,25.4791,1.5971
842927,7.688050,0.437421,0.111036,-0.451560,0.111746,27.0696,35.8985,35.8979,29.9001,27.0690,0.111749,-418.1772,29.9001,-0.0005
2140407,3.255910,0.329974,-0.018175,-0.392163,0.084225,102.3000,31.1637,131.5510,73.7504,31.2623,0.409009,-384.7619,26.2865,-1.2719
3680350,6.403150,0.204172,0.008111,-0.383134,0.254727,-73.1443,27.3955,8.2437,68.7723,31.7620,0.163884,-437.9416,33.3706,1.5549
2138509,103.204000,-0.271135,0.326960,1.176050,0.094574,111.7740,27.5733,112.7430,107.9920,26.5807,-0.195095,-400.7982,28.6064,1.1448
764496,-14.941700,2.464970,0.364028,-2.744870,0.145777,28.1549,30.8524,30.7456,30.3039,27.3038,0.145833,-364.9921,31.0117,-0.0023
3520100,6.724470,0.175627,-0.091349,-0.411963,0.093734,103.4430,28.2487,-6.7723,93.6850,34.1253,-0.061302,-388.5033,26.3072,0.6673


In [15]:
target= df_subsample[['scalar_coupling_constant']]
target

Unnamed: 0,scalar_coupling_constant
2557043,2.373650
1571490,-9.315790
3815279,2.513960
3327074,0.266218
842927,7.784940
2140407,3.175540
3680350,6.232300
2138509,104.436000
764496,-14.857600
3520100,6.396780
