# Exploring dataset

## Import datatset 

Major csv files from this competiton are train.csv, test.csv, sample_submission.csv, structures.csv. Thus, we will explore these files first.
    - train.csv: the training set
    - test.csv: the testing set
    - sample_submission.csv: a sample submission file in the correct format
    - structures.csv: molecular structure file, X Y Z
    

In [50]:
# import basic packages for ML
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt


In [65]:
# read all the major csv files
folder_name = 'champs-scalar-coupling/'
train = pd.read_csv(folder_name + 'train.csv')
test = pd.read_csv(folder_name + 'test.csv')
sample_submission = pd.read_csv(folder_name + 'sample_submission.csv')
structures = pd.read_csv(folder_name + 'structures.csv')

Additional dataset contains the following csv files:
    - dipole_moments.csv: contains the molecular electric dipole moments
    - magnetic_shielding_tensors.csv: contains the magnetic shielding tensors for all atoms in the molecules
    - mulliken_charges.csv: contains the mulliken charges for all atoms in the molecules.
    - potential_energy.csv: contains the potential energy of the molecules
    - scalar_coupling_contributions.csv: contains four types of contributions

In [66]:
# read additional files
dipole_moments = pd.read_csv(folder_name + 'dipole_moments.csv')
mag_shield_tensors = pd.read_csv(folder_name +
                                 'magnetic_shielding_tensors.csv')
mull_charges = pd.read_csv(folder_name + 'mulliken_charges.csv')
pot_energy = pd.read_csv(folder_name + 'potential_energy.csv')
sca_cou_contri = pd.read_csv(folder_name + 'scalar_coupling_contributions.csv')

## Merge datasets

In [67]:
# drop id column in train df
train.drop('id', axis=1, inplace=True)

In [68]:
# create distance features in dipole_moments
dipole_moments['distance'] = dipole_moments['X']+dipole_moments['Y']+dipole_moments['Z']
dipole_moments['distance_2'] = dipole_moments['distance'].apply(lambda x: x**2)

In [69]:
# merge train/test and structures with atom index
def map_atom_info(df, other_df, other_df_name, atom_idx):
    # merge train and structures
    if other_df_name == 'structures':
        df = pd.merge(df,
                      other_df,
                      how='left',
                      left_on=['molecule_name', f'atom_index_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])
        
        df = df.drop('atom_index', axis=1)
        
        df = df.rename(
            columns={
                'atom': f'atom_{atom_idx}',
                'x': f'x_{atom_idx}',
                'y': f'y_{atom_idx}',
                'z': f'z_{atom_idx}'
            })
    # merge train and mag_shield_tensors
    elif other_df_name == 'mag_shield_tensors':
        other_df = other_df.drop(['YX', 'ZX', 'ZY'], axis=1)
        df = pd.merge(df,
                      other_df,
                      how='left',
                      left_on=['molecule_name', f'atom_index_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])
        df.drop('atom_index', axis=1, inplace=True)
        df = df.rename(
            columns={
                'atom': f'atom_{atom_idx}',
                'XX': f'XX_{atom_idx}',
                'YY': f'YY_{atom_idx}',
                'ZZ': f'ZZ_{atom_idx}',
                'XY': f'XY_{atom_idx}',
                'YZ': f'YZ_{atom_idx}',
            })
    # merge train and mulliken_charges
    elif other_df_name == 'mull_charges':
        df = pd.merge(df,
                     other_df,
                     how='left',
                     left_on=['molecule_name', f'atom_index_{atom_idx}'],
                     right_on=['molecule_name', 'atom_index'])
        df.drop('atom_index', axis=1, inplace=True)
        df = df.rename(
            columns = {'mulliken_charge': f'mulliken_charge_{atom_idx}'}
        )
    
    else:
        print(f'{other_df} dataframe is not found')
        
    return df


In [70]:
# start merging
train = map_atom_info(train, structures, 'structures', 0)
train = map_atom_info(train, structures, 'structures', 1)
train = map_atom_info(train, mag_shield_tensors, 'mag_shield_tensors', 0)
train = map_atom_info(train, mag_shield_tensors, 'mag_shield_tensors', 1)
train = map_atom_info(train, mull_charges, 'mull_charges', 0)
train = map_atom_info(train, mull_charges, 'mull_charges', 0)

test = map_atom_info(test, structures, 'structures', 0)
test = map_atom_info(test, structures, 'structures', 1)
test = map_atom_info(test, mag_shield_tensors, 'mag_shield_tensors', 0)
test = map_atom_info(test, mag_shield_tensors, 'mag_shield_tensors', 1)
test = map_atom_info(test, mull_charges, 'mull_charges', 0)
test = map_atom_info(test, mull_charges, 'mull_charges', 1)

In [71]:
# merge train/test and structures without atom index
def merge_other_df(df, other_df, other_df_name):
    # merge train and pot_energy
    if other_df_name == 'pot_energy':
        df = pd.merge(df,
                     other_df,
                     how='left',
                     on='molecule_name')
    # merge train and dipole_moments
    elif other_df_name == 'dipole_moments':
        df = pd.merge(df,
                     other_df,
                     how='left',
                     on='molecule_name')
    else:
        print(f'{other_df} dataframe is not found')
        
    return df

In [72]:
# continue merging
train = merge_other_df(train, pot_energy, 'pot_energy')
train = merge_other_df(train, dipole_moments, 'dipole_moments')

test = merge_other_df(test, pot_energy, 'pot_energy')
test = merge_other_df(test, dipole_moments, 'dipole_moments')

# Data preprocessing

## exploring data insight

In [79]:
train.corr()['scalar_coupling_constant']

atom_index_0                0.018839
atom_index_1               -0.218989
scalar_coupling_constant    1.000000
x_0                        -0.006015
y_0                        -0.013119
z_0                         0.004807
x_1                         0.005132
y_1                         0.021189
z_1                        -0.007792
XX_0                       -0.024919
XY_0                       -0.000947
YY_0                       -0.042562
XZ_x                       -0.002161
YZ_0                        0.000404
ZZ_0                       -0.054809
XX_1                        0.223554
XY_1                        0.000740
YY_1                        0.221592
XZ_y                        0.000188
YZ_1                       -0.001670
ZZ_1                        0.237707
mulliken_charge_0          -0.025444
mulliken_charge_0          -0.025444
potential_energy           -0.017187
X                           0.000101
Y                           0.000315
Z                           0.002648
d

In [80]:
train.describe()

Unnamed: 0,atom_index_0,atom_index_1,scalar_coupling_constant,x_0,y_0,z_0,x_1,y_1,z_1,XX_0,...,YZ_1,ZZ_1,mulliken_charge_0,mulliken_charge_0.1,potential_energy,X,Y,Z,distance,distance_2
count,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,...,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0
mean,13.35689,5.883966,15.92165,0.1058781,-0.2203691,0.04712888,0.09757184,-0.3956294,0.07768257,29.62281,...,0.2856011,111.0627,0.1221098,0.1221098,-404.2052,-0.01881619,0.08021838,0.2280645,0.2894667,8.073935
std,3.267712,4.993943,34.94198,1.759873,2.121789,1.564513,1.479416,1.767196,1.25591,3.03305,...,17.66086,58.16133,0.04474921,0.04474921,37.15726,2.12338,1.613989,1.015052,2.826685,15.38668
min,0.0,0.0,-36.2186,-9.234889,-9.49416,-9.134765,-9.234889,-9.254405,-8.789131,9.7778,...,-266.099,-548.043,-0.030867,-0.030867,-714.6262,-22.958,-9.2523,-6.0285,-23.3783,0.0
25%,11.0,2.0,-0.254978,-0.9530924,-1.836966,-0.9642128,-0.7927602,-1.607567,-0.6696343,27.61585,...,-4.4543,57.28035,0.098397,0.098397,-424.4322,-1.1822,-0.8477,-0.3028,-1.4471,0.5470082
50%,13.0,5.0,2.28113,0.1341212,-0.2979936,0.004972893,0.04660419,-0.5081915,0.01549763,29.2507,...,0.0003,127.238,0.113503,0.113503,-403.2927,-0.0001,0.0632,0.1219,0.1426,2.947746
75%,16.0,8.0,7.390655,1.227746,1.753418,1.035102,0.968224,0.2882334,0.8926133,31.2162,...,4.8144,153.44,0.12879,0.12879,-385.9742,1.1369,1.0674,0.8839,1.9749,9.588932
max,28.0,28.0,204.88,9.38224,9.714469,7.637578,8.420893,8.653665,7.460225,47.9444,...,298.117,339.694,0.403094,0.403094,-40.52368,21.8738,13.0056,6.8801,21.5378,546.5449


In [81]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4658147 entries, 0 to 4658146
Data columns (total 33 columns):
molecule_name               object
atom_index_0                int64
atom_index_1                int64
type                        object
scalar_coupling_constant    float64
atom_0                      object
x_0                         float64
y_0                         float64
z_0                         float64
atom_1                      object
x_1                         float64
y_1                         float64
z_1                         float64
XX_0                        float64
XY_0                        float64
YY_0                        float64
XZ_x                        float64
YZ_0                        float64
ZZ_0                        float64
XX_1                        float64
XY_1                        float64
YY_1                        float64
XZ_y                        float64
YZ_1                        float64
ZZ_1                        flo

In [83]:
train.type.value_counts()

3JHC    1510379
2JHC    1140674
1JHC     709416
3JHH     590611
2JHH     378036
3JHN     166415
2JHN     119253
1JHN      43363
Name: type, dtype: int64

In [86]:
train.atom_0.value_counts()

H    4658147
Name: atom_0, dtype: int64

In [87]:
train.atom_1.value_counts()

C    3360469
H     968647
N     329031
Name: atom_1, dtype: int64

## Reduce memory usage helper function

In [96]:
# a helper method that reduce the memory usage of dataframe
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                        np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
                        np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
                        np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
                        np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(
                        np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) /
                                        start_mem))

    return df

## feature engineering

In [89]:
# create distance feature
train_p_0 = train[['x_0', 'y_0', 'z_0']].values
train_p_1 = train[['x_1', 'y_1', 'z_1']].values
test_p_0 = test[['x_0', 'y_0', 'z_0']].values
test_p_1 = test[['x_1', 'y_1', 'z_1']].values

train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
train['dist_x'] = (train['x_0'] - train['x_1']) ** 2
test['dist_x'] = (test['x_0'] - test['x_1']) ** 2
train['dist_y'] = (train['y_0'] - train['y_1']) ** 2
test['dist_y'] = (test['y_0'] - test['y_1']) ** 2
train['dist_z'] = (train['z_0'] - train['z_1']) ** 2
test['dist_z'] = (test['z_0'] - test['z_1']) ** 2

In [90]:
train['type_0'] = train['type'].apply(lambda x: x[0])
test['type_0'] = test['type'].apply(lambda x: x[0])

In [102]:
# create all other features
def create_features(df):
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    
    df[f'molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df[f'molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df[f'molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df[f'molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df[f'molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df[f'molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df[f'molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df[f'molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df[f'molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df[f'molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df[f'molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df[f'molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df[f'molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df[f'molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df[f'molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df[f'molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df[f'molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df[f'molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    df[f'molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df[f'molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df[f'molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df[f'molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df[f'molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df[f'molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df[f'molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df[f'molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df[f'molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df[f'molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df[f'molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df[f'molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    df[f'molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df[f'molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df[f'molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df[f'molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    df[f'molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df[f'molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df[f'molecule_type_0_dist_std'] = df.groupby(['molecule_name', 'type_0'])['dist'].transform('std')
    df[f'molecule_type_0_dist_std_diff'] = df[f'molecule_type_0_dist_std'] - df['dist']
    df[f'molecule_type_dist_mean'] = df.groupby(['molecule_name', 'type'])['dist'].transform('mean')
    df[f'molecule_type_dist_mean_diff'] = df[f'molecule_type_dist_mean'] - df['dist']
    df[f'molecule_type_dist_mean_div'] = df[f'molecule_type_dist_mean'] / df['dist']
    df[f'molecule_type_dist_max'] = df.groupby(['molecule_name', 'type'])['dist'].transform('max')
    df[f'molecule_type_dist_min'] = df.groupby(['molecule_name', 'type'])['dist'].transform('min')
    df[f'molecule_type_dist_std'] = df.groupby(['molecule_name', 'type'])['dist'].transform('std')
    df[f'molecule_type_dist_std_diff'] = df[f'molecule_type_dist_std'] - df['dist']

    df = reduce_mem_usage(df)
    return df

In [103]:
train = create_features(train)
test = create_features(test)

Memory usage of dataframe is 2273.19 MB
molecule_name


TypeError: Categorical is not ordered for operation min
you can use .as_ordered() to change the Categorical to an ordered one


In [115]:
train_sample = train.sample(n=300, random_state=42).copy()

In [116]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        print(col_type)
        '''
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    '''
    return df

In [117]:
reduce_mem_usage(train_sample)

category
int8
int8
category
float16
category
float16
float16
float16
category
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
mulliken_charge_0    float64
mulliken_charge_0    float64
dtype: object
mulliken_charge_0    float64
mulliken_charge_0    float64
dtype: object
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
object
float64
float64
float64
float16
float16
float16
float16
float16
float16
float16
float16
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64


Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,...,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
3286384,dsgdb9nsd_095038,11,0,1JHC,79.625000,H,-0.638184,1.899414,0.757812,C,...,-0.273359,0.006351,-1.087661,1.093956,-0.000056,0.999949,1.105711,1.084384,0.006351,-1.087661
3155519,dsgdb9nsd_091960,16,5,1JHC,110.625000,H,-2.685547,-1.221680,1.714844,C,...,-0.282131,0.004489,-1.078917,1.088771,0.005365,1.004952,1.095119,1.083406,0.004489,-1.078917
1497715,dsgdb9nsd_048643,12,2,3JHC,6.546875,H,2.917969,4.507812,0.390869,C,...,-2.351632,0.315088,-2.825409,3.072028,-0.068469,0.978198,3.606908,2.444885,0.323257,-2.817241
2655333,dsgdb9nsd_079828,12,16,3JHH,3.429688,H,-3.070312,-0.687500,-1.369141,H,...,-2.337386,0.298854,-2.450160,2.760977,0.011962,1.004352,2.952060,2.622248,0.127672,-2.621342
741761,dsgdb9nsd_024353,9,1,2JHC,-1.414062,H,-0.509766,1.816406,0.966309,C,...,-1.343995,0.136403,-2.013728,2.145962,-0.004168,0.998061,2.238090,2.027098,0.069154,-2.080976
698328,dsgdb9nsd_022708,13,2,2JHC,-0.078430,H,-2.480469,-0.814941,-2.041016,C,...,-1.516618,0.214976,-2.049438,2.255301,-0.009112,0.995976,2.398589,1.933828,0.118588,-2.145826
3090235,dsgdb9nsd_090452,11,8,3JHC,0.926758,H,-0.618164,2.033203,0.779297,C,...,-2.320228,0.372504,-2.758722,3.219218,0.087992,1.028102,3.543291,2.630152,0.253626,-2.877599
2792519,dsgdb9nsd_083133,17,18,2JHH,-11.671875,H,-0.485107,-1.673828,2.650391,H,...,-1.284929,0.202534,-1.573847,1.770814,-0.005566,0.996866,1.778289,1.764171,0.005661,-1.770720
2735538,dsgdb9nsd_081823,15,6,3JHC,5.503906,H,2.507812,0.750488,-1.112305,C,...,-1.955621,0.359178,-2.390425,3.161430,0.411826,1.149777,3.600774,2.749603,0.290624,-2.458979
4626019,dsgdb9nsd_131714,11,3,3JHN,1.228516,H,0.627441,-0.457031,0.679199,N,...,-2.311295,0.323471,-2.795796,2.929751,-0.189515,0.939244,3.218040,2.523720,0.311491,-2.807775


In [110]:
train_sample.shape

(300, 88)