# II. Creating the train and test data 

Merges the previously created ACSF structures into the train and test datasets, once for each of the two atoms of every coupling pair.

In [1]:
import pandas as pd
import numpy as np
import time, copy
import matplotlib.pyplot as plt
import os

In [2]:
# Read the four competition CSVs from one folder. The generator is consumed in
# order by the tuple unpacking, so the files are read in the order listed.
# NOTE(review): `mulliken` and `scalar_coupling` are only down-cast later in the
# cells visible here and never merged -- confirm they are still needed.
file_folder = '../input/champs-scalar-coupling'
train, test, mulliken, scalar_coupling = (
    pd.read_csv(f'{file_folder}/{stem}.csv')
    for stem in ('train', 'test', 'mulliken_charges', 'scalar_coupling_contributions')
)

In [3]:
# Load the previously created ACSF structures; later cells join this frame on
# 'molecule_name' and 'atom_index'.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from
# a trusted source.
acsf_folder = '../input/acsfstructures'
acsf_structures = pd.read_pickle(f'{acsf_folder}/acsf.pkl')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Down-cast the numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place and the same object is returned, so callers
    may either use the return value or keep using the frame they passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64/float16/float32/float64 columns are
        inspected. Columns of any other dtype are left untouched.
    verbose : bool, default True
        When true, print the resulting memory footprint and the relative saving.
    use_float16 : bool, default True
        When true (the historical behaviour), float columns whose range fits
        are stored as float16. float16 carries an 11-bit significand (roughly
        three significant decimal digits), so values are rounded; pass False
        to make float32 the narrowest float target.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns down-cast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate targets, narrowest first; the first one whose range covers the
    # column is used.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                # Inclusive bounds: a value equal to the dtype's limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # No narrower float fits (this includes a NaN min/max, for which
                # every comparison is false), so fall back to float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a division by zero should the frame report no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Down-cast every loaded frame, in the order listed. reduce_mem_usage returns the
# frame it was given, so each df_* name refers to the same object as its source.
df_acsf, df_train, df_test, df_mulliken, df_scalar_coupling = map(
    reduce_mem_usage,
    (acsf_structures, train, test, mulliken, scalar_coupling),
)
print(train.shape, test.shape)

Mem. usage decreased to 515.11 Mb (73.7% reduction)
Mem. usage decreased to 106.62 Mb (50.0% reduction)
Mem. usage decreased to 52.57 Mb (45.0% reduction)
Mem. usage decreased to 16.09 Mb (54.2% reduction)
Mem. usage decreased to 115.50 Mb (59.4% reduction)
(4658147, 6) (2505542, 5)


In [5]:
# Attach the ACSF columns of both atoms of each coupling pair to the training rows;
# the second merge suffixes the overlapping ACSF columns with '_0' / '_1'.
# The chain starts from `train` -- the very frame `df_train` was bound to before this
# cell -- instead of from `df_train` itself, so re-running the cell does not merge the
# ACSF columns a second time.
# validate='many_to_one' makes pandas raise if (molecule_name, atom_index) is not
# unique in df_acsf, which would otherwise silently duplicate coupling rows.
# NOTE(review): the inner join drops pairs that have no ACSF row -- confirm intended.
df_train = (
    train
    .merge(df_acsf, how='inner',
           left_on=['molecule_name', 'atom_index_0'],
           right_on=['molecule_name', 'atom_index'],
           validate='many_to_one')
    .drop(columns=['atom_index', 'atom'])
    .merge(df_acsf, how='inner',
           left_on=['molecule_name', 'atom_index_1'],
           right_on=['molecule_name', 'atom_index'],
           suffixes=('_0', '_1'),
           validate='many_to_one')
    .drop(columns=['atom_index', 'atom'])
)
df_train = reduce_mem_usage(df_train)
df_train.head()

Mem. usage decreased to 2025.71 Mb (0.0% reduction)


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,g1_0,g2_2_0.01_0,g4_2_0.01_1_1_0,g4_2_0.01_1_8_0,...,g4_1_0.2_0.5_1_1,g4_1_0.2_0.5_8_1,g4_1_0.2_0.5_16_1,g2_1.5_0.2_1,g4_1.5_0.2_1_1_1,g4_1.5_0.2_1_8_1,g4_1.5_0.2_1_16_1,g4_1.5_0.2_0.5_1_1,g4_1.5_0.2_0.5_8_1,g4_1.5_0.2_0.5_16_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,0.763672,0.759277,0.241821,0.121094,...,0.194336,0.000424,3.576279e-07,1.955078,0.162354,7.4e-05,0.0,0.202881,0.000443,4.172325e-07
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,0.763672,0.759277,0.241821,0.121155,...,0.194336,0.000424,3.576279e-07,1.955078,0.162354,7.4e-05,0.0,0.202881,0.000443,4.172325e-07
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,0.763672,0.759277,0.241821,0.121094,...,0.194336,0.000424,3.576279e-07,1.955078,0.162354,7.4e-05,0.0,0.202881,0.000443,4.172325e-07
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,0.763672,0.759277,0.241821,0.121094,...,0.194336,0.000424,3.576279e-07,1.955078,0.162354,7.4e-05,0.0,0.202881,0.000443,4.172325e-07
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,0.763672,0.759277,0.241821,0.121094,...,0.16748,0.014214,0.0008544922,0.743164,0.226562,0.113464,0.052307,0.176025,0.014885,0.000893116


In [6]:
# Attach the ACSF columns of both atoms of each coupling pair to the test rows;
# the second merge suffixes the overlapping ACSF columns with '_0' / '_1'.
# The chain starts from `test` -- the very frame `df_test` was bound to before this
# cell -- instead of from `df_test` itself, so re-running the cell does not merge the
# ACSF columns a second time.
# validate='many_to_one' makes pandas raise if (molecule_name, atom_index) is not
# unique in df_acsf, which would otherwise silently duplicate coupling rows.
# NOTE(review): the inner join drops pairs that have no ACSF row; for the test set
# that would remove rows to be predicted -- confirm every pair is matched.
df_test = (
    test
    .merge(df_acsf, how='inner',
           left_on=['molecule_name', 'atom_index_0'],
           right_on=['molecule_name', 'atom_index'],
           validate='many_to_one')
    .drop(columns=['atom_index', 'atom'])
    .merge(df_acsf, how='inner',
           left_on=['molecule_name', 'atom_index_1'],
           right_on=['molecule_name', 'atom_index'],
           suffixes=('_0', '_1'),
           validate='many_to_one')
    .drop(columns=['atom_index', 'atom'])
)
df_test = reduce_mem_usage(df_test)
df_test.head()

Mem. usage decreased to 1084.82 Mb (0.0% reduction)


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,g1_0,g2_2_0.01_0,g4_2_0.01_1_1_0,g4_2_0.01_1_8_0,g4_2_0.01_1_16_0,...,g4_1_0.2_0.5_1_1,g4_1_0.2_0.5_8_1,g4_1_0.2_0.5_16_1,g2_1.5_0.2_1,g4_1.5_0.2_1_1_1,g4_1.5_0.2_1_8_1,g4_1.5_0.2_1_16_1,g4_1.5_0.2_0.5_1_1,g4_1.5_0.2_0.5_8_1,g4_1.5_0.2_0.5_16_1
0,4658147,dsgdb9nsd_000004,2,0,2JHC,0.526855,0.522461,0.0,0.0,0.0,...,0.0,0.0,0.0,0.928711,0.0,0.0,0.0,0.0,0.0,0.0
1,4658150,dsgdb9nsd_000004,3,0,1JHC,0.526855,0.522461,0.0,0.0,0.0,...,0.0,0.0,0.0,0.928711,0.0,0.0,0.0,0.0,0.0,0.0
2,4658148,dsgdb9nsd_000004,2,1,1JHC,0.526855,0.522461,0.0,0.0,0.0,...,0.0,0.0,0.0,0.928711,0.0,0.0,0.0,0.0,0.0,0.0
3,4658151,dsgdb9nsd_000004,3,1,2JHC,0.526855,0.522461,0.0,0.0,0.0,...,0.0,0.0,0.0,0.928711,0.0,0.0,0.0,0.0,0.0,0.0
4,4658149,dsgdb9nsd_000004,2,3,3JHH,0.526855,0.522461,0.0,0.0,0.0,...,0.0,0.0,0.0,0.506836,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Persist the merged train and test frames as pickle files, using paths relative
# to the current working directory.
df_train.to_pickle('df_train_new.pkl')
df_test.to_pickle('df_test_new.pkl')