# Exploring dataset

The major csv files in this competition are train.csv, test.csv, sample_submission.csv and structures.csv, so we will explore these files first.
    - train.csv: the training set
    - test.csv: the testing set
    - sample_submission.csv: a sample submission file in the correct format
    - structures.csv: molecular structure file with the atom type and the x, y, z coordinates of each atom
    

In [50]:
# import basic packages for ML
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt


In [51]:
# load the four primary competition tables from the data folder
folder_name = 'champs-scalar-coupling/'
train = pd.read_csv(f'{folder_name}train.csv')
test = pd.read_csv(f'{folder_name}test.csv')
sample_submission = pd.read_csv(f'{folder_name}sample_submission.csv')
structures = pd.read_csv(f'{folder_name}structures.csv')

The additional data consists of the following csv files:
    - dipole_moments.csv: contains the molecular electric dipole moments
    - magnetic_shielding_tensors.csv: contains the magnetic shielding tensors for all atoms in the molecules
    - mulliken_charges.csv: contains the mulliken charges for all atoms in the molecules.
    - potential_energy.csv: contains the potential energy of the molecules
    - scalar_coupling_contributions.csv: contains the four types of contributions to the scalar coupling

In [52]:
# load the supplementary tables that live in the same data folder
dipole_moments = pd.read_csv(f'{folder_name}dipole_moments.csv')
mag_shield_tensors = pd.read_csv(f'{folder_name}magnetic_shielding_tensors.csv')
mull_charges = pd.read_csv(f'{folder_name}mulliken_charges.csv')
pot_energy = pd.read_csv(f'{folder_name}potential_energy.csv')
sca_cou_contri = pd.read_csv(
    f'{folder_name}scalar_coupling_contributions.csv'
)

## Merge datasets

In [53]:
# remove the 'id' column from the train dataframe (mutates train in place)
train.drop(columns=['id'], inplace=True)

In [54]:
# create distance features in dipole_moments
# NOTE(review): 'distance' is the plain sum X + Y + Z, not a Euclidean
# norm sqrt(X**2 + Y**2 + Z**2) -- confirm this is the intended feature.
dipole_moments['distance'] = (
    dipole_moments['X'] + dipole_moments['Y'] + dipole_moments['Z']
)
# vectorized square of the column instead of a per-element apply/lambda
dipole_moments['distance_2'] = dipole_moments['distance'] ** 2

In [55]:
# merge train/test and structures
def map_atom_info(df, other_df, other_df_name, atom_idx):
    """Left-merge per-atom columns from ``other_df`` onto ``df``.

    Rows are matched on ``molecule_name`` and on ``atom_index_{atom_idx}``
    in ``df`` against ``atom_index`` in ``other_df``. The helper
    ``atom_index`` column is dropped afterwards and the merged-in columns
    get an ``_{atom_idx}`` suffix, so the function can be called once per
    atom of a pair without column-name collisions.

    Args:
        df: dataframe with ``molecule_name`` and ``atom_index_{atom_idx}``
            columns.
        other_df: dataframe with ``molecule_name`` and ``atom_index``
            columns plus the per-atom columns to attach.
        other_df_name: which dataset ``other_df`` is; one of
            ``'structures'``, ``'mag_shield_tensors'`` or
            ``'mull_charges'``.
        atom_idx: atom position within the pair; selects the join column
            and is used as the suffix of the new columns.

    Returns:
        A new merged dataframe. For an unrecognized ``other_df_name`` a
        message is printed and ``df`` is returned unchanged.
    """
    if other_df_name == 'structures':
        suffixed = ['atom', 'x', 'y', 'z']
    elif other_df_name == 'mag_shield_tensors':
        # Only XX, XY, YY, XZ, YZ, ZZ are kept from the tensor columns;
        # drop returns a copy, so the caller's frame is left untouched.
        other_df = other_df.drop(['YX', 'ZX', 'ZY'], axis=1)
        # 'XZ' must be suffixed as well, otherwise the merge for the second
        # atom collides with the first and pandas emits XZ_x / XZ_y.
        # 'atom' is listed for parity with the structures case; rename
        # ignores labels that are not present.
        suffixed = ['atom', 'XX', 'YY', 'ZZ', 'XY', 'XZ', 'YZ']
    elif other_df_name == 'mull_charges':
        suffixed = ['mulliken_charge']
    else:
        # Report the dataset *name*, not the dataframe contents.
        print(f'{other_df_name} dataframe is not found')
        return df

    merged = pd.merge(df,
                      other_df,
                      how='left',
                      left_on=['molecule_name', f'atom_index_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])
    # atom_index duplicates atom_index_{atom_idx} after the join
    merged = merged.drop('atom_index', axis=1)
    return merged.rename(
        columns={col: f'{col}_{atom_idx}' for col in suffixed})


In [None]:
# start merging
# Dataset labels understood by map_atom_info, in merge order.
merge_dfs = ['structures', 'mag_shield_tensors', 'mull_charges']
# Each label must be paired with its own dataframe: map_atom_info joins on
# columns specific to that dataset (e.g. it drops YX/ZX/ZY for the tensors).
merge_sources = {
    'structures': structures,
    'mag_shield_tensors': mag_shield_tensors,
    'mull_charges': mull_charges,
}

# Attach the per-atom columns for both atoms (index 0 and 1) of every pair.
for other_df_name in merge_dfs:
    for atom_idx in (0, 1):
        train = map_atom_info(train, merge_sources[other_df_name],
                              other_df_name, atom_idx)

# NOTE(review): if the additional files (tensors, charges) only cover the
# training molecules, these test merges yield NaN columns -- confirm.
for other_df_name in merge_dfs:
    for atom_idx in (0, 1):
        test = map_atom_info(test, merge_sources[other_df_name],
                             other_df_name, atom_idx)