# Molecular chemistry with NN( Keras Multiple Output)

This notebook is prepared to generate more features

# 1. Load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from datetime import datetime
from tqdm import tqdm
from tqdm import tqdm_notebook
# %tensorflow_version 1.x # if you want newest  2.x  delete this line
# generally if you put this line somewher in the middle of the notebook there is need to restart the runtime

import tensorflow as tf
from keras.layers import Dense, Input, Activation
from keras.layers import BatchNormalization,Add,Dropout
from keras.optimizers import Adam
from keras.models import Model, load_model
from keras import callbacks
from keras import backend as K 
from keras.layers.advanced_activations import LeakyReLU
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)
import os
# %cd /kaggle/input/champs-scalar-coupling
# print(os.listdir("."))

# Any results you write to the current directory are saved as output.
print('Tensorflow  v', tf.__version__)

Tensorflow  v 2.4.1


In [2]:
import keras
print(keras.__version__)

2.4.3


## Mount google drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load standard snipet to prevent random disconnects

This cell runs JS code to automatic reconnect to runtime.

In [4]:
import IPython
from google.colab import output

display(IPython.display.Javascript('''
 function ClickConnect(){
   btn = document.querySelector("colab-connect-button")
   if (btn != null){
     console.log("Click colab-connect-button"); 
     btn.click() 
     }
   
   btn = document.getElementById('ok')
   if (btn != null){
     console.log("Click reconnect"); 
     btn.click() 
     }
  }
  
setInterval(ClickConnect,60000)
'''))

print("Done.")

<IPython.core.display.Javascript object>

Done.


## Check GPU

*   Google Colab can provide you with one of Tesla graphics cards: K80, T4, P4 or P100
*   Here you can check the model of GPU before using some advanced features

In [5]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



# 2.Creating features

In [6]:
file_folder='/content/drive/MyDrive/mole'

In [7]:
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
sub = pd.read_csv('/content/drive/MyDrive/mole/submit/sample_submission.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [8]:
print(f'There are {train.shape[0]} rows in train data.')
print(f'There are {test.shape[0]} rows in test data.')

print(f"There are {train['molecule_name'].nunique()} distinct molecules in train data.")
print(f"There are {test['molecule_name'].nunique()} distinct molecules in test data.")
print(f"There are {train['atom_index_0'].nunique()} unique atoms.")
print(f"There are {train['type'].nunique()} unique types.")

There are 4659076 rows in train data.
There are 2505190 rows in test data.
There are 85012 distinct molecules in train data.
There are 45777 distinct molecules in test data.
There are 28 unique atoms.
There are 8 unique types.


In [9]:
def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

train = map_atom_info(train, 0)
train = map_atom_info(train, 1)

test = map_atom_info(test, 0)
test = map_atom_info(test, 1)

In [10]:
train_p_0 = train[['x_0', 'y_0', 'z_0']].values
train_p_1 = train[['x_1', 'y_1', 'z_1']].values
test_p_0 = test[['x_0', 'y_0', 'z_0']].values
test_p_1 = test[['x_1', 'y_1', 'z_1']].values

train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
train['dist_x'] = (train['x_0'] - train['x_1']) ** 2
test['dist_x'] = (test['x_0'] - test['x_1']) ** 2
train['dist_y'] = (train['y_0'] - train['y_1']) ** 2
test['dist_y'] = (test['y_0'] - test['y_1']) ** 2
train['dist_z'] = (train['z_0'] - train['z_1']) ** 2
test['dist_z'] = (test['z_0'] - test['z_1']) ** 2

In [11]:
train['type_0'] = train['type'].apply(lambda x: x[0])
test['type_0'] = test['type'].apply(lambda x: x[0])

In [12]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage changed to {:5.2f} Mb ({:.1f}% change)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


In [13]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage changed to 337.69 Mb (52.5% change)
Mem. usage changed to 176.80 Mb (51.3% change)


In [14]:
    num_cols = ['x_1', 'y_1', 'z_1', 'dist', 'dist_x', 'dist_y', 'dist_z']
    cat_cols = ['atom_index_0', 'atom_index_1', 'type', 'atom_1', 'type_0']
    aggs = ['max', 'min'] #mozeesci rozbic to na dwie cz
    aggs_1 = ['mean',  'std', ] #mozeesci rozbic to na dwie cz

In [15]:
def create_features_full_a(df):

    
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['molecule_dist_std'] = df.groupby('molecule_name')['dist'].transform('std')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    return df

In [16]:
train = create_features_full_a(train)
test = create_features_full_a(test)
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage changed to 391.01 Mb (22.8% change)
Mem. usage changed to 205.47 Mb (23.2% change)


In [17]:
def create_features_full(df, aggs):

    for col in cat_cols:
        df[f'molecule_{col}_count'] = df.groupby('molecule_name')[col].transform('count')

    for cat_col in tqdm_notebook(cat_cols):
        for num_col in num_cols:
            for agg in aggs:
                df[f'molecule_{cat_col}_{num_col}_{agg}'] = df.groupby(['molecule_name', cat_col])[num_col].transform(agg)
                df[f'molecule_{cat_col}_{num_col}_{agg}_diff'] = df[f'molecule_{cat_col}_{num_col}_{agg}'] - df[num_col]
                df[f'molecule_{cat_col}_{num_col}_{agg}_div'] = df[f'molecule_{cat_col}_{num_col}_{agg}'] / df[num_col]
            df = reduce_mem_usage(df) #te dwie dodalem
        df = reduce_mem_usage(df) #te dwie dodalem



    df = reduce_mem_usage(df)
    return df

In [None]:
%%time
test= create_features_full(test, aggs)
test=create_features_full(test, aggs_1)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Mem. usage changed to 286.70 Mb (13.0% change)
Mem. usage changed to 344.04 Mb (-9.1% change)
Mem. usage changed to 401.37 Mb (-7.7% change)
Mem. usage changed to 430.04 Mb (0.0% change)
Mem. usage changed to 473.05 Mb (-3.1% change)
Mem. usage changed to 516.05 Mb (-2.9% change)
Mem. usage changed to 559.06 Mb (-2.6% change)
Mem. usage changed to 559.06 Mb (0.0% change)
Mem. usage changed to 587.73 Mb (0.0% change)
Mem. usage changed to 616.40 Mb (0.0% change)
Mem. usage changed to 645.07 Mb (0.0% change)
Mem. usage changed to 673.74 Mb (0.0% change)
Mem. usage changed to 716.74 Mb (-2.0% change)
Mem. usage changed to 759.75 Mb (-1.9% change)
Mem. usage changed to 802.75 Mb (-1.8% change)
Mem. usage changed to 802.75 Mb (0.0% change)
Mem. usage changed to 860.09 Mb (-3.4% change)
Mem. usage changed to 917.43 Mb (-3.2% change)
Mem. usage changed to 974.77 Mb (-3.0% change)
Mem. usage changed to 1003.44 Mb (0.0% change)
Mem. usage changed to 1046.44 Mb (-1.4% change)


train should be splitted even more as it is crashes 

In [None]:
%%time
train= create_features_full(train, aggs)
train=create_features_full(train, aggs_1)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Mem. usage changed to 488.76 Mb (14.1% change)
Mem. usage changed to 595.39 Mb (-9.8% change)
Mem. usage changed to 702.03 Mb (-8.2% change)
Mem. usage changed to 755.35 Mb (0.0% change)
Mem. usage changed to 835.33 Mb (-3.3% change)
Mem. usage changed to 915.31 Mb (-3.0% change)
Mem. usage changed to 995.29 Mb (-2.8% change)
Mem. usage changed to 995.29 Mb (0.0% change)
Mem. usage changed to 1048.60 Mb (0.0% change)
Mem. usage changed to 1101.92 Mb (0.0% change)
Mem. usage changed to 1155.24 Mb (0.0% change)
Mem. usage changed to 1208.56 Mb (0.0% change)
Mem. usage changed to 1288.54 Mb (-2.1% change)
Mem. usage changed to 1368.52 Mb (-2.0% change)
Mem. usage changed to 1448.50 Mb (-1.9% change)
Mem. usage changed to 1448.50 Mb (0.0% change)
Mem. usage changed to 1555.13 Mb (-3.6% change)
Mem. usage changed to 1661.77 Mb (-3.3% change)
Mem. usage changed to 1768.41 Mb (-3.1% change)
Mem. usage changed to 1821.73 Mb (0.0% change)
Mem. usage changed to 1901.71 Mb (-1.4% change)
Mem. usa

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Mem. usage changed to 3376.86 Mb (5.0% change)
Mem. usage changed to 3465.73 Mb (1.3% change)
Mem. usage changed to 3554.59 Mb (1.2% change)
Mem. usage changed to 3607.91 Mb (2.2% change)
Mem. usage changed to 3714.55 Mb (0.7% change)
Mem. usage changed to 3821.19 Mb (0.7% change)
Mem. usage changed to 3927.83 Mb (0.7% change)
Mem. usage changed to 3927.83 Mb (0.0% change)
Mem. usage changed to 3981.14 Mb (2.0% change)
Mem. usage changed to 4034.46 Mb (1.9% change)
Mem. usage changed to 4087.78 Mb (1.9% change)
Mem. usage changed to 4141.10 Mb (1.9% change)
Mem. usage changed to 4247.74 Mb (0.6% change)
Mem. usage changed to 4354.38 Mb (0.6% change)
Mem. usage changed to 4461.01 Mb (0.6% change)
Mem. usage changed to 4461.01 Mb (0.0% change)
Mem. usage changed to 4549.88 Mb (1.0% change)
Mem. usage changed to 4638.74 Mb (0.9% change)
Mem. usage changed to 4727.61 Mb (0.9% change)
Mem. usage changed to 4780.93 Mb (1.6% change)
Mem. usage changed to 4887.57 Mb (0.5% change)
Mem. usage ch