# Libraries

In [1]:
import joblib
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [13]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import InputLayer, Dense, Dropout, Conv1D, Conv2D, AveragePooling1D, AveragePooling2D, LeakyReLU
from tensorflow.keras.layers import Input, Flatten, Reshape, Lambda, BatchNormalization, Conv2D
from tensorflow.keras.layers import MaxPooling2D, GlobalAveragePooling2D, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.applications.efficientnet import EfficientNetB1
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K

In [5]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# Iterating over the device list (instead of indexing [0]) keeps this cell from
# raising IndexError on a CPU-only machine, where the list is empty, and applies
# the same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [6]:
def r2_metric(y_true, y_pred):
    """Coefficient of determination (R^2), computed per output column.

    R^2 = 1 - SS_res / SS_tot, where SS_res is the sum of squared residuals
    and SS_tot is the sum of squared deviations of the *true* values from
    their own mean. Both sums are reduced over axis 0.

    Args:
        y_true: ground-truth values.
        y_pred: model predictions, same shape as ``y_true``.

    Returns:
        1 - SS_res / SS_tot, with axis 0 reduced away.
    """
    ss_res = K.sum((y_true - y_pred) ** 2, axis=0)
    # The total sum of squares is centred on the mean of the targets,
    # not on the mean of the predictions.
    ss_tot = K.sum((y_true - K.mean(y_true, axis=0)) ** 2, axis=0)
    # K.epsilon() avoids a division by zero when all targets are identical.
    return 1 - (ss_res / (ss_tot + K.epsilon()))

In [7]:
def LMAE(y_actual, y_pred):
    """Log10 of the mean absolute error, with the mean taken over axis 0.

    Args:
        y_actual: ground-truth values.
        y_pred: model predictions, same shape as ``y_actual``.

    Returns:
        log10(mean(|y_actual - y_pred|, axis=0)).
    """
    abs_err = K.abs(y_actual - y_pred)
    mean_abs_err = K.mean(abs_err, axis=0)
    return tf.experimental.numpy.log10(mean_abs_err)

# Tabular Input head

In [10]:
# Tabular head: a stack of ReLU Dense layers over the 144 tabular features,
# narrowing to 4 units. Each tuple is (units, use_l2); layers flagged True get
# an L2(0.01) kernel regularizer.
tab_dense_specs = [
    (128, False),
    (256, False),
    (256, True),
    (128, False),
    (64, True),
    (16, False),
    (4, True),
]
tab_layers = [InputLayer(input_shape=(144,), dtype='float32')]
for units, use_l2 in tab_dense_specs:
    tab_layers.append(
        Dense(units, activation='relu', kernel_regularizer=L2(0.01) if use_l2 else None)
    )
tab_model = Sequential(tab_layers)

# DeepInsight Image Input head

In [8]:
# 1st architecture (used with a batch size of 256).
# Image head: one 32-filter conv, then three blocks of
# (2 x Conv2D -> BatchNorm -> 2x2 MaxPool -> Dropout 0.25) with 64 / 128 / 256
# filters, followed by a Dense stack narrowing to 4 units.
img_layers = [
    InputLayer(input_shape=(50,50,3), dtype='float32'),
    Conv2D(32, 3, activation='relu', padding='same'),
]
for n_filters in (64, 128, 256):
    img_layers += [
        Conv2D(n_filters, 3, activation='relu', padding='same'),
        Conv2D(n_filters, 3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2,2)),
        Dropout(0.25),
    ]
img_layers += [
    Flatten(),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dense(16, activation='relu'),
    Dense(4, activation='relu'),
]
img_model = Sequential(img_layers)

# Combine model

In [15]:
# Fuse the two 4-unit heads, pass them through two small ReLU layers and
# finish with a single linear unit for regression.
merged_heads = concatenate([tab_model.output, img_model.output])
hidden = Dense(4, activation='relu')(merged_heads)
hidden = Dense(4, activation='relu')(hidden)
x = Dense(1, activation='linear')(hidden)

In [16]:
# Two-input model: [tabular features, image] -> the fused output tensor `x`.
combined_model = Model(inputs=[tab_model.input, img_model.input], outputs=x)

In [17]:
# Print the layer-by-layer summary of the combined model.
combined_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50, 50, 3)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 50, 50, 32)   896         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 50, 50, 64)   18496       conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 50, 50, 64)   36928       conv2d_1[0][0]                   
____________________________________________________________________________________________

# Load and prepare data

In [19]:
# Read the tabular training features from CSV and display the frame.
tab_train = pd.read_csv('train_extra_for_gan.csv')
tab_train

Unnamed: 0,volume,volume/g,density,weight,surface_area,void_fraction,void_volume,CO2/N2_selectivity,heat_adsorption,n_atoms,...,topology_1,topology_2,topology_3,topology_4,topology_5,topology_6,topology_7,topology_8,topology_9,topology_10
0,-0.524386,-0.584699,0.798882,-0.383085,-0.257582,-0.872930,-0.642857,0.157631,0.609280,-0.406977,...,0,0,0,0,1,0,0,0,0,0
1,0.282926,-0.615547,0.856904,0.949117,-0.604123,-0.588382,-0.513449,0.691708,0.845745,0.976744,...,0,1,0,0,0,0,0,0,0,0
2,-0.537500,-0.411402,0.509126,-0.484315,-0.491038,-0.536250,-0.447101,-0.021201,0.322523,-0.325581,...,0,0,0,0,1,0,0,0,0,0
3,0.007296,-0.043730,0.045099,0.044946,-0.091623,-0.201260,-0.160789,0.298554,0.219176,0.023256,...,0,0,0,0,0,0,0,1,0,0
4,-0.514063,-0.603040,0.833117,-0.356674,-0.401136,-0.878771,-0.647639,0.512156,0.407682,-0.186047,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68597,-0.489396,-0.701596,1.030967,-0.257028,-1.162504,1.944400,0.590556,0.220592,-2.777374,0.104651,...,0,0,0,0,1,0,0,0,0,0
68598,-0.333898,-0.933076,1.612110,0.233003,-0.411015,-1.200729,-0.804148,-0.676437,-2.777374,0.186047,...,0,1,0,0,0,0,0,0,0,0
68599,-0.075575,-0.894122,1.500426,0.697741,-1.260756,2.398555,0.590556,-0.487479,-2.777374,1.093023,...,0,0,0,0,0,0,0,0,0,0
68600,0.876831,-0.820367,1.305642,2.371556,-1.199915,2.210121,0.590556,-0.715515,-2.777374,2.953488,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Load the training image matrices from the .npy file and show the array shape.
img_train = np.load('np_files/train_image_matrix_50_50.npy')
img_train.shape

(68602, 50, 50, 3)

In [22]:
# Read the regression target from CSV and display the frame.
target = pd.read_csv('target_extra_for_gan.csv')
target

Unnamed: 0,CO2_working_capacity [mL/g]
0,105.284502
1,101.224774
2,118.987011
3,187.626004
4,79.210001
...,...
68597,-12.943652
68598,-12.985582
68599,-13.187635
68600,15.672698


In [23]:
# Binary helper flag: 1 where the working capacity is above 200, else 0.
capacity = target['CO2_working_capacity [mL/g]']
target['is_good'] = 0
target.loc[capacity > 200, 'is_good'] = 1

In [24]:
# Count how many rows fall into each is_good class.
target['is_good'].value_counts()

0    57143
1    11459
Name: is_good, dtype: int64

In [25]:
# Pull the flag out as an array, then rebind `target` to the remaining
# column(s) as a NumPy array. After this cell `target` is no longer a DataFrame.
is_good = target['is_good'].values
target = target.drop(columns=['is_good']).values

In [27]:
## train/val split
# Hold out 20% of the rows for validation, stratified on the is_good flag.
# The tabular, image and target arrays are split together in one call so their
# rows stay aligned.
split_parts = train_test_split(
    tab_train, img_train, target,
    test_size=0.2,
    random_state=42,
    stratify=is_good,
)
(tab_X_train, tab_X_val,
 img_X_train, img_X_val,
 y_train, y_val) = split_parts

In [28]:
# Sanity check: shapes of the tabular train / validation splits.
(tab_X_train.shape, tab_X_val.shape)

((54881, 144), (13721, 144))

In [29]:
# Sanity check: shapes of the image train / validation splits.
(img_X_train.shape, img_X_val.shape)

((54881, 50, 50, 3), (13721, 50, 50, 3))

In [30]:
# Sanity check: shapes of the target train / validation splits.
(y_train.shape, y_val.shape)

((54881, 1), (13721, 1))

# Train

In [35]:
# Training hyper-parameters.
BATCH_SIZE = 256   # samples per gradient step (passed to fit)
EPOCHS = 100       # maximum number of epochs (early stopping may end sooner)
INIT_LR = 0.001    # initial Adam learning rate

In [36]:
# Reset Keras' global state before compiling and training.
# NOTE(review): this runs after tab_model / img_model / combined_model were
# already built — confirm that clearing the session at this point is intended.
tf.keras.backend.clear_session()

In [37]:
# Adam starting at INIT_LR, with `decay` set to INIT_LR / EPOCHS; betas and
# epsilon are passed explicitly.
optimizer = Adam(learning_rate=INIT_LR, decay=INIT_LR/EPOCHS, beta_1=0.9, beta_2=0.999, epsilon=1e-8)

In [38]:
# callbacks
# Stop once val_loss has not improved for 10 epochs and restore the best weights.
early_stopping = EarlyStopping(patience=10, restore_best_weights=True, monitor='val_loss', mode='min')

In [40]:
# Compile with the custom LMAE loss; no additional metrics are tracked.
combined_model.compile(optimizer=optimizer, loss=LMAE, metrics=[])

In [41]:
# Train on the paired (tabular, image) inputs; the validation split is
# evaluated every epoch and feeds the early-stopping callback.
fit_options = dict(
    validation_data=([tab_X_val, img_X_val], y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    callbacks=[early_stopping],
)
combined_model.fit([tab_X_train, img_X_train], y_train, **fit_options)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100


<tensorflow.python.keras.callbacks.History at 0x28a3066ab48>

In [42]:
# Persist the trained combined model in HDF5 format.
combined_model.save('models/combined_deepinsight_cnn_50_50.h5', save_format='h5')

# Test submission

In [46]:
# Read the tabular test features from CSV and display the frame.
tab_test = pd.read_csv('test_extra_2_cleaned.csv')
tab_test

Unnamed: 0,volume,volume/g,density,weight,surface_area,void_fraction,void_volume,CO2/N2_selectivity,heat_adsorption,n_atoms,...,topology_1,topology_2,topology_3,topology_4,topology_5,topology_6,topology_7,topology_8,topology_9,topology_10
0,-0.479628,-0.272893,0.314058,-0.460370,-0.614563,-0.704325,-0.513748,0.841859,0.753026,-0.302326,...,0,0,0,0,1,0,0,0,0,0
1,0.945696,-0.807447,1.273569,2.466234,-0.451899,-0.222836,-0.399283,-0.064565,-0.481679,0.558140,...,0,0,0,0,0,0,0,0,0,0
2,-0.287257,0.548750,-0.446130,-0.510708,0.956839,0.381305,0.483264,-0.329196,-0.530118,-0.581395,...,0,0,0,0,1,0,0,0,0,0
3,0.120337,0.943906,-0.672449,-0.262907,1.180306,0.698967,0.957860,-0.501135,-0.490294,-0.186047,...,0,0,0,0,1,0,0,0,0,0
4,0.455792,0.798101,-0.595773,0.077741,0.664925,0.626321,0.813210,-0.333562,-0.371552,0.093023,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,14.883110,6.775274,-1.708149,3.453108,2.525318,2.492180,8.834429,-0.752689,-1.772980,3.558140,...,0,0,0,1,0,0,0,0,0,0
16996,1.407069,2.165319,-1.115874,0.238821,1.483792,1.602959,2.779438,-0.642975,-1.437719,0.162791,...,0,0,0,1,0,0,0,0,0,0
16997,1.211109,2.360628,-1.164993,0.062301,1.649838,1.398200,2.669157,-0.746253,-1.480803,0.023256,...,0,0,0,0,1,0,0,0,0,0
16998,1.217125,2.786262,-1.258839,-0.045891,1.676200,1.672998,3.387029,-0.738450,-1.571499,-0.069767,...,0,0,0,0,1,0,0,0,0,0


In [45]:
# Load the test image matrices from the .npy file and show the array shape.
mat_test = np.load('np_files/test_image_matrix_50_50.npy')
mat_test.shape

(17000, 50, 50, 3)

In [48]:
# Predict on the test set; the model ends in a single linear unit, so the
# output is flattened to a 1-D vector.
predict = combined_model.predict([tab_test, mat_test]).flatten()

# Submission ids start at 68614. The id range is sized from the predictions
# themselves so this cell depends only on names defined in this notebook and
# survives Restart & Run All.
first_test_id = 68614
predicted_df = pd.DataFrame({
    "id": range(first_test_id, first_test_id + len(predict)),
    "CO2_working_capacity [mL/g]": predict,
})
predicted_df

Unnamed: 0,id,CO2_working_capacity [mL/g]
0,68614,166.052383
1,68615,54.064545
2,68616,66.692818
3,68617,67.646645
4,68618,62.072334
...,...,...
16995,85609,-7.133041
16996,85610,-0.405801
16997,85611,-0.310039
16998,85612,-0.344075


In [49]:
# Write the submission file without the DataFrame index.
predicted_df.to_csv("output/phase2/submission.csv",index=False)

# Turn the model into a feature extractor

In [50]:
from tensorflow.keras.models import load_model

In [51]:
# Reload the saved combined model; the custom LMAE loss is supplied through
# custom_objects so Keras can resolve it by name.
loaded_model = load_model('models/combined_deepinsight_cnn_50_50.h5', custom_objects={'LMAE':LMAE})

In [52]:
# Print the layer-by-layer summary of the reloaded model.
loaded_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50, 50, 3)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 50, 50, 32)   896         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 50, 50, 64)   18496       conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 50, 50, 64)   36928       conv2d_1[0][0]                   
____________________________________________________________________________________________

In [53]:
# Feature extractor: same inputs as the loaded model, but the output is taken
# from its second-to-last layer (layers[-2]) instead of the final layer.
feats_ext = tf.keras.Model(inputs = loaded_model.input, outputs = loaded_model.layers[-2].output)

In [54]:
# Print the layer-by-layer summary of the feature extractor.
feats_ext.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50, 50, 3)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 50, 50, 32)   896         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 50, 50, 64)   18496       conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 50, 50, 64)   36928       conv2d_1[0][0]                   
______________________________________________________________________________________________

In [55]:
# Persist the feature extractor in HDF5 format.
feats_ext.save('models/combined_di_cnn_feats_ext.h5', save_format='h5')



# Extract new features for the training set

In [56]:
# Reload the saved feature extractor from disk (LMAE is passed as a custom object).
feats_ext = load_model('models/combined_di_cnn_feats_ext.h5', custom_objects={'LMAE':LMAE})



In [58]:
# Run the feature extractor over the full training set (tabular + image inputs)
# and show the shape of the extracted feature matrix.
new_feats = feats_ext.predict([tab_train, img_train])
new_feats.shape

(68602, 4)

In [59]:
# One column name per extracted feature: comb_cnn_0, comb_cnn_1, ...
n_new_feats = new_feats.shape[1]
new_feats_cols = [f'comb_cnn_{idx}' for idx in range(n_new_feats)]
# Wrap the extracted features in a labelled DataFrame and display it.
new_feats_df = pd.DataFrame(data=new_feats, columns=new_feats_cols)
new_feats_df

Unnamed: 0,comb_cnn_0,comb_cnn_1,comb_cnn_2,comb_cnn_3
0,0.000000,0.000000,0.000000,58.904251
1,0.000000,0.000000,0.000000,69.381874
2,0.000000,0.000000,7.915159,82.243233
3,0.000000,0.000000,0.000000,117.579170
4,0.000000,0.000000,0.000000,61.207539
...,...,...,...,...
68597,0.716291,7.768239,5.032241,8.482945
68598,0.974198,8.142911,5.304706,7.986442
68599,0.778554,7.931119,5.137025,8.463214
68600,0.799744,7.906666,5.131299,8.344973


In [60]:
# Summary statistics of the extracted training features.
new_feats_df.describe()

Unnamed: 0,comb_cnn_0,comb_cnn_1,comb_cnn_2,comb_cnn_3
count,68602.0,68602.0,68602.0,68602.0
mean,0.135411,1.436154,2.168628,88.425812
std,0.439961,3.2326,4.774739,61.025421
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,50.224761
50%,0.0,0.0,0.0,72.503105
75%,0.0,0.0,2.874144,119.262621
max,4.83485,26.855505,56.48967,432.389038


In [61]:
# Write the extracted training features to CSV without the DataFrame index.
new_feats_df.to_csv('train_feats_from_comb_cnn.csv', index=False)

# Extract new features for the test set

In [63]:
# Run the feature extractor over the test set (tabular + image inputs)
# and show the shape of the extracted feature matrix.
test_new_feats = feats_ext.predict([tab_test, mat_test])
test_new_feats.shape

(17000, 4)

In [64]:
# Wrap the test features in a DataFrame, reusing the column names built for
# the training features (new_feats_cols), and display it.
test_new_feats_df = pd.DataFrame(test_new_feats, columns=new_feats_cols)
test_new_feats_df

Unnamed: 0,comb_cnn_0,comb_cnn_1,comb_cnn_2,comb_cnn_3
0,0.000000,0.000000,0.000000,121.871979
1,0.000000,0.000000,3.312899,40.470531
2,0.000000,0.000000,0.000000,48.788647
3,0.000000,0.000000,0.000000,49.490231
4,0.000000,0.000000,0.000000,45.390083
...,...,...,...,...
16995,2.446862,9.734859,6.094297,4.706102
16996,1.053595,8.092432,5.274447,7.620755
16997,1.062499,8.433994,5.508307,8.026564
16998,1.148709,9.191586,5.929290,8.741869


In [65]:
# Summary statistics of the extracted test features.
test_new_feats_df.describe()

Unnamed: 0,comb_cnn_0,comb_cnn_1,comb_cnn_2,comb_cnn_3
count,17000.0,17000.0,17000.0,17000.0
mean,0.151956,1.199031,2.374463,79.928833
std,0.479456,2.945057,4.912333,57.008751
min,0.0,0.0,0.0,1.379038
25%,0.0,0.0,0.0,47.57119
50%,0.0,0.0,0.0,64.896301
75%,0.0,0.0,4.388823,103.052229
max,4.924204,16.828388,48.217567,421.367798


In [66]:
# Write the extracted test features to CSV without the DataFrame index.
test_new_feats_df.to_csv('test_feats_from_comb_cnn.csv', index=False)