I. Estimator for Formation energy

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Structural descriptors used as model inputs; the same list selects the
# feature columns from both train.csv and test.csv.
column_names = [
  'spacegroup',
  'number_of_total_atoms', 'percent_atom_al', 'percent_atom_ga', 'percent_atom_in',
  'lattice_vector_1_ang', 'lattice_vector_2_ang', 'lattice_vector_3_ang',
  'lattice_angle_alpha_degree', 'lattice_angle_beta_degree', 'lattice_angle_gamma_degree'
]

# Parse train.csv once and slice both the features and the target from the
# same frame instead of reading the file twice.
train_raw = pd.read_csv('train.csv')
x, label = train_raw[column_names], train_raw['formation_energy_ev_natom']

# Hold out 25% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable.
nomad_train, nomad_test, label_train, label_test = train_test_split(
    x, label, test_size = 0.25, random_state = 33)

# Rows from test.csv (no target column is read) that the model predicts for.
nomad_predict = pd.read_csv('test.csv')[column_names]

In [23]:
# Sanity check: (rows, columns) of the training split.
nomad_train.shape

(1800, 11)

In [46]:
# Number of labelled rows before the train/test split.
# `nomad_train_label_formation` is only assigned in commented-out code, so
# this cell failed on a fresh kernel; `label` holds the same
# 'formation_energy_ev_natom' column.
label.shape[0]

2400

In [8]:
# Sanity check: (rows, columns) of the frame loaded from test.csv.
nomad_predict.shape

(600, 11)

In [2]:
def create_train_input_fn():
    """Return the input function that feeds the formation-energy training split.

    Rows of `nomad_train` / `label_train` are served shuffled, in batches of
    32, with no epoch limit, so the caller bounds training (e.g. through the
    `steps` argument of `train`).
    """
    train_fn = tf.estimator.inputs.pandas_input_fn(
        x=nomad_train,
        y=label_train,
        shuffle=True,
        batch_size=32,
        num_epochs=None,  # no epoch limit: keep cycling over the data
    )
    return train_fn

def create_test_input_fn():
    """Return the input function over the held-out formation-energy split.

    Makes a single, unshuffled pass over `nomad_test` / `label_test` so the
    outputs stay in the same row order as `label_test`.
    """
    eval_fn = tf.estimator.inputs.pandas_input_fn(
        x=nomad_test,
        y=label_test,
        shuffle=False,  # keep row order aligned with label_test
        num_epochs=1,   # exactly one pass
    )
    return eval_fn

def _bucketized(name, boundaries):
    """Numeric column `name` discretised at the given bucket boundaries."""
    return tf.feature_column.bucketized_column(
        source_column=tf.feature_column.numeric_column(name),
        boundaries=boundaries)


# Model inputs for the estimators: eight bucketized columns followed by the
# three lattice angles as plain numeric features.
feature_columns = [
    # Symmetry group and cell size
    _bucketized('spacegroup', [30, 50, 175, 200, 225]),
    _bucketized('number_of_total_atoms', [15, 25, 35, 45, 75]),

    # Composition fractions
    _bucketized('percent_atom_al', [0.1667, 0.3854, 0.5833]),
    _bucketized('percent_atom_ga', [0.0938, 0.3086, 0.4688]),
    _bucketized('percent_atom_in', [0.0625, 0.3060, 0.4688]),

    # Lattice vector lengths
    _bucketized('lattice_vector_1_ang', [6.141, 9.537, 10.292]),
    _bucketized('lattice_vector_2_ang', [5.834, 6.383, 9.093]),
    _bucketized('lattice_vector_3_ang', [9.298, 10.125, 14.372]),

    # Lattice angles are passed through as raw numeric values
    tf.feature_column.numeric_column('lattice_angle_alpha_degree'),
    tf.feature_column.numeric_column('lattice_angle_beta_degree'),
    tf.feature_column.numeric_column('lattice_angle_gamma_degree'),
]

# Formation-energy regressor: a fully connected network with four hidden
# layers on top of `feature_columns`.
estimator = tf.estimator.DNNRegressor(
    hidden_units=[1024, 512, 256, 512],
    feature_columns=feature_columns,
)

# Train for 2600 steps on the shuffled batches from the training input fn.
train_input_fn = create_train_input_fn()
estimator.train(input_fn=train_input_fn, steps=2600)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f874ae27290>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpr0Mgv7', '_save_summary_steps': 100}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpr0Mgv7/model.ckpt.
INFO:tensorflow:loss = 5.1305, step = 1
INFO:tensorflow:global_step/sec: 27.3025
INFO:tensorflow:loss = 0.403562, step = 101 (3.666 sec)
INFO:tensorflow:global_step/sec: 26.2573
INFO:tensorflow:loss = 0.52776, step = 201 (3.816 sec)
INFO:tensorflow:global_step/sec: 25.2813
INFO:tensorflow:loss = 0.317994,

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x7f874ae271d0>

In [3]:
# Score the trained formation-energy model on the held-out split.
test_input_fn = create_test_input_fn()
estimator.evaluate(input_fn=test_input_fn)

def create_pred_input_fn():
    """Return the label-free input function over `nomad_predict`.

    One unshuffled pass, so the prediction order matches the row order of
    `nomad_predict`.
    """
    predict_fn = tf.estimator.inputs.pandas_input_fn(
        x=nomad_predict,
        shuffle=False,
        num_epochs=1,  # exactly one pass
    )
    return predict_fn

pred_input_fn = create_pred_input_fn()

# `predict` yields one dict per input row; for DNNRegressor the 'predictions'
# entry is a length-1 array. Unwrap it so the frame stores plain floats:
# otherwise each cell holds an array object, which is written to CSV as
# "[0.123]" instead of "0.123". The frame keeps its single 'predictions'
# column, so its shape is unchanged.
predictions = pd.DataFrame({
    'predictions': [float(row['predictions'][0])
                    for row in estimator.predict(pred_input_fn)]
})

INFO:tensorflow:Starting evaluation at 2018-02-15-15:27:53
INFO:tensorflow:Restoring parameters from /tmp/tmpr0Mgv7/model.ckpt-5000
INFO:tensorflow:Finished evaluation at 2018-02-15-15:27:53
INFO:tensorflow:Saving dict for global step 5000: average_loss = 0.00960226, global_step = 5000, loss = 1.15227
INFO:tensorflow:Restoring parameters from /tmp/tmpr0Mgv7/model.ckpt-5000


In [10]:
# Sanity check: one prediction row per row of the prediction input.
predictions.shape

(600, 1)

In [95]:
# Launch TensorBoard (blocks the cell until interrupted).
# NOTE(review): the training logs in this notebook report auto-generated
# model dirs under /tmp, and nothing visible here writes to ./graphs --
# confirm this is the intended logdir.
!tensorboard --logdir=graphs

  return f(*args, **kwds)
TensorBoard 0.4.0rc3 at http://DRESDEN:6006 (Press CTRL+C to quit)
^C


In [37]:
# Interactive lookup of the predict() signature and docstring; exploratory
# only, nothing below depends on this cell.
help(tf.estimator.DNNRegressor.predict)

Help on method predict in module tensorflow.python.estimator.estimator:

predict(self, input_fn, predict_keys=None, hooks=None, checkpoint_path=None) unbound tensorflow.python.estimator.canned.dnn.DNNRegressor method
    Yields predictions for given features.
    
    Args:
      input_fn: Input function returning features which is a dictionary of
        string feature name to `Tensor` or `SparseTensor`. If it returns a
        tuple, first item is extracted as features. Prediction continues until
        `input_fn` raises an end-of-input exception (`OutOfRangeError` or
        `StopIteration`).
      predict_keys: list of `str`, name of the keys to predict. It is used if
        the `EstimatorSpec.predictions` is a `dict`. If `predict_keys` is used
        then rest of the predictions will be filtered from the dictionary. If
        `None`, returns all.
      hooks: List of `SessionRunHook` subclass instances. Used for callbacks
        inside the prediction call.
      checkpoint_pa

In [90]:
# Check which container type train_test_split returned for the labels.
type(label_train)

pandas.core.series.Series

II. Estimator for Bandgap energy

In [12]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Parse train.csv once and take the features and the bandgap target from the
# same frame instead of reading the file twice.
bandgap_raw = pd.read_csv('train.csv')
xb, bandgap = bandgap_raw[column_names], bandgap_raw['bandgap_energy_ev']

# Same 75/25 split parameters (and random_state) as the formation-energy
# section.
xb_train, xb_test, bdp_train, bdp_test = train_test_split(
    xb, bandgap, test_size = 0.25, random_state = 33)

# Rows from test.csv that the bandgap model predicts for.
bandgap_predict = pd.read_csv('test.csv')[column_names]

def create_train_input_fn():
    """Return the input function that feeds the bandgap training split.

    Note: this rebinds the name used by the formation-energy section.
    Rows of `xb_train` / `bdp_train` are served shuffled, in batches of 32,
    with no epoch limit.
    """
    train_fn = tf.estimator.inputs.pandas_input_fn(
        x=xb_train,
        y=bdp_train,
        shuffle=True,
        batch_size=32,
        num_epochs=None,  # no epoch limit: keep cycling over the data
    )
    return train_fn

def create_test_input_fn():
    """Return the input function over the held-out bandgap split.

    Makes a single, unshuffled pass over `xb_test` / `bdp_test`.
    """
    eval_fn = tf.estimator.inputs.pandas_input_fn(
        x=xb_test,
        y=bdp_test,
        shuffle=False,
        num_epochs=1,  # exactly one pass
    )
    return eval_fn

# Bandgap regressor: same input columns as the formation-energy model, with
# a four-layer hidden stack.
bdp_estimator = tf.estimator.DNNRegressor(
    hidden_units=[1024, 512, 256, 128],
    feature_columns=feature_columns,
)

# Train for 2000 steps on the shuffled bandgap batches.
train_input_fn = create_train_input_fn()
bdp_estimator.train(input_fn=train_input_fn, steps=2000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f45023db250>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpfYkQJ1', '_save_summary_steps': 100}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpfYkQJ1/model.ckpt.
INFO:tensorflow:loss = 391.575, step = 1
INFO:tensorflow:global_step/sec: 97.5683
INFO:tensorflow:loss = 29.0157, step = 101 (1.028 sec)
INFO:tensorflow:global_step/sec: 82.9586
INFO:tensorflow:loss = 30.6525, step = 201 (1.215 sec)
INFO:tensorflow:global_step/sec: 104.075
INFO:tensorflow:loss = 31.7412, 

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x7f45023db050>

In [14]:
# Evaluate the bandgap model on the held-out bandgap split. The original
# called `estimator`, which is the formation-energy model, so it scored the
# wrong network against the bandgap labels.
test_input_fn = create_test_input_fn()
bdp_estimator.evaluate(test_input_fn)

def create_pred_input_fn():
    """Return the label-free input function over `bandgap_predict`.

    One unshuffled pass, so the prediction order matches the row order of
    `bandgap_predict`.
    """
    predict_fn = tf.estimator.inputs.pandas_input_fn(
        x=bandgap_predict,
        shuffle=False,
        num_epochs=1,  # exactly one pass
    )
    return predict_fn

pred_input_fn = create_pred_input_fn()

# Predict with the bandgap model. The original used `estimator` (the
# formation-energy model), so the bandgap column of the submission was
# filled with formation-energy predictions.
# `predict` yields one dict per row; for DNNRegressor the 'predictions' entry
# is a length-1 array, so unwrap it to a plain float (an array cell would be
# written to CSV as "[0.123]"). The frame keeps its single column.
predict_bdg = pd.DataFrame({
    'predictions': [float(row['predictions'][0])
                    for row in bdp_estimator.predict(pred_input_fn)]
})

INFO:tensorflow:Starting evaluation at 2018-02-14-15:23:07
INFO:tensorflow:Restoring parameters from /tmp/tmpL3cLOy/model.ckpt-4000
INFO:tensorflow:Finished evaluation at 2018-02-14-15:23:07
INFO:tensorflow:Saving dict for global step 4000: average_loss = 0.725317, global_step = 4000, loss = 87.038
INFO:tensorflow:Restoring parameters from /tmp/tmpL3cLOy/model.ckpt-4000


In [94]:
# Sanity check: one bandgap prediction row per row of the prediction input.
predict_bdg.shape

(600, 1)

In [20]:
# Build the submission from the sample file and overwrite both target
# columns with model output.
sample = pd.read_csv('sample_submission.csv')

# `predictions` comes from the formation-energy section and `predict_bdg`
# from the bandgap section; both are single-column DataFrames.
# NOTE(review): assignment is presumably aligned on the default 0..n-1 index
# of both frames -- confirm the row order matches sample_submission.csv.
sample['formation_energy_ev_natom'] = predictions
sample['bandgap_energy_ev'] = predict_bdg
sample.to_csv("submission.csv", index = False)

III. Estimator for Bandgap energy with atomic density

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Short names for the three lattice-angle columns, applied identically to
# the training and prediction frames.
angle_aliases = {
    'lattice_angle_alpha_degree': 'alpha',
    'lattice_angle_beta_degree': 'beta',
    'lattice_angle_gamma_degree': 'gamma',
}

train = pd.read_csv('train.csv')[column_names].rename(columns=angle_aliases)

# Note: this rebinds `nomad_predict` from section I with renamed angle columns.
nomad_predict = pd.read_csv('test.csv')[column_names].rename(columns=angle_aliases)

def add_atomic_density(frame, angles=('alpha', 'beta', 'gamma')):
    """Return a copy of `frame` with cell volume and atomic density added.

    New columns, in order: '<angle>_r' for each angle (the angle in radians),
    'vol' (unit-cell volume) and 'atomic_density' (atoms per unit volume).

    Args:
      frame: DataFrame holding the three lattice vector lengths
        ('lattice_vector_{1,2,3}_ang'), 'number_of_total_atoms' and one
        column per entry of `angles`, given in degrees.
      angles: names of the alpha, beta and gamma angle columns, in that order.

    Returns:
      A new DataFrame; `frame` itself is not modified.
    """
    out = frame.copy()

    # Convert lattice angles from degrees to radians for the volume formula.
    for angle in angles:
        out['_'.join([angle, 'r'])] = np.pi * out[angle] / 180
    cos_a, cos_b, cos_g = [np.cos(out['_'.join([angle, 'r'])]) for angle in angles]

    # General (triclinic) cell volume:
    # V = a*b*c*sqrt(1 + 2cos(a)cos(b)cos(g) - cos(a)^2 - cos(b)^2 - cos(g)^2)
    out['vol'] = (
        out['lattice_vector_1_ang'] * out['lattice_vector_2_ang'] * out['lattice_vector_3_ang']
        * np.sqrt(1 + 2*cos_a*cos_b*cos_g - cos_a**2 - cos_b**2 - cos_g**2))

    # Atomic density; per the original author's note this is known to
    # correlate with stability or bonding strength.
    out['atomic_density'] = out['number_of_total_atoms'] / out['vol']
    return out


# Apply the same feature engineering to the training and prediction frames
# (previously two copy-pasted blocks).
lattice_angles = ['alpha', 'beta', 'gamma']
train = add_atomic_density(train, lattice_angles)
nomad_predict = add_atomic_density(nomad_predict, lattice_angles)

In [71]:
# Inspect the engineered columns (radian angles, vol, atomic_density).
train.head(5)

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,alpha,beta,gamma,alpha_r,beta_r,gamma_r,vol,atomic_density
0,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,1.570842,1.570836,1.570826,781.052081,0.102426
1,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,1.571121,1.570761,2.094439,782.50011,0.102236
2,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,1.587705,1.590393,0.532648,391.227531,0.102242
3,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,1.570601,1.571004,2.094425,293.377334,0.102257
4,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,1.570727,1.570807,2.094208,944.713843,0.084682


In [29]:
# Same inspection for the prediction frame; columns should mirror `train`.
nomad_predict.head(5)

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,alpha,beta,gamma,alpha_r,beta_r,gamma_r,vol,atomic_density
0,33,80.0,0.1875,0.4688,0.3438,10.5381,9.0141,9.6361,89.9997,90.0003,90.0006,1.570791,1.570802,1.570807,915.34747,0.087399
1,33,80.0,0.75,0.25,0.0,9.8938,8.5014,9.1298,90.0038,90.0023,90.0015,1.570863,1.570836,1.570823,767.917987,0.104178
2,167,30.0,0.6667,0.1667,0.1667,4.9811,4.9808,13.4799,89.99,90.0109,120.0014,1.570622,1.570987,2.09442,289.624655,0.103582
3,12,80.0,0.5625,0.4375,0.0,24.337,6.0091,5.762,89.9995,103.8581,90.0002,1.570788,1.812666,1.5708,818.126773,0.097784
4,12,80.0,0.1875,0.5,0.3125,24.6443,6.2906,6.1589,90.0,104.5929,90.0001,1.570796,1.82549,1.570798,923.997043,0.08658


In [5]:
# (column name, bucket boundaries) for every feature that is discretised,
# in the order the columns are emitted.
density_bucket_boundaries = [
    ('spacegroup', [30, 50, 175, 200, 225]),
    ('number_of_total_atoms', [15, 25, 35, 45, 75]),
    ('percent_atom_al', [0.1667, 0.3854, 0.5833]),
    ('percent_atom_ga', [0.0938, 0.3086, 0.4688]),
    ('percent_atom_in', [0.0625, 0.3060, 0.4688]),
    ('lattice_vector_1_ang', [6.141, 9.537, 10.292]),
    ('lattice_vector_2_ang', [5.834, 6.383, 9.093]),
    ('lattice_vector_3_ang', [9.298, 10.125, 14.372]),
]

# Bucketized columns first, then the raw numeric features: the three renamed
# lattice angles and the engineered atomic density.
density_feature_columns = []
for column_name, bounds in density_bucket_boundaries:
    density_feature_columns.append(
        tf.feature_column.bucketized_column(
            source_column=tf.feature_column.numeric_column(column_name),
            boundaries=bounds))
for column_name in ('alpha', 'beta', 'gamma', 'atomic_density'):
    density_feature_columns.append(
        tf.feature_column.numeric_column(column_name))

# Columns fed to the density-aware model: the original descriptors (with the
# short angle names) plus the engineered atomic density.
density_column_names = [
    'spacegroup',
    'number_of_total_atoms',
    'percent_atom_al', 'percent_atom_ga', 'percent_atom_in',
    'lattice_vector_1_ang', 'lattice_vector_2_ang', 'lattice_vector_3_ang',
    'alpha', 'beta', 'gamma',
    'atomic_density',
]

train_density = train[density_column_names]
bandgap = pd.read_csv('train.csv')['bandgap_energy_ev']

# 75/25 split with the same random_state as the earlier sections.
bg_train, bg_test, bandgap_train, bandgap_test = train_test_split(
    train_density, bandgap, random_state=33, test_size=0.25)

def create_train_input_fn():
    """Return the input function that feeds the density-aware training split.

    Note: this rebinds the name used by the earlier sections.
    Rows of `bg_train` / `bandgap_train` are served shuffled, in batches of
    32, with no epoch limit.
    """
    train_fn = tf.estimator.inputs.pandas_input_fn(
        x=bg_train,
        y=bandgap_train,
        shuffle=True,
        batch_size=32,
        num_epochs=None,  # no epoch limit: keep cycling over the data
    )
    return train_fn

def create_test_input_fn():
    """Return the input function over the held-out density-aware split.

    Makes a single, unshuffled pass over `bg_test` / `bandgap_test`.
    """
    eval_fn = tf.estimator.inputs.pandas_input_fn(
        x=bg_test,
        y=bandgap_test,
        shuffle=False,
        num_epochs=1,  # exactly one pass
    )
    return eval_fn

# Bandgap regressor that also sees atomic density: five hidden layers on top
# of `density_feature_columns`.
density_estimator = tf.estimator.DNNRegressor(
    hidden_units=[1024, 512, 256, 128, 256],
    feature_columns=density_feature_columns,
)

# Train for 2600 steps on the shuffled density-aware batches.
train_input_fn = create_train_input_fn()
density_estimator.train(input_fn=train_input_fn, steps=2600)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8748362350>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpcLHtbA', '_save_summary_steps': 100}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpcLHtbA/model.ckpt.
INFO:tensorflow:loss = 287.761, step = 1
INFO:tensorflow:global_step/sec: 40.337
INFO:tensorflow:loss = 25.5019, step = 101 (2.489 sec)
INFO:tensorflow:global_step/sec: 41.2433
INFO:tensorflow:loss = 38.43, step = 201 (2.417 sec)
INFO:tensorflow:global_step/sec: 41.2764
INFO:tensorflow:loss = 36.6823, ste

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x7f874835fd10>

In [6]:
# Score the density-aware bandgap model on its held-out split.
test_input_fn = create_test_input_fn()
density_estimator.evaluate(input_fn=test_input_fn)

def create_pred_input_fn():
    """Return the label-free input function over `nomad_predict`.

    One unshuffled pass over the whole frame, which is passed as-is with all
    of its columns.
    """
    predict_fn = tf.estimator.inputs.pandas_input_fn(
        x=nomad_predict,
        shuffle=False,
        num_epochs=1,  # exactly one pass
    )
    return predict_fn

pred_input_fn = create_pred_input_fn()

# `predict` yields one dict per input row; for DNNRegressor the 'predictions'
# entry is a length-1 array. Unwrap it so the frame stores plain floats:
# otherwise each cell holds an array object, which is written to CSV as
# "[0.123]" instead of "0.123". The frame keeps its single 'predictions'
# column.
predict_density = pd.DataFrame({
    'predictions': [float(row['predictions'][0])
                    for row in density_estimator.predict(pred_input_fn)]
})

INFO:tensorflow:Starting evaluation at 2018-02-15-15:29:54
INFO:tensorflow:Restoring parameters from /tmp/tmpcLHtbA/model.ckpt-3000
INFO:tensorflow:Finished evaluation at 2018-02-15-15:29:55
INFO:tensorflow:Saving dict for global step 3000: average_loss = 0.149597, global_step = 3000, loss = 17.9517
INFO:tensorflow:Restoring parameters from /tmp/tmpcLHtbA/model.ckpt-3000


In [7]:
# Rebuild the submission, this time taking the bandgap column from the
# density-aware model. This overwrites the submission.csv written in
# section II.
sample = pd.read_csv('sample_submission.csv')

# `predictions` is the formation-energy output from section I, so that cell
# must have been run in this kernel session.
sample['formation_energy_ev_natom'] = predictions
sample['bandgap_energy_ev'] = predict_density
sample.to_csv("submission.csv", index = False)

# Second copy of the same frame under a different file name.
sample.to_csv("subm1024.csv", index = False)

Ionization potentials (eV) for free atoms calculated using the local density approximation (LDA)