In [1]:
%matplotlib inline

In [2]:
import os
import sys

In [3]:
print sys.version

2.7.14 (default, Feb 15 2018, 20:22:28) 
[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.39.2)]


In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [5]:
import xgboost as xgb
import pymatgen as mg
import datetime as dt

Pymatgen will drop Py2k support from v2019.1.1. Pls consult the documentation
at https://www.pymatgen.org for more details.
  at https://www.pymatgen.org for more details.""")


In [6]:
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [7]:
from utils import rmsle

In [8]:
from dataload import load_features

### Load data

In [9]:
DATA_DIR = './data'

In [10]:
train, test = load_features(DATA_DIR, with_ext=True, with_geo=True)

### Build train and test data sets

In [11]:
X_train = train.drop(['id', 'natoms', 'spacegroup',
                      'alpha', 'beta', 'gamma',
                      'ga', 'o_cnt', 'cellvol', 'o_fraction', 'avg_mass',
                      'bandgap', 'E0'], axis=1)
X_test = test.drop(['id', 'natoms', 'spacegroup',
                    'alpha', 'beta', 'gamma',
                    'ga', 'o_cnt', 'cellvol', 'o_fraction', 'avg_mass'], axis=1)

In [12]:
# Use log1p of energies to correct for skew
y_bg_train = train['bandgap']
y_e0_train = np.log1p(train['E0'])

In [13]:
# One-hot encode spacegroup_natoms
X_train = pd.concat([X_train.drop('spacegroup_natoms', axis=1),
                    pd.get_dummies(X_train['spacegroup_natoms'])], axis=1)
X_test = pd.concat([X_test.drop('spacegroup_natoms', axis=1),
                    pd.get_dummies(X_test['spacegroup_natoms'])], axis=1)

### Build Sklearn Model with XGBRegressor

In [14]:
param = {'learning_rate': 0.05,      # Step size shrinkage used in update (Learning rate)
         'reg_alpha': 0.01,          # L1 regularization term on weights
         'n_estimators': 300,
         'max_depth': 5,
         'subsample': 1,
         'colsample_bytree': 0.90,
         'colsample_bylevel': 0.90,
         'silent': True,
         'random_state': 42,
         'objective': 'reg:linear'}

In [15]:
# Estimator pipeline
est = Pipeline([
    ('scaler', StandardScaler()),
    ('xgbreg', XGBRegressor(**param)),
])

In [16]:
est

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgbreg', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
       colsample_bytree=0.9, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
       reg_alpha=0.01, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [36]:
# Perform grid search for bandgap model
t_md = [3, 4, 5, 6, 7]
t_lr = [0.01, 0.02, 0.05, 0.10, 0.20, 0.3]
t_ne = range(50, 500, 50)
gridsearch_bg_0 = GridSearchCV(est,
                              {'xgbreg__max_depth': t_md,
                               'xgbreg__learning_rate': t_lr,
                               'xgbreg__n_estimators': t_ne,
                              },
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              return_train_score=False,
                              verbose=True)

In [37]:
gridsearch_bg_0.fit(X_train, y_bg_train)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   20.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  6.5min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 11.2min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 17.4min
[Parallel(n_jobs=4)]: Done 1350 out of 1350 | elapsed: 19.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgbreg', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
       colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_...    reg_alpha=0.01, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'xgbreg__n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450], 'xgbreg__max_depth': [3, 4, 5, 6, 7], 'xgbreg__learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='neg_mean_squared_error', verbose=True)

In [38]:
gridsearch_bg_0.best_params_

{'xgbreg__learning_rate': 0.02,
 'xgbreg__max_depth': 4,
 'xgbreg__n_estimators': 450}

In [39]:
-gridsearch_bg_0.best_score_

0.046612698099393265

In [43]:
# Check best regularization
t_ra = [0.1, 0.05, 0.01, 0.005, 0.001]
gridsearch_bg_1 = GridSearchCV(est,
                              {'xgbreg__reg_alpha': t_ra,
                              },
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              return_train_score=False,
                              verbose=True)

In [44]:
gridsearch_bg_1.fit(X_train, y_bg_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:   37.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgbreg', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
       colsample_bytree=0.9, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=4, min_child...    reg_alpha=0.01, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'xgbreg__reg_alpha': [0.1, 0.05, 0.01, 0.005, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='neg_mean_squared_error', verbose=True)

In [45]:
gridsearch_bg_1.best_params_

{'xgbreg__reg_alpha': 0.05}

In [46]:
-gridsearch_bg_1.best_score_

0.045999862165971676

In [47]:
# Perform grid search for formation energy model
t_md = [3, 4, 5, 6, 7]
t_lr = [0.01, 0.02, 0.05, 0.10, 0.20, 0.3]
t_ne = range(50, 500, 50)
gridsearch_e0_0 = GridSearchCV(est,
                              {'xgbreg__max_depth': t_md,
                               'xgbreg__learning_rate': t_lr,
                               'xgbreg__n_estimators': t_ne,
                              },
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              return_train_score=False,
                              verbose=True)

In [48]:
gridsearch_e0_0.fit(X_train, y_e0_train)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   25.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  6.7min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 12.3min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 18.8min
[Parallel(n_jobs=4)]: Done 1350 out of 1350 | elapsed: 20.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgbreg', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
       colsample_bytree=0.9, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=4, min_child...    reg_alpha=0.01, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'xgbreg__n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450], 'xgbreg__max_depth': [3, 4, 5, 6, 7], 'xgbreg__learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='neg_mean_squared_error', verbose=True)

In [49]:
gridsearch_e0_0.best_params_

{'xgbreg__learning_rate': 0.05,
 'xgbreg__max_depth': 5,
 'xgbreg__n_estimators': 300}

In [50]:
-gridsearch_e0_0.best_score_

0.0008566715622741059

In [52]:
# Check best regularization
t_ra = [0.2, 0.1, 0.05, 0.01, 0.005, 0.001]
gridsearch_e0_1 = GridSearchCV(est,
                              {'xgbreg__reg_alpha': t_ra,
                              },
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              return_train_score=False,
                              verbose=True)

In [53]:
gridsearch_e0_1.fit(X_train, y_e0_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   37.8s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgbreg', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
       colsample_bytree=0.9, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=4, min_child...    reg_alpha=0.01, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'xgbreg__reg_alpha': [0.2, 0.1, 0.05, 0.01, 0.005, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='neg_mean_squared_error', verbose=True)

In [54]:
gridsearch_e0_1.best_params_

{'xgbreg__reg_alpha': 0.1}

In [55]:
-gridsearch_e0_1.best_score_

0.0008553326566301517

### Fit models using optimized hyperparameters

In [106]:
pa_bg = {'learning_rate': 0.02,      # Step size shrinkage used in update (Learning rate)
         'reg_alpha': 0.05,          # L1 regularization term on weights
         'n_estimators': 625,
         'max_depth': 3,
         'subsample': 1,
         'colsample_bytree': 0.90,
         'colsample_bylevel': 0.90,
         'silent': True,
         'random_state': 42,
         'objective': 'reg:linear'}
pa_e0 = {'learning_rate': 0.05,      # Step size shrinkage used in update (Learning rate)
         'reg_alpha': 0.10,          # L1 regularization term on weights
         'n_estimators': 300,
         'max_depth': 4,
         'subsample': 1,
         'colsample_bytree': 0.90,
         'colsample_bylevel': 0.90,
         'silent': True,
         'random_state': 42,
         'objective': 'reg:linear'}

In [107]:
# Estimator pipeline
est_bg = Pipeline([
    ('scaler', StandardScaler()),
    ('xgbreg', XGBRegressor(**pa_bg)),
])
est_e0 = Pipeline([
    ('scaler', StandardScaler()),
    ('xgbreg', XGBRegressor(**pa_e0)),
])

In [108]:
est_bg.fit(X_train, y_bg_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgbreg', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
       colsample_bytree=0.9, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=625,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
       reg_alpha=0.05, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [109]:
est_e0.fit(X_train, y_e0_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgbreg', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
       colsample_bytree=0.9, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
       reg_alpha=0.1, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

### Output predictions

In [110]:
predicted_bg = est_bg.predict(X_test)
predicted_e0 = np.expm1(est_e0.predict(X_test))

In [111]:
predicted = pd.DataFrame({'formation_energy_ev_natom': predicted_e0,
                          'bandgap_energy_ev': predicted_bg}) \
              .reset_index().rename(columns={'index': 'id'})
predicted['id'] += 1

In [112]:
predicted.head()

Unnamed: 0,id,bandgap_energy_ev,formation_energy_ev_natom
0,1,1.491695,0.225013
1,2,3.653866,0.07504
2,3,3.435462,0.174545
3,4,3.047779,0.031543
4,5,1.523106,0.150785


In [113]:
err_e0 = rmsle(np.expm1(est_e0.predict(X_train)), np.expm1(y_e0_train))
err_bg = rmsle(est_bg.predict(X_train), y_bg_train)

In [114]:
# Training RMSLE values
print "RMSLE BG: {}, RMSLE E0: {}, RMSLE AVG: {}".format(err_bg, err_e0,
                                                         0.5 * (err_bg + err_e0))

RMSLE BG: 0.0652129454193, RMSLE E0: 0.016573680748, RMSLE AVG: 0.0408933130836


#### Write to file:

In [115]:
now = dt.datetime.now().strftime("%Y%m%d-%H%M%S")
predicted.to_csv(os.path.join('output', 'xgb-ext-skl-geo-{}.csv'.format(now)),
                columns=['id', 'formation_energy_ev_natom', 'bandgap_energy_ev'],
                index=False)