In [None]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd

### Pymatgen imports ###
from pymatgen.ext.matproj import MPRester
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.base import MultipleFeaturizer

### Sklearn imports ###
from sklearn import ensemble
from sklearn.model_selection import KFold , cross_val_score
from sklearn.model_selection import cross_val_predict

In [None]:
# Materials Project credentials. The key is taken from the PMG_MAPI_KEY
# environment variable when it is set, so a real key never has to be saved in
# the notebook. The placeholder is only a fallback and must be replaced by hand
# if the variable is not defined.
API_key = os.environ.get('PMG_MAPI_KEY', 'XXXXXXXXXXXXX')

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Two clients built from the same key: the pymatgen REST client and the
# matminer retrieval helper that is used below to build the dataset.
m = MPRester(API_key)
mpr = MPDataRetrieval(API_key)

In [None]:
# Query filter: 'S' must be among the entry's elements and band_gap must be
# greater than 0.2 (operators are written in the '$in' / '$gt' query syntax).
criteria = {
    'elements': {'$in': ['S']},
    'band_gap': {'$gt': 0.2},
}

# Fields requested for every matching entry.
properties = [
    'task_id',
    'formula',
    'elements',
    'anonymous_formula',
    'formation_energy_per_atom',
    'e_above_hull',
]

In [None]:
# Run the query defined by `criteria`, requesting only the fields listed in
# `properties`; the result is the raw table used by every later cell.
dataset = mpr.get_dataframe(criteria=criteria, properties=properties)

In [None]:
# Step 1: convert the 'formula' column with StrToComposition. inplace=True
# means `dataset` itself is modified rather than a copy being returned.
# NOTE(review): the new column is presumably named 'composition', since that
# is the column featurized below - confirm against matminer.
stc = StrToComposition()
stc.featurize_dataframe(dataset, col_id='formula', inplace=True)

# Step 2: combine the stoichiometry featurizer with the "magpie" preset of
# element-property features into a single featurizer.
composition_featurizers = [
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset("magpie"),
]
feature_calculators = MultipleFeaturizer(composition_featurizers)

# Names of the generated feature columns, kept for reference.
feature_labels = feature_calculators.feature_labels()

# Step 3: featurize the 'composition' column. ignore_errors=True asks the
# featurizer not to raise on entries it cannot featurize.
sulfides = feature_calculators.featurize_dataframe(
    dataset,
    col_id='composition',
    ignore_errors=True,
)

In [None]:
# Sort the columns alphabetically so the feature order is predictable when the
# same pipeline is later applied to new data.
sulfides = sulfides.reindex(columns=sorted(sulfides.columns))

# Columns that are identifiers, raw query fields, or the target itself; all
# remaining columns are used as model inputs.
non_feature_cols = (
    'task_id',
    'formula',
    'elements',
    'anonymous_formula',
    'formation_energy_per_atom',
    'e_above_hull',
    'composition',
)
X_cols = [c for c in sulfides.columns if c not in non_feature_cols]

# Feature matrix and regression target as plain arrays.
X = sulfides[X_cols].values
y = sulfides['formation_energy_per_atom'].values

In [None]:
# Gradient-boosted regressor mapping the composition features in X to the
# formation energy per atom in y.
#
# random_state is fixed because the model is stochastic: subsample < 1 draws a
# random subset of rows for each tree, and max_features restricts each split
# to a random subset of features. Without a seed, both the fitted model and
# the cross-validation score change from run to run.
gbr = ensemble.GradientBoostingRegressor(
    max_depth=20,
    learning_rate=0.014485,
    min_samples_split=65,
    min_samples_leaf=1,
    max_features=86,  # must not exceed the number of columns in X
    subsample=0.9,
    n_estimators=100,
    random_state=1,
)

# Fit on the full data set; this fitted estimator is the one that gets saved.
# (A leftover statement that recomputed X identically after the fit was
# removed - X is already defined by the feature-selection cell.)
gbr.fit(X, y)

In [None]:
# 10-fold cross-validation with shuffling; the fixed random_state keeps the
# fold assignment identical between runs.
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)

# One score per fold. With 'neg_mean_squared_error' the scores are negated
# MSE values, hence the abs() before taking the square root below.
scores = cross_val_score(
    gbr,
    X,
    y,
    cv=crossvalidation,
    scoring='neg_mean_squared_error',
    n_jobs=5,
)

# The reported figure is the square root of the mean fold MSE (not the mean of
# the per-fold RMSE values).
average_score = np.mean(scores)
rmse = np.sqrt(abs(average_score))
print('GBR model RMSE: {}'.format(rmse))

In [None]:
# Persist the fitted model to disk. The context manager guarantees that the
# file is flushed and closed even if pickling raises.
# NOTE: only load this file back with pickle from a trusted source -
# unpickling can execute arbitrary code.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(gbr, model_file)