In [2]:
## Prepare the data
# Load the piezoelectric dataset and build composition-based features with matminer.
import numpy as np
import pandas as pd
from matminer.featurizers.composition import ElementProperty, OxidationStates
from matminer.featurizers.conversions import (
    CompositionToOxidComposition,
    StrToComposition,
)

# `skiprows` takes 0-indexed file line numbers, so 38 is the 39th line of the file:
# its formula, Cs4A6S5, is skipped because "A" is not an element.
# NOTE(review): line index 305 is skipped as well, but the reason is not recorded
# anywhere in this notebook -- confirm and document it.
dataset = pd.read_csv(r'Piezoelectric_renewed.csv', skiprows=[38, 305])

# Take an explicit copy of the selected columns: the featurizers below add columns,
# and doing that on a slice of `dataset` can trigger pandas chained-assignment warnings.
X = dataset[['Materials', 'Piezoelectric_Modulus', 'Crystal_Symmetry', 'mp_id']].copy()

# Formula strings in "Materials" -> "composition" column.
X = StrToComposition().featurize_dataframe(X, 'Materials')

# Magpie element-property statistics ("MagpieData ..." columns) from the "composition" column.
ep_feat = ElementProperty.from_preset(preset_name='magpie')
X = ep_feat.featurize_dataframe(X, col_id='composition')

# Oxidation-state-decorated compositions -> "composition_oxid" column.
X = CompositionToOxidComposition().featurize_dataframe(X, "composition")

# Oxidation-state statistics (minimum / maximum / range / std_dev) from "composition_oxid".
os_feat = OxidationStates()
X = os_feat.featurize_dataframe(X, "composition_oxid")
X.head()

StrToComposition:   0%|          | 0/1703 [00:00<?, ?it/s]

ElementProperty:   0%|          | 0/1703 [00:00<?, ?it/s]

CompositionToOxidComposition:   0%|          | 0/1703 [00:00<?, ?it/s]

OxidationStates:   0%|          | 0/1703 [00:00<?, ?it/s]

Unnamed: 0,Materials,Piezoelectric_Modulus,Crystal_Symmetry,mp_id,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,...,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,composition_oxid,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state
0,GaBN2,0.739662,tetragonal,mp-1007823,"(Ga, B, N)",5.0,31.0,26.0,12.5,9.25,...,194.0,130.0,154.5,45.25,194.0,"(Ga3+, B3+, N3-)",-3,3,6,3.794733
1,BC2N,0.65775,tetragonal,mp-1008523,"(B, C, N)",5.0,7.0,2.0,6.0,0.5,...,194.0,28.0,187.0,10.5,194.0,"(B3+, C4+, C4-, N3-)",-4,4,8,4.082483
2,B2AsP,0.054518,tetragonal,mp-1008528,"(B, As, P)",5.0,33.0,28.0,14.5,9.5,...,166.0,164.0,125.0,61.5,166.0,"(B3+, As3-, P3-)",-3,3,6,3.794733
3,AlGaN2,0.480111,tetragonal,mp-1008556,"(Al, Ga, N)",7.0,31.0,24.0,14.5,8.25,...,225.0,161.0,169.25,52.625,194.0,"(Al3+, Ga3+, N3-)",-3,3,6,3.794733
4,NdBiPd,0.437877,cubic,mp-1008858,"(Nd, Bi, Pd)",46.0,83.0,37.0,63.0,13.333333,...,225.0,213.0,143.666667,87.777778,12.0,"(Nd0+, Bi0+, Pd0+)",0,0,0,0.0


In [3]:
# Plain-text dump of the featurized frame, the Magpie featurizer object,
# and the full list of column names, in that order.
for inspected in (X, ep_feat, X.columns):
    print(inspected)

     Materials  Piezoelectric_Modulus Crystal_Symmetry       mp_id  \
0        GaBN2               0.739662       tetragonal  mp-1007823   
1         BC2N               0.657750       tetragonal  mp-1008523   
2        B2AsP               0.054518       tetragonal  mp-1008528   
3       AlGaN2               0.480111       tetragonal  mp-1008556   
4       NdBiPd               0.437877            cubic  mp-1008858   
...        ...                    ...              ...         ...   
1698    Na3PS4               0.928830            cubic   mp-985584   
1699  Zn3Sn2O7               1.123010     orthorhombic    mvc-3343   
1700   Zn3W2O7               0.362771     orthorhombic    mvc-3714   
1701  Te(WO4)2               3.132885        triclinic     mvc-667   
1702  Zn2Sn3O8               0.227761        hexagonal    mvc-7701   

       composition  MagpieData minimum Number  MagpieData maximum Number  \
0       (Ga, B, N)                        5.0                       31.0   
1      

In [4]:
# Distinct crystal-symmetry labels present in the dataset.
X.loc[:, "Crystal_Symmetry"].unique()

array(['tetragonal', 'cubic', 'monoclinic', 'orthorhombic', 'trigonal',
       'hexagonal', 'triclinic'], dtype=object)

In [5]:
## Split data into training and testing data
# scikit-learn helper used to split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Labels: the quantity being regressed.
y = X['Piezoelectric_Modulus'].values

# Features: everything except identifiers, the label itself, and the
# non-numeric composition objects.
excluded = ["Materials", "Piezoelectric_Modulus", "Crystal_Symmetry", "mp_id", "composition", 'composition_oxid']
Xx = X.drop(columns=excluded)
feature_list = Xx.columns.tolist()

# Hold out 25% of the rows for testing; the seed is fixed so the split is repeatable.
train_Xx, test_Xx, train_y, test_y = train_test_split(
    Xx,
    y,
    test_size=0.25,
    random_state=42,
)

# Report the shape of each piece of the split.
for caption, part in (
    ('Training Features Shape:', train_Xx),
    ('Training Labels Shape:', train_y),
    ('Testing Features Shape:', test_Xx),
    ('Testing Labels Shape:', test_y),
):
    print(caption, part.shape)

Training Features Shape: (1277, 136)
Training Labels Shape: (1277,)
Testing Features Shape: (426, 136)
Testing Labels Shape: (426,)


In [None]:
# Fit a random-forest regressor and compare it against a constant-mean baseline.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: predict the mean of the *training* labels for every test sample.
# Only train_y is used, so no test-set information leaks into the baseline.
baseline_preds = train_y.mean()
# Absolute error of the baseline on each test sample.
baseline_errors = np.abs(baseline_preds - test_y)

# This one takes a while to run. The larger n_estimators, the more trees and the longer it takes.
regressor = RandomForestRegressor(n_estimators=200, random_state=5)
regressor.fit(train_Xx, train_y)
predictions = regressor.predict(test_Xx)

# Absolute error of the model on each test sample, and the test-set MSE.
errors = np.abs(predictions - test_y)
mse = mean_squared_error(test_y, predictions)

print('Average baseline error: ', round(np.mean(baseline_errors), 5))
print('Mean Absolute Error:', round(np.mean(errors), 5))
print('Mean Squared Error: ', mse)
print('Root mean squared error: ', np.sqrt(mse))
print('R^2 score: ', r2_score(test_y, predictions))


In [None]:
# Parity plot (true vs. predicted test labels) using matminer's Plotly wrapper.
# NOTE(review): matminer.figrecipes may not be available in recent matminer
# releases -- confirm against the installed version.
from matminer.figrecipes.plot import PlotlyFig

rf_plot_options = dict(
    x_title='True',
    y_title='RF predict',
    title='RF Piezoelectric_modulus regression',
    mode='notebook',
    filename="rf_regression.html",
)
pf_rf = PlotlyFig(**rf_plot_options)

# Scatter of true test labels (x) against random-forest predictions (y).
true_vs_predicted = (test_y, predictions)
pf_rf.xy(true_vs_predicted)
# A variant passing labels/modes/lines to add a dashed reference line, and an
# export via pf_rf.write_image("magpie_scatter.pdf"), were left disabled here.






In [None]:
# Parity plot of the random-forest predictions with Plotly Express,
# shown inline and saved as a PNG.
import plotly.graph_objects as go
import numpy as np
np.random.seed(1)

import plotly.express as px

# With array inputs, Plotly Express names the axes "x" and "y"; `labels` maps those
# names to the text displayed on the axes. The previous labels came from the Iris
# example and did not describe this data.
fig = px.scatter(
    x=test_y,
    y=predictions,
    labels={
        "x": "True Piezoelectric_Modulus",
        "y": "RF-predicted Piezoelectric_Modulus",
    },
    title="Piezoelectric_modulus regression",
)
fig.show()
# Static export; write_image needs an image-export backend (e.g. kaleido) installed.
fig.write_image("Magpie-scatter.png")


In [None]:
# 10-fold cross-validated R^2 of the random-forest model on the full feature set.
from sklearn.model_selection import cross_val_score

# NOTE(review): an integer `cv` with a regressor presumably means unshuffled KFold;
# confirm that the row order of Xx does not bias the folds.
scores = cross_val_score(
    regressor,
    Xx,
    y,
    scoring='r2',
    cv=10,
)
print(scores)

# Arithmetic mean of the per-fold scores.
mean_r2 = sum(scores) / len(scores)
print(mean_r2)

In [None]:
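# Regression scorers accepted by scikit-learn's `scoring=` argument.
# Each scorer name below is followed by the metric function it wraps.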

# ‘explained_variance’

# metrics.explained_variance_score

# ‘max_error’

# metrics.max_error

# ‘neg_mean_absolute_error’

# metrics.mean_absolute_error

# ‘neg_mean_squared_error’

# metrics.mean_squared_error

# ‘neg_root_mean_squared_error’

# metrics.mean_squared_error with squared=False (metrics.root_mean_squared_error in scikit-learn >= 1.4)

# ‘neg_mean_squared_log_error’

# metrics.mean_squared_log_error

# ‘neg_median_absolute_error’

# metrics.median_absolute_error

# ‘r2’

# metrics.r2_score

# ‘neg_mean_poisson_deviance’

# metrics.mean_poisson_deviance

# ‘neg_mean_gamma_deviance’

# metrics.mean_gamma_deviance

# ‘neg_mean_absolute_percentage_error’

# metrics.mean_absolute_percentage_error



In [None]:
# Static matplotlib parity plot: true test labels vs. random-forest predictions.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(test_y, predictions, color='black')

# Keep the default axis ticks and label both axes so the magnitudes of the true and
# predicted values can be read off the figure (the ticks were previously blanked out).
ax.set_xlabel('True Piezoelectric_Modulus')
ax.set_ylabel('RF-predicted Piezoelectric_Modulus')
ax.set_title('RF Piezoelectric_modulus regression')

plt.show()