In [230]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Read the computed band-gap table from disk and preview its first five rows.
computational_df = pd.read_csv(filepath_or_buffer='comp.csv')
computational_df.head(n=5)

Unnamed: 0,formula,Property band_gap,Property band_gap units,Property SMILES
0,GdCoO$_{3}$,0.1754,eV/atom,
1,PrPt,0.0,eV/atom,
2,Tl$_{2}$SeO$_{4}$,3.3122,eV/atom,
3,Ba$_{5}$(InSb$_{3}$)$_{2}$,0.0,eV/atom,
4,NiP$_{2}$,0.5442,eV/atom,


In [5]:
# Normalise the column names used by the rest of the notebook.
# Rebinding the result (instead of inplace=True) keeps the cell a plain
# input -> output transform; errors="raise" still fails loudly if either
# source column is missing.
computational_df = computational_df.rename(
    columns={"formula": "chemical_formula", "Property band_gap": "band_gap"},
    errors="raise",
)

In [6]:
# Remove the LaTeX markup characters (_, $, {, }) from every formula so that
# e.g. "GdCoO$_{3}$" becomes "GdCoO3".
latex_markup = str.maketrans("", "", "_${}")
formula_text = computational_df['chemical_formula'].astype('str')
computational_df['chemical_formula'] = formula_text.apply(
    lambda formula: formula.translate(latex_markup)
)

In [7]:
from matminer.featurizers.conversions import StrToComposition

# Convert the cleaned formula strings; later cells read the result from the
# "composition" column.
formula_parser = StrToComposition()
computational_df = formula_parser.featurize_dataframe(computational_df, 'chemical_formula')

HBox(children=(FloatProgress(value=0.0, description='StrToComposition', max=40317.0, style=ProgressStyle(descr…




In [12]:
from matminer.featurizers.composition import ElementProperty

# Featurizer built from matminer's "magpie" preset.
ep_feat = ElementProperty.from_preset(preset_name="magpie")

# Featurize the "composition" column. With ignore_errors/return_errors enabled
# the result gains an "ElementProperty Exceptions" column; presumably failed
# rows are recorded there instead of aborting the run (confirm in matminer docs).
computational_df = ep_feat.featurize_dataframe(
    computational_df,
    col_id="composition",
    ignore_errors=True,
    return_errors=True,
)
computational_df.head()



HBox(children=(FloatProgress(value=0.0, description='ElementProperty', max=40317.0, style=ProgressStyle(descri…




Unnamed: 0,chemical_formula,band_gap,Property band_gap units,Property SMILES,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,...,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,ElementProperty Exceptions
0,GdCoO3,0.1754,eV/atom,,"(Gd, Co, O)",8.0,64.0,56.0,23.0,18.0,...,0.309694,0.495511,0.0,12.0,194.0,182.0,84.8,87.36,12.0,
1,PrPt,0.0,eV/atom,,"(Pr, Pt)",59.0,78.0,19.0,68.5,9.5,...,0.0,0.0,0.0,194.0,225.0,31.0,209.5,15.5,194.0,
2,Tl2SeO4,3.3122,eV/atom,,"(Tl, Se, O)",8.0,81.0,73.0,32.571429,28.081633,...,0.0,0.0,0.0,12.0,194.0,182.0,64.285714,74.122449,12.0,
3,Ba5(InSb3)2,0.0,eV/atom,,"(Ba, In, Sb)",49.0,56.0,7.0,52.615385,2.60355,...,0.0,0.0,0.0,139.0,229.0,90.0,186.076923,33.017751,166.0,
4,NiP2,0.5442,eV/atom,,"(Ni, P)",15.0,28.0,13.0,19.333333,5.777778,...,0.198465,0.26462,0.0,2.0,225.0,223.0,76.333333,99.111111,2.0,


In [84]:
# (rows, columns) of the featurized computational table.
computational_df.shape

(40317, 138)

In [115]:
# Remove the two helper columns, then discard every row that still holds a NaN.
excluded = ["Property SMILES", "ElementProperty Exceptions"]
excluded_df = computational_df.drop(columns=excluded)
rows_with_nan = excluded_df.index[excluded_df.isna().any(axis=1)]
excluded_df = excluded_df.drop(index=rows_with_nan)
excluded_df.shape

(40311, 136)

In [118]:
# Drop the helper columns and any row containing a NaN.
excluded = ["Property SMILES", "ElementProperty Exceptions"]
excluded_df = computational_df.drop(columns=excluded).dropna()

# Features: everything except the identifier, target, unit and composition columns.
non_feature_columns = ["chemical_formula", "band_gap", "Property band_gap units", "composition"]
X = excluded_df.drop(columns=non_feature_columns)

# Target: the band gap.
y = excluded_df['band_gap']
assert len(X) == len(y)

In [123]:
# Number of rows in the feature matrix X.
len(X)

40311

In [120]:
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import numpy as np

# NOTE: despite the "min_max" names, the scaler used here is a StandardScaler.
min_max_scaler = preprocessing.StandardScaler()
X_train_minmax = min_max_scaler.fit_transform(X)

# Linear baseline fitted on the scaled features.
lr = LinearRegression()
lr.fit(X_train_minmax, y)

# Fit statistics, evaluated on the same rows the model was fitted on.
train_r2 = lr.score(X_train_minmax, y)
train_rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(X_train_minmax)))
print('training R2 = ' + str(round(train_r2, 3)))
print('training RMSE = %.3f' % train_rmse)

training R2 = 0.605
training RMSE = 1.100


In [121]:
from sklearn.ensemble import RandomForestRegressor

# 50-tree random forest with a fixed seed, fitted on the unscaled features.
rf = RandomForestRegressor(n_estimators=50, random_state=1)
rf.fit(X, y)

# Both numbers are computed on the rows the forest was fitted on.
rf_r2 = rf.score(X, y)
rf_rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=rf.predict(X)))
print('R2 = ' + str(round(rf_r2, 3)))
print('RMSE = %.3f' % rf_rmse)

R2 = 0.963
RMSE = 0.337


In [159]:
# Read the experimental band-gap table from disk and preview its first five rows.
experimental_df = pd.read_csv(filepath_or_buffer='experimental.csv')
experimental_df.head(n=5)

Unnamed: 0,Chemical formula,Band gap,Crystallinity,Color
0,Bi$_{2}$Te$_{3}$,0.153,Single crystalline,
1,Mg$_{2}$Ge,0.567,Single crystalline,
2,CoSi,0.045,Single crystalline,
3,NaBr,7.025,Single crystalline,
4,Ca$_{2}$Sn,0.9,Polycrystalline,


In [160]:
# Give the experimental table the same column names as the computational one.
# Rebinding the result (instead of inplace=True) keeps the cell a plain
# input -> output transform; errors="raise" still fails loudly if either
# source column is missing.
experimental_df = experimental_df.rename(
    columns={"Chemical formula": "chemical_formula", "Band gap": "band_gap"},
    errors="raise",
)

In [161]:
# Remove the LaTeX markup characters (_, $, {, }) from every formula.
latex_markup = str.maketrans("", "", "_${}")
experimental_df['chemical_formula'] = (
    experimental_df['chemical_formula']
    .astype('str')
    .apply(lambda formula: formula.translate(latex_markup))
)


def _text_before_dollar(text):
    """Return `text` up to its first '$'; unchanged if '$' is absent or is the first character."""
    cut = text.find('$')
    if cut > 0:
        return text[:cut]
    return text


# Keep only the part of each band-gap entry before any '$', then parse it as float.
experimental_df['band_gap'] = (
    experimental_df['band_gap'].astype('str').apply(_text_before_dollar).astype(float)
)

In [162]:
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty

# Step 1: convert the formula strings (the next step reads the "composition" column).
formula_parser = StrToComposition()
experimental_df = formula_parser.featurize_dataframe(experimental_df, 'chemical_formula')

# Step 2: compute the "magpie" preset features from the "composition" column.
ep_feat = ElementProperty.from_preset(preset_name="magpie")
experimental_df = ep_feat.featurize_dataframe(experimental_df, col_id="composition")
experimental_df.head()

HBox(children=(FloatProgress(value=0.0, description='StrToComposition', max=1448.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='ElementProperty', max=1448.0, style=ProgressStyle(descrip…




Unnamed: 0,chemical_formula,band_gap,Crystallinity,Color,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Bi2Te3,0.153,Single crystalline,,"(Bi, Te)",52.0,83.0,31.0,64.4,14.88,...,0.0,0.0,0.0,0.0,12.0,152.0,140.0,96.0,67.2,152.0
1,Mg2Ge,0.567,Single crystalline,,"(Mg, Ge)",12.0,32.0,20.0,18.666667,8.888889,...,0.0,0.0,0.0,0.0,194.0,225.0,31.0,204.333333,13.777778,194.0
2,CoSi,0.045,Single crystalline,,"(Co, Si)",14.0,27.0,13.0,20.5,6.5,...,1.548471,0.774236,0.774236,0.0,194.0,227.0,33.0,210.5,16.5,194.0
3,NaBr,7.025,Single crystalline,,"(Na, Br)",11.0,35.0,24.0,23.0,12.0,...,0.0,0.0,0.0,0.0,64.0,229.0,165.0,146.5,82.5,64.0
4,Ca2Sn,0.9,Polycrystalline,,"(Ca, Sn)",20.0,50.0,30.0,30.0,13.333333,...,0.0,0.0,0.0,0.0,141.0,225.0,84.0,197.0,37.333333,225.0


In [163]:
# Keep only rows that have both a band gap and a crystallinity label.
experimental_df = experimental_df.dropna(subset=['band_gap', "Crystallinity"])

# Replace the crystallinity text by integer codes; factorize numbers the labels
# from 0 in order of first appearance, and the +1 shifts them to start at 1.
crystallinity_codes, crystallinity_labels = pd.factorize(experimental_df['Crystallinity'])
experimental_df['Crystallinity'] = crystallinity_codes + 1

In [164]:
# Preview the first rows of the experimental table.
experimental_df.head()

Unnamed: 0,chemical_formula,band_gap,Crystallinity,Color,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Bi2Te3,0.153,1,,"(Bi, Te)",52.0,83.0,31.0,64.4,14.88,...,0.0,0.0,0.0,0.0,12.0,152.0,140.0,96.0,67.2,152.0
1,Mg2Ge,0.567,1,,"(Mg, Ge)",12.0,32.0,20.0,18.666667,8.888889,...,0.0,0.0,0.0,0.0,194.0,225.0,31.0,204.333333,13.777778,194.0
2,CoSi,0.045,1,,"(Co, Si)",14.0,27.0,13.0,20.5,6.5,...,1.548471,0.774236,0.774236,0.0,194.0,227.0,33.0,210.5,16.5,194.0
3,NaBr,7.025,1,,"(Na, Br)",11.0,35.0,24.0,23.0,12.0,...,0.0,0.0,0.0,0.0,64.0,229.0,165.0,146.5,82.5,64.0
4,Ca2Sn,0.9,2,,"(Ca, Sn)",20.0,50.0,30.0,30.0,13.333333,...,0.0,0.0,0.0,0.0,141.0,225.0,84.0,197.0,37.333333,225.0


In [165]:
# Features: every column except the target and the non-numeric columns.
# "Crystallinity" is not in this list, so it stays in as a feature.
excluded = ["band_gap", "chemical_formula", "Color", "composition"]
X_experiment = experimental_df.drop(columns=excluded)

# Target: band gap as a plain NumPy array.
y_experiment = experimental_df['band_gap'].to_numpy()

In [167]:
# (rows, feature columns) of the experimental feature matrix.
X_experiment.shape

(1275, 133)

In [168]:
# Shape of the experimental target array.
y_experiment.shape

(1275,)

In [170]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Linear baseline fitted directly on the (unscaled) experimental features.
lr_baseline_experimental = LinearRegression()
lr_baseline_experimental.fit(X_experiment, y_experiment)

# Fit statistics, evaluated on the same rows the model was fitted on.
baseline_r2 = lr_baseline_experimental.score(X_experiment, y_experiment)
baseline_rmse = np.sqrt(
    mean_squared_error(y_true=y_experiment, y_pred=lr_baseline_experimental.predict(X_experiment))
)
print('training R2 = ' + str(round(baseline_r2, 3)))
print('training RMSE = %.3f' % baseline_rmse)



training R2 = 0.863
training RMSE = 0.836


In [171]:
from sklearn.model_selection import KFold, cross_val_score

# Use 10-fold cross validation (90% training, 10% test)
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)

# 'neg_mean_squared_error' returns -MSE for each fold, so negate it before
# taking the square root to obtain the per-fold RMSE.
scores = cross_val_score(lr_baseline_experimental, X_experiment, y_experiment,
                         scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
rmse_scores = [np.sqrt(-s) for s in scores]

r2_scores = cross_val_score(lr_baseline_experimental, X_experiment, y_experiment,
                            scoring='r2', cv=crossvalidation, n_jobs=1)

print('Cross-validation results:')
# R2 is a signed score: a fold where the model does worse than predicting the
# mean gives a negative value. Average the raw values -- taking abs() first
# would turn such folds into positive ones and overstate the mean.
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(r2_scores)))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(rmse_scores)))

Cross-validation results:
Folds: 10, mean R2: 0.781
Folds: 10, mean RMSE: 1.019


In [174]:
from sklearn.ensemble import RandomForestRegressor

# 50-tree random forest with a fixed seed, fitted on the experimental features.
rf_baseline_experimental = RandomForestRegressor(n_estimators=50, random_state=1)
rf_baseline_experimental.fit(X_experiment, y_experiment)

# Both numbers are computed on the rows the forest was fitted on.
rf_experimental_r2 = rf_baseline_experimental.score(X_experiment, y_experiment)
rf_experimental_rmse = np.sqrt(
    mean_squared_error(y_true=y_experiment, y_pred=rf_baseline_experimental.predict(X_experiment))
)
print('R2 = ' + str(round(rf_experimental_r2, 3)))
print('RMSE = %.3f' % rf_experimental_rmse)

R2 = 0.971
RMSE = 0.387


In [177]:
# Features for the crystallinity classifier: drop the non-numeric columns and
# the target itself. "band_gap" is not in this list, so it is used as a feature.
excluded = ["chemical_formula", "Color", "composition", "Crystallinity"]
X_experiment_crystal = experimental_df.drop(columns=excluded)

# Target: the integer-encoded crystallinity as a NumPy array.
y_experiment_crystal = experimental_df['Crystallinity'].to_numpy()

In [234]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# 50-tree random-forest classifier with a fixed seed.
rf_baseline_experimental_crystal = RandomForestClassifier(n_estimators=50, random_state=1)

# Hold out 40% of the rows for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X_experiment_crystal,
    y_experiment_crystal,
    test_size=0.4,
    random_state=42,
)
rf_baseline_experimental_crystal.fit(X_train, y_train)

# Weighted F1 on the held-out rows; as the last expression it is the cell output.
crystal_predictions = rf_baseline_experimental_crystal.predict(X_test)
f1_score(y_test, crystal_predictions, average='weighted')

0.7711441684688172

In [235]:
# 8-fold shuffled cross-validation of the crystallinity classifier.
crossvalidation = KFold(n_splits=8, shuffle=True, random_state=1)

# Run the cross-validation once. The original cell issued the same call twice;
# both the estimator and the splitter are seeded, so the second run only
# repeated the work and produced the same numbers.
f1_scores = cross_val_score(rf_baseline_experimental_crystal, X_experiment_crystal, y_experiment_crystal,
                            scoring='f1_micro', cv=crossvalidation, n_jobs=1)
# Keep `scores` bound to an independent array, as the original cell did.
scores = f1_scores.copy()
f1_scores.mean()

0.8086035770440252

In [214]:
excluded = ["chemical_formula", "Color", "composition"]

# Keep only rows with a colour label. The explicit .copy() makes `df` an
# independent frame, so the column assignment below cannot raise
# SettingWithCopyWarning or be mistaken for a write into experimental_df.
df = experimental_df.dropna(subset=["Color"]).copy()

# Encode the colour names as integers; factorize numbers the labels from 0 in
# order of first appearance, and the +1 shifts them to start at 1.
df['Color'] = pd.factorize(df['Color'])[0] + 1

# Features: everything except the non-numeric columns and the target.
# "band_gap" and "Crystallinity" are not excluded, so both are used as features.
X_experiment_color = df.drop(excluded, axis=1)
y_experiment_color = df['Color'].values
assert len(X_experiment_color) == len(y_experiment_color)

In [215]:
# Display the feature table used for the colour classifier.
X_experiment_color

Unnamed: 0,band_gap,Crystallinity,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
12,0.404,1,33.0,49.0,16.0,41.000000,8.000000,33.0,75.0,84.0,...,0.0,0.0,0.0,0.0,139.0,166.0,27.0,152.500000,13.500000,139.0
14,1.320,2,16.0,57.0,41.0,32.400000,19.680000,16.0,13.0,88.0,...,0.0,0.0,0.0,0.0,70.0,194.0,124.0,119.600000,59.520000,70.0
19,6.100,1,8.0,20.0,12.0,14.000000,6.000000,8.0,7.0,87.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,118.500000,106.500000,12.0
26,1.250,1,34.0,40.0,6.0,35.500000,2.250000,34.0,44.0,89.0,...,0.0,0.0,0.0,0.0,14.0,194.0,180.0,59.000000,67.500000,14.0
30,2.690,1,32.0,34.0,2.0,33.333333,0.888889,34.0,79.0,89.0,...,0.0,0.0,0.0,0.0,14.0,225.0,211.0,84.333333,93.777778,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1377,1.960,1,16.0,72.0,56.0,34.666667,24.888889,16.0,45.0,88.0,...,0.0,0.0,0.0,0.0,70.0,194.0,124.0,111.333333,55.111111,70.0
1387,0.500,1,5.0,13.0,8.0,9.000000,4.000000,5.0,72.0,73.0,...,0.0,0.0,0.0,0.0,166.0,225.0,59.0,195.500000,29.500000,166.0
1390,0.910,1,34.0,50.0,16.0,42.000000,8.000000,34.0,80.0,89.0,...,0.0,0.0,0.0,0.0,14.0,141.0,127.0,77.500000,63.500000,14.0
1405,2.670,1,30.0,34.0,4.0,32.000000,2.000000,30.0,69.0,89.0,...,0.0,0.0,0.0,0.0,14.0,194.0,180.0,104.000000,90.000000,14.0


In [216]:
# Display the integer-encoded colour labels.
y_experiment_color

array([ 1,  2,  3,  1,  4,  5,  3,  6,  7,  8,  5,  9, 10,  6, 10,  5, 11,
        5, 12,  6,  5, 10,  1, 10,  6,  3, 10,  5,  3,  6, 10,  6,  3, 10,
       13, 10, 14,  5, 15, 13, 16,  5, 10, 17,  6,  6,  6,  4, 14, 18, 19,
       11,  3, 10,  1,  5, 20,  6,  5,  5, 19,  3, 10,  5,  1,  1,  3,  3,
        9, 10,  3, 10,  5,  5,  5,  6, 21, 13, 19, 18,  7,  5, 10, 18,  1,
        5,  3,  3, 22,  3, 10, 10,  5,  5,  6, 13,  5,  3, 10,  5,  1, 10,
       13,  5, 23,  6, 18, 10,  3, 10,  3,  8,  5, 10,  3,  1, 18,  6, 24,
       10, 19,  7,  6, 10,  7])

In [232]:
# 50-tree random-forest classifier with a fixed seed for the colour labels.
rf_baseline_experimental_color = RandomForestClassifier(n_estimators=50, random_state=1)

# Hold out 40% of the rows for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X_experiment_color,
    y_experiment_color,
    test_size=0.4,
    random_state=42,
)
rf_baseline_experimental_color.fit(X_train, y_train)

# Weighted F1 on the held-out rows; as the last expression it is the cell output.
color_predictions = rf_baseline_experimental_color.predict(X_test)
f1_score(y_test, color_predictions, average='weighted')

0.5062937062937063

In [237]:
# 5-fold shuffled cross-validation of the colour classifier.
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=1)

# Run the cross-validation once. The original cell issued the same call twice;
# both the estimator and the splitter are seeded, so the second run only
# repeated the work and produced the same numbers.
f1_scores = cross_val_score(rf_baseline_experimental_color, X_experiment_color, y_experiment_color,
                            scoring='f1_micro', cv=crossvalidation, n_jobs=1)
# Keep `scores` bound to an independent array, as the original cell did.
scores = f1_scores.copy()
f1_scores.mean()

0.56