In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [None]:
def are(y_true, y_pred):
    """Mean absolute relative error between targets and predictions.

    Computes ``mean(|y_true - y_pred| / |y_true|)``.

    Args:
        y_true: Array-like of reference values. A zero entry leads to a
            division by zero (``inf``/``nan`` in the result).
        y_pred: Array-like of predictions, broadcastable against ``y_true``.

    Returns:
        The mean absolute relative error over all elements.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Divide by |y_true| so a negative target cannot flip the sign of its
    # contribution and cancel out errors from positive targets.
    return np.mean(np.abs(y_true - y_pred) / np.abs(y_true))

def msre(y_true, y_pred):
    """Mean squared relative error between targets and predictions.

    Computes ``mean(((y_true - y_pred) / y_true) ** 2)``.

    Args:
        y_true: Array-like of reference values. A zero entry leads to a
            division by zero (``inf``/``nan`` in the result).
        y_pred: Array-like of predictions, broadcastable against ``y_true``.

    Returns:
        The mean squared relative error over all elements.
    """
    # Coerce first so plain Python lists/tuples work as well as ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    relative_error = (y_true - y_pred) / y_true
    return np.mean(relative_error ** 2)

def rmrse(y_true, y_pred):
    """Root mean squared relative error: the square root of ``msre``."""
    mean_sq_rel_err = msre(y_true, y_pred)
    return np.sqrt(mean_sq_rel_err)

def myeval(y_true, y_pred):
    """Evaluate predictions with both relative-error metrics.

    Returns:
        A 2-tuple ``(are, rmrse)`` computed from the same inputs.
    """
    abs_rel_err = are(y_true, y_pred)
    root_sq_rel_err = rmrse(y_true, y_pred)
    return abs_rel_err, root_sq_rel_err


## Preprocessing

In [None]:
##########################################################
# Change the file path below before running this code cell.
##########################################################

file = r"provide/path/to/CNT_bundle_global_features.xlsx"
df = pd.read_excel(file)

# Keep every column up to and including 'Elastic Modulus (GPa)'; columns to
# its right are discarded. Rows with any missing value are then dropped.
end = df.columns.get_loc('Elastic Modulus (GPa)')
df = df.iloc[:, :end + 1].dropna()

# Large models have # of Atoms > 20,000; keep only rows at or below that.
df = df.loc[df['# of Atoms'] <= 20000]

# The last two remaining columns become the targets. Everything before them,
# minus the columns listed below, becomes the feature matrix.
excluded_cols = ["File name", "Initial Dia (Ang.)", "File #", "Strain at Break", "1/2 Stress (GPa)", "1/2 Strain"]
df_targets = df.iloc[:, -2:]
df_feats = df.iloc[:, :-2].drop(columns=excluded_cols)

targets = df_targets.to_numpy()
feats = df_feats.to_numpy()

In [10]:
# Column 0 of `targets` is used as the strength target, column 1 as modulus.
strength, modulus = targets[:, 0], targets[:, 1]

## Training

In [13]:
# Both targets are fitted with the same hyper-parameters, kept in one place.
xgb_params = dict(learning_rate=0.015, n_estimators=700, max_depth=5)
sregressor = xgb.XGBRegressor(**xgb_params)  # fitted on `strength`
mregressor = xgb.XGBRegressor(**xgb_params)  # fitted on `modulus`

train_sizes = [0.9]

# At most this many rows of each test split are evaluated (25% of all rows).
# Loop-invariant, so computed once.
max_test = round(len(feats) * 0.25)

# Maps 's_<train_size>' / 'm_<train_size>' -> [mean ARE, std ARE] over the
# repeated splits below.
sresults = {}
mresults = {}

for size_idx, train_size in enumerate(train_sizes):

    s_pred_ares = []
    m_pred_ares = []

    for exp in range(5):

        # Distinct seed per repetition (0, 1, 4, 9, 16 offset by the size index).
        state = size_idx + exp * exp

        # A single call splits the features and both targets on the same rows;
        # two separate calls with the same seed would select identical rows.
        sxtrain, sxtest, sytrain, sytest, mytrain, mytest = train_test_split(
            feats, strength, modulus, random_state=state, train_size=train_size)
        # Both models share the same feature rows; the m* names are kept as aliases.
        mxtrain, mxtest = sxtrain, sxtest

        sregressor.fit(sxtrain, sytrain, verbose=False)
        mregressor.fit(mxtrain, mytrain, verbose=False)

        test_spred = sregressor.predict(sxtest[:max_test])
        test_mpred = mregressor.predict(mxtest[:max_test])

        # myeval returns (ARE, RMRSE); only the ARE is aggregated below.
        stare, strmrse = myeval(sytest[:max_test], test_spred)
        mtare, mtrmrse = myeval(mytest[:max_test], test_mpred)

        s_pred_ares.append(stare)
        m_pred_ares.append(mtare)

    sresults['s_' + str(train_size)] = [np.mean(s_pred_ares), np.std(s_pred_ares)]
    mresults['m_' + str(train_size)] = [np.mean(m_pred_ares), np.std(m_pred_ares)]

sresults, mresults



({'s_0.9': [0.05152480653394502, 0.022148542558623512]},
 {'m_0.9': [0.06307685632118651, 0.01314471448124323]})

## Usable from command line (if proper preprocessing was performed)

In [None]:
def fold_indices(index, cf, num_test):
    """Split ``index`` into test / validation / training parts for fold ``cf``.

    Args:
        index: 1-D array of sample indices (already in the desired order).
        cf: Zero-based fold number.
        num_test: Number of samples in the test fold (also used as the size
            of the validation part).

    Returns:
        ``(test_idx, valid_idx, train_idx)``. The test part is the ``cf``-th
        contiguous chunk of ``index``. Everything else (the entries after
        that chunk followed by the entries before it) is split into the
        first ``num_test`` entries for validation and the remainder for
        training, so the three parts never overlap.
    """
    test_idx = index[cf * num_test:(cf + 1) * num_test]
    # Everything outside the test chunk. The second slice must stop at the
    # start of the test chunk (cf * num_test); stopping at a fixed `num_test`
    # puts the test fold back into training for cf == 0 and drops data for
    # cf >= 2.
    other_idx = np.append(index[(cf + 1) * num_test:], index[:cf * num_test])
    valid_idx = other_idx[:num_test]
    train_idx = other_idx[num_test:]
    return test_idx, valid_idx, train_idx


if __name__ == '__main__':
    # NOTE: inside a notebook kernel __name__ is '__main__', so this cell runs
    # when executed and rebinds `feats` and `targets`.

    pt = 's'  # which target column to fit; a key of `mapping`
    mapping = {'s': 0, 'm': 1}  # key -> column of all_targets.npy

    feats = np.load('glb_feats.npy')
    targets = np.load('all_targets.npy')[:, mapping[pt]]

    index = np.load('newidx.npy')
    large_idx = np.load('lrg_test_idx.npy')

    # Each of the 10 folds tests on 10% of `index`.
    num_test = int(0.1 * len(index))

    results = []
    for cf in range(10):
        test_idx, valid_idx, train_idx = fold_indices(index, cf, num_test)

        valid_set = (feats[valid_idx], targets[valid_idx])
        train_set = (feats[train_idx], targets[train_idx])

        foldresult = []

        for exp in range(5):

            regressor = xgb.XGBRegressor(learning_rate=0.015,
                                         n_estimators=700,
                                         max_depth=5)

            # The last entry of eval_set (the validation split) drives early stopping.
            # NOTE(review): passing eval_metric / early_stopping_rounds to fit()
            # is deprecated in newer xgboost releases in favour of constructor
            # arguments — confirm against the installed version.
            regressor.fit(train_set[0], train_set[1], eval_metric='rmsle',
                          eval_set=[train_set, valid_set], early_stopping_rounds=20, verbose=False)

            test_pred = regressor.predict(feats[test_idx])

            # myeval (not the builtin eval) returns (ARE, RMRSE).
            tare, trmrse = myeval(targets[test_idx], test_pred)

            test_large = regressor.predict(feats[large_idx])

            lare, lrmrse = myeval(targets[large_idx], test_large)

            foldresult.append([tare, trmrse, lare, lrmrse])

        # Average the four metrics over the repetitions of this fold.
        tare, trmrse, lare, lrmrse = np.mean(foldresult, 0)

        print('Fold {:d} | Test ARE {:.2f} | Test RMRSE {:.2f} | Large ARE {:.2f}| Large RMRSE {:.2f}'.format(cf, tare*100, trmrse*100, lare*100, lrmrse*100))

        results.append(foldresult)

    # Shape: (folds, repetitions, 4 metrics).
    results = np.array(results)

    # Per-metric mean and variance over all runs (axis=0 belongs inside the call).
    print(np.mean(results.mean(0), 0))
    print(np.var(results.reshape(-1, 4), 0))

    out_path = pt + 'result/xgb_valid.npy'
    # Create the output directory first so the save cannot fail after training.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    np.save(out_path, results)