### Import libraries and data

In [None]:
import pandas as pd
import csv
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the volumes table and discard its "Unnamed: 0" column in one step.
# NOTE(review): "Unnamed: 0" is presumably a row index written out when the
# CSV was saved - confirm against the file.
vols = pd.read_csv("volumes.csv").drop("Unnamed: 0", axis=1)

In [None]:
# Read the temperatures table and discard its "Unnamed: 0" column in one step.
# NOTE(review): "Unnamed: 0" is presumably a row index written out when the
# CSV was saved - confirm against the file.
temps = pd.read_csv("temperatures.csv").drop("Unnamed: 0", axis=1)

In [None]:
#headers = ["Ru", "Rb", "Rh", "Be", "Ba", "Bi", "Br", "H", "P", "Ge", "Gd", "Ga", "Pr", "Pu", "C",\
#"Pd", "Cd", "Ho", "Mg", "Mo", "Mn", "O", "S", "Eu", "Zr", "Er", "Ni",\
#"Na", "Nb", "Nd", "Fe", "B", "F", "Sr", "N", "Si", "Sn", "Sm", "V", "Sc", "Sb", "Se", "Co",\
#"Cl", "Ca", "Ce", "Xe", "Cs", "Cr", "Cu", "La", "Li", "Tm", "Ti", "Te", "Tb", "Tc", "Yb", "Dy",\
#"I", "Y", "Ag", "Al", "As", "In"]

# Element symbols selected as columns when the formulae table is read.
# They are listed in order of increasing atomic number; splitting one
# whitespace-separated string yields the same list of symbols without
# per-item quoting or line-continuation backslashes.
headers = (
    "H Li Be B C N O F Na Mg Al Si P S Cl "
    "K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga "
    "Ge As Se Br Rb Sr Y Zr Nb Mo "
    "Tc Ru Rh Pd Ag Cd In Sn "
    "Sb Te I Xe Cs Ba La Ce Pr "
    "Nd Sm Eu Gd Tb Dy Ho Er "
    "Tm Yb Lu Hf Ta W Re Os Ir "
    "Pt Au Hg Tl Pb Bi Ac Th Pa "
    "U Np Am"
).split()  # hofmann

#headers = ["Ru", "Rb", "Rh", "Be", "Ba", "Bi", "Br", "H", "P", "Ge", "Gd", "Ga", "Pr", "Pu", "C",\
#"Pd", "Cd", "Ho", "Mg", "Mo", "Mn", "O", "S", "Eu", "Zr", "Er", "Ni",\
#"Na", "Nb", "Nd", "Fe", "B", "F", "Sr", "N", "Si", "Sn", "Sm", "V", "Sc", "Sb", "Se", "Co",\
#"Cl", "Ca", "Ce", "Xe", "Cs", "Cr", "Cu"]


# Read only the element columns named in `headers`. Because `usecols`
# restricts the load to those names, an "Unnamed: 0" column is never read,
# so no follow-up drop is needed.
FORMULAE_CSV = "scaledformulae.csv"
unitformula = pd.read_csv(FORMULAE_CSV, usecols=headers)

### Setting models

In [None]:
from sklearn import linear_model, svm, kernel_ridge, ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test set using train_test_split's default proportions.
# The seed is fixed so that restarting the kernel and re-running the notebook
# yields the same train/test rows (and therefore comparable scores); the
# value itself is arbitrary.
X_train, X_test, y_train, y_test = train_test_split(
    unitformula, vols, random_state=24
)

In [None]:
#reg = ensemble.ExtraTreesRegressor(n_estimators=100, n_jobs=-1)
#reg.fit(X_train, y_train)

In [None]:
# Fit every candidate regressor on the shared train/test split and compare
# them on the held-out rows. Each model is given the same fixed seed so a
# re-run reproduces the printed scores and histograms.
MODEL_SEED = 24  # arbitrary; any fixed integer gives reproducible runs
estimator = [
    linear_model.RANSACRegressor(linear_model.LinearRegression(), random_state=MODEL_SEED),
    linear_model.RANSACRegressor(linear_model.Ridge(alpha=100), random_state=MODEL_SEED),
    RandomForestRegressor(random_state=MODEL_SEED),
    ensemble.GradientBoostingRegressor(random_state=MODEL_SEED),
    ensemble.ExtraTreesRegressor(random_state=MODEL_SEED),
]
for reg in estimator:
    reg.fit(X_train, y_train)
    pdv = reg.predict(X_test)

    # Single-argument print(...) calls behave identically on Python 2 and 3,
    # unlike the bare print statement, which is a SyntaxError on Python 3.
    print("Using: {}".format(reg))
    # Estimator's own score on the held-out rows (R^2 for scikit-learn regressors).
    print(reg.score(X_test, y_test))

    pdvd = pd.DataFrame(pdv, columns=["Pred Vol"])
    # y_test keeps the shuffled row labels from the split; resetting the index
    # makes the division below pair predictions and targets by position.
    ytest = pd.DataFrame(y_test)
    ytest = ytest.reset_index(drop=True)

    # Predicted / actual volume per test row; values near 1 are good predictions.
    ratio = pdvd["Pred Vol"].div(ytest["Volume"])
    ax = ratio.hist(bins=50)
    ax.set_title("{}: predicted / actual volume".format(type(reg).__name__))
    ax.set_xlabel("Pred Vol / Volume")
    ax.set_ylabel("Count")
    print(ratio.describe())
    plt.show()

Here is the same loop, extended with two additional modelling methods: gradient boosting and extra trees.
Extra trees works in the same way as a random forest, but the splits chosen when branching each tree are more randomized, which makes it a bit more efficient to train and more resistant to overfitting.

Gradient boosting works by building many small trees (weak predictors) in sequence, where each subsequent tree is fitted to reduce the error left by the previous ones. The final strong predictor is a weighted combination of all of the weak predictors.