### Import libraries and data

In [2]:
import pandas as pd
import csv
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the volumes table and discard its "Unnamed: 0" column
# (presumably a row index written out when the CSV was saved -- TODO confirm).
vols = pd.read_csv("volumes300.csv").drop("Unnamed: 0", axis=1)

In [4]:
# Load the temperatures table and discard its "Unnamed: 0" column
# (presumably a row index written out when the CSV was saved -- TODO confirm).
temps = pd.read_csv("temperatures300.csv").drop("Unnamed: 0", axis=1)

In [5]:
# Element-symbol columns to read from the scaled-formula table.
# Two earlier selections are kept (commented out) for reference; only the
# list assigned to `headers` below is active.

# Earlier selection 1:
#headers = ["Ru", "Rb", "Rh", "Be", "Ba", "Bi", "Br", "H", "P", "Ge", "Gd", "Ga", "Pr", "Pu", "C",\
#"Pd", "Cd", "Ho", "Mg", "Mo", "Mn", "O", "S", "Eu", "Zr", "Er", "Ni",\
#"Na", "Nb", "Nd", "Fe", "B", "F", "Sr", "N", "Si", "Sn", "Sm", "V", "Sc", "Sb", "Se", "Co",\
#"Cl", "Ca", "Ce", "Xe", "Cs", "Cr", "Cu", "La", "Li", "Tm", "Ti", "Te", "Tb", "Tc", "Yb", "Dy",\
#"I", "Y", "Ag", "Al", "As", "In"]

# Active selection, laid out one periodic-table period per group of rows.
# NOTE(review): the "hofmann" tag presumably means this is the element set
# covered by the Hofmann volumes -- confirm.
headers = [
    "H",
    "Li", "Be", "B", "C", "N", "O", "F",
    "Na", "Mg", "Al", "Si", "P", "S", "Cl",
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br",
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
    "In", "Sn", "Sb", "Te", "I", "Xe",
    "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho",
    "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au",
    "Hg", "Tl", "Pb", "Bi",
    "Ac", "Th", "Pa", "U", "Np", "Am",
]  # hofmann

# Earlier selection 2:
#headers = ["Ru", "Rb", "Rh", "Be", "Ba", "Bi", "Br", "H", "P", "Ge", "Gd", "Ga", "Pr", "Pu", "C",\
#"Pd", "Cd", "Ho", "Mg", "Mo", "Mn", "O", "S", "Eu", "Zr", "Er", "Ni",\
#"Na", "Nb", "Nd", "Fe", "B", "F", "Sr", "N", "Si", "Sn", "Sm", "V", "Sc", "Sb", "Se", "Co",\
#"Cl", "Ca", "Ce", "Xe", "Cs", "Cr", "Cu"]


# usecols restricts the read to the element columns named in `headers`, so
# the "Unnamed: 0" drop used for the other tables stays commented out here.
unitformula = pd.read_csv("scaledformulae300.csv", usecols=headers)
#unitformula = unitformula.drop("Unnamed: 0", axis=1)

As seen above, the csv files being used have changed: the data has been further filtered to remove things such as disordered structures, structures with an R factor of over 10%, polymeric structures, etc.

This brings the total number of entries being used down from around 500,000 to 300,000.

### Setting models

In [6]:
from sklearn import linear_model, svm, kernel_ridge, ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

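`cross_val_score` is a scoring method in which, similar to `train_test_split`, part of the data is sectioned off for testing. In cross-validation the dataset is divided into several folds; each fold is used once as the test set while the model is trained on the remaining folds, so the fit-and-score step is repeated multiple times and one score is returned per fold.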

In [7]:
# Read the two Hofmann tables: hoffX is used as model input and hoffy as the
# target in the cells further down.
hoffX, hoffy = map(pd.read_csv, ("hofmanndataframe.csv", "hofmannvols.csv"))

The Hofmann values have been collated as a csv file, with the goal of calculating predicted volumes and scoring them in a different way from before.

In [8]:
# Hold-out split of the main data set; the fixed seed keeps the partition
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(unitformula, vols, random_state=24)

# Seed the Hofmann split too: without random_state a different partition is
# drawn on every run, so results computed from it cannot be reproduced.
hoffX_train, hoffX_test, hoffy_train, hoffy_test = train_test_split(
    hoffX, hoffy, random_state=24
)

### Main algorithm

In [None]:
# Candidate regressors to compare by cross-validation.
estim = [
    linear_model.RANSACRegressor(linear_model.LinearRegression()),
    linear_model.RANSACRegressor(linear_model.Ridge(alpha=100)),
    RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    ensemble.ExtraTreesRegressor(n_estimators=100, n_jobs=-1),
]
# TODO: try huber too

for reg in estim:
    # print is called with one pre-joined string so the cell parses and prints
    # the same text under both Python 2 and Python 3 (the former print
    # statement is a SyntaxError on Python 3).
    print(" ".join(["\nUsing: ", str(reg)]))
    # Cross-validate on the full data set; `scores` holds one value per fold.
    scores = cross_val_score(reg, unitformula, vols)
    print(" ".join(["\nMean: ", str(scores.mean()), "STD: ", str(scores.std())]))

This loop does not use train_test_split (TTS); instead the whole dataset is passed in and the scores are cross-validated. This is repeated for each regressor, and the average score and standard deviation are output.

In [None]:
# Regressors evaluated on the single hold-out split.
estimator = [
    linear_model.RANSACRegressor(linear_model.LinearRegression()),
    linear_model.RANSACRegressor(linear_model.Ridge(alpha=100)),
    RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    ensemble.ExtraTreesRegressor(),
]

for reg in estimator:
    # Uses the X_train/X_test/y_train/y_test split created earlier
    # (train_test_split with random_state=24), so it is not re-split here.
    reg.fit(X_train, y_train)
    pdv = reg.predict(X_test)

    # Single-argument print calls parse and behave the same on Python 2 and 3.
    print(" ".join(["Using: ", str(reg)]))
    print(reg.score(X_test, y_test))

    pdvd = pd.DataFrame(pdv, columns=["Pred Vol"])
    # Re-number the true values 0..n-1 so they line up row-for-row with the
    # predictions; Series.div pairs values by index label.
    ytest = pd.DataFrame(y_test).reset_index(drop=True)

    # Predicted / actual volume, computed once and reused for plot and summary.
    vol_ratio = pdvd["Pred Vol"].div(ytest["Volume"])
    vol_ratio.hist(bins=50)
    print(vol_ratio.describe())
    plt.show()

This loop is the same as the ones seen previously.

### Misc

In [None]:
# Extra-trees regressor for the Hofmann data.
# NOTE(review): the model is fitted on the *_test split, which is also the
# split it is scored on in the following cell, so that score is not an
# out-of-sample estimate.
reg2 = ensemble.ExtraTreesRegressor(n_jobs=-1, n_estimators=100)
reg2.fit(X=hoffX_test, y=hoffy_test)

In [None]:
# Report the model's score on the Hofmann test split, then keep its
# predictions for the ratio analysis below.
# print(...) with a single argument behaves the same on Python 2 and 3; the
# former print statement is a SyntaxError on Python 3.
print(reg2.score(hoffX_test, hoffy_test))
pdhv = reg2.predict(hoffX_test)

In [None]:
# Wrap the true Hofmann values and the predictions as DataFrames so they can
# be compared column against column.
hoffytest = pd.DataFrame(hoffy_test)
pdhvd = pd.DataFrame(data=pdhv, columns=["Pred Vol"])

In [None]:
# reset_index returns a new frame instead of modifying hoffytest in place, so
# the result must be assigned back. Otherwise hoffytest keeps the shuffled row
# labels from train_test_split and does not line up with the 0..n-1 index of
# the predictions frame when the two are divided.
hoffytest = hoffytest.reset_index(drop=True)
hoffytest.head()

In [None]:
# Predicted / Hofmann volume, computed once and reused for plot and summary.
# NOTE: Series.div pairs values by index label, so pdhvd and hoffytest must
# share the same index for each prediction to meet its own true value.
hoff_ratio = pdhvd["Pred Vol"].div(hoffytest["HofmannVols"])
hoff_ratio.hist(bins=50)
# Single-argument print(...) behaves the same on Python 2 and 3.
print(hoff_ratio.describe())
plt.show()

This was an attempt at fitting, predicting and scoring the Hofmann values using train_test_split (TTS). This was an incorrect way to carry out the experiment: the Hofmann values are already known, and scoring the model against the same values it was fitted on would yield a score of 1. This needed to be changed to use unitformula & vols.