### Import libraries and data

In [None]:
import csv
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import spatial
from scipy.spatial import distance

In [None]:
# Read the volumes table and discard its "Unnamed: 0" column
# (presumably a row index written out by an earlier to_csv — verify).
vols = pd.read_csv("volumes300.csv").drop("Unnamed: 0", axis=1)

In [None]:
# Read the temperatures table and discard its "Unnamed: 0" column
# (presumably a row index written out by an earlier to_csv — verify).
temps = pd.read_csv("temperatures300.csv").drop("Unnamed: 0", axis=1)

In [None]:
# Read the scaled unit-formula features and discard the "Unnamed: 0" column
# (presumably a row index written out by an earlier to_csv — verify).
unitformula = pd.read_csv("scaledformulae300.csv").drop("Unnamed: 0", axis=1)

In [None]:
# Hofmann reference data: features (hoffX) and the matching volumes (hoffy).
# Both files carry an "Unnamed: 0" column that is dropped on load.
hoffX = pd.read_csv("hofmanndataframe.csv").drop("Unnamed: 0", axis=1)
hoffy = pd.read_csv("hofmannvols.csv").drop("Unnamed: 0", axis=1)

### Setting models

In [None]:
from sklearn import linear_model, svm, kernel_ridge, ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from bokeh.plotting import figure, show, output_file

In [None]:
# Split features/targets into train and test parts with a fixed seed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    unitformula,
    vols,
    random_state=24
)

### Comparison methods functions

In [None]:
def getcosine(estimator, X, y):
    """Score an estimator by the cosine similarity of its predictions to ``y``.

    The ``(estimator, X, y)`` signature lets this be passed as ``scoring=``
    to ``cross_val_score``.

    Parameters
    ----------
    estimator : fitted object exposing ``predict(X)``.
    X : features handed to ``estimator.predict``.
    y : true values to compare the predictions with.

    Returns
    -------
    float
        ``1 - cosine distance`` between predictions and ``y``.
    """
    # SciPy's distance functions are defined for 1-D vectors; flatten so that
    # column-shaped (n, 1) predictions and DataFrame targets are handled too.
    pred = np.ravel(estimator.predict(X))
    actual = np.ravel(y)
    return 1 - distance.cosine(pred, actual)
def getbray(estimator, X, y):
    """Score an estimator by Bray-Curtis similarity of its predictions to ``y``.

    The ``(estimator, X, y)`` signature lets this be passed as ``scoring=``
    to ``cross_val_score``.

    Parameters
    ----------
    estimator : fitted object exposing ``predict(X)``.
    X : features handed to ``estimator.predict``.
    y : true values to compare the predictions with.

    Returns
    -------
    float
        ``1 - Bray-Curtis distance`` between predictions and ``y``.
    """
    # SciPy's distance functions are defined for 1-D vectors; flatten so that
    # column-shaped (n, 1) predictions and DataFrame targets are handled too.
    pred = np.ravel(estimator.predict(X))
    actual = np.ravel(y)
    return 1 - distance.braycurtis(pred, actual)
def getcanb(estimator, X, y):
    """Score an estimator by the Canberra distance of its predictions to ``y``.

    The ``(estimator, X, y)`` signature lets this be passed as ``scoring=``
    to ``cross_val_score``. Unlike ``getcosine``/``getbray`` the raw distance
    is returned (not ``1 - distance``), so smaller values mean closer
    agreement.

    Parameters
    ----------
    estimator : fitted object exposing ``predict(X)``.
    X : features handed to ``estimator.predict``.
    y : true values to compare the predictions with.

    Returns
    -------
    float
        Canberra distance between predictions and ``y``.
    """
    # SciPy's distance functions are defined for 1-D vectors; flatten so that
    # column-shaped (n, 1) predictions and DataFrame targets are handled too.
    pred = np.ravel(estimator.predict(X))
    actual = np.ravel(y)
    return distance.canberra(pred, actual)
def getcorr(estimator, X, y):
    """Score an estimator by the correlation distance of its predictions to ``y``.

    The ``(estimator, X, y)`` signature lets this be passed as ``scoring=``
    to ``cross_val_score``. The raw SciPy correlation distance is returned
    (not ``1 - distance``), so smaller values mean closer agreement.

    Parameters
    ----------
    estimator : fitted object exposing ``predict(X)``.
    X : features handed to ``estimator.predict``.
    y : true values to compare the predictions with.

    Returns
    -------
    float
        Correlation distance between predictions and ``y``.
    """
    # SciPy's distance functions are defined for 1-D vectors; flatten so that
    # column-shaped (n, 1) predictions and DataFrame targets are handled too.
    pred = np.ravel(estimator.predict(X))
    actual = np.ravel(y)
    return distance.correlation(pred, actual)

The comparison methods seen previously have been cleaned up and made into separate functions so they can be called from each loop below.

### Calculations

In [None]:
# Fit an intercept-free linear model on the Hofmann data, then use it to
# predict a volume for every row of `unitformula`.
reg2 = linear_model.LinearRegression(fit_intercept=False).fit(hoffX, hoffy)
hoffpred = reg2.predict(unitformula)

The previous mistake has been resolved; now the Hofmann values are read and fit to a simple linear model so that each element is given its predetermined volume value.
A prediction of the volumes is then made by calling the fitted model's `predict` function on the unit formulas.

In [None]:
# Histogram of the Hofmann-model predictions, with the x-axis limited to
# the 0-30000 range.
hoffpredDF = pd.DataFrame(hoffpred)
hoffpredDF.hist(bins=1000)
axes = plt.gca()
axes.set_xlim(0, 30000)
plt.xlabel("Volume")
plt.ylabel("Number of entries")
# NOTE: no explicit title is set here (pyplot offers plt.title(), not
# plt.set_title()).
plt.show()

The predictions are then plotted as a histogram to show how the predicted volumes are distributed, for example which range of volumes is predicted most often and where the maximum lies.

In [None]:
# Default estimator score of the Hofmann model against the actual volumes.
# Parenthesised single-argument print is valid on both Python 2 and Python 3.
print(reg2.score(unitformula, vols))

The Hofmann prediction is scored against the actual volumes and this result is seen to be poor.

In [None]:
# 5-fold cross-validated score of the linear model on the Hofmann data itself.
hoffCV = cross_val_score(reg2, hoffX, hoffy, cv=5)
# One pre-formatted string is passed to print() so the line is valid on both
# Python 2 and Python 3 and prints "<mean> <std>" on either.
print("%s %s" % (hoffCV.mean(), hoffCV.std()))

Here the model is scored again, this time using 5-fold cross-validation on the Hofmann data (`hoffX`, `hoffy`).

In [None]:
# Cross-validate the Hofmann linear model (5 folds) against the actual
# volumes, once per custom comparison scorer.
hoffcvcosine = cross_val_score(reg2, unitformula, vols, cv=5, scoring=getcosine)
hoffcvbray = cross_val_score(reg2, unitformula, vols, cv=5, scoring=getbray)
hoffcvcanb = cross_val_score(reg2, unitformula, vols, cv=5, scoring=getcanb)
hoffcvcorr = cross_val_score(reg2, unitformula, vols, cv=5, scoring=getcorr)

# For each scorer: mean, standard deviation, then the per-fold scores.
# A single pre-formatted string is passed to print() so the statements are
# valid on both Python 2 and Python 3 and emit the same text on either.
print("%s %s %s" % (hoffcvcosine.mean(), hoffcvcosine.std(), hoffcvcosine))
print("\n%s %s %s" % (hoffcvbray.mean(), hoffcvbray.std(), hoffcvbray))
print("\n%s %s %s" % (hoffcvcanb.mean(), hoffcvcanb.std(), hoffcvcanb))
print("\n%s %s %s" % (hoffcvcorr.mean(), hoffcvcorr.std(), hoffcvcorr))

The Hofmann predictions are compared with the actual volumes using each of the functions above; every metric is evaluated with 5-fold cross-validation, and the mean, the standard deviation, and the per-fold scores are printed.

In [None]:
# Huber regressor evaluated with the same four comparison scorers, 5 folds each.
regHub = linear_model.HuberRegressor(alpha=0.0005)
hubcosine = cross_val_score(regHub, unitformula, vols, cv=5, scoring=getcosine)
hubbray = cross_val_score(regHub, unitformula, vols, cv=5, scoring=getbray)
hubcanb = cross_val_score(regHub, unitformula, vols, cv=5, scoring=getcanb)
hubcorr = cross_val_score(regHub, unitformula, vols, cv=5, scoring=getcorr)

# Each line reports mean, standard deviation and the per-fold scores.
# A single pre-formatted string is passed to print() so the statements are
# valid on both Python 2 and Python 3; the doubled spaces reproduce the
# spacing the original print statements emitted.
# NOTE: getcanb and getcorr return distances, although the Canberra line is
# labelled "Similarity".
print("\nCosine mean:  %s Standard deviation:  %s %s" % (hubcosine.mean(), hubcosine.std(), hubcosine))
print("\nBray Similarity:  %s Standard deviation:  %s %s" % (hubbray.mean(), hubbray.std(), hubbray))
print("\n Canberra Similarity:  %s Standard deviation:  %s %s" % (hubcanb.mean(), hubcanb.std(), hubcanb))
print("\n Correlation:  %s Standard deviation:  %s %s" % (hubcorr.mean(), hubcorr.std(), hubcorr))

Here the same evaluation is applied to the Huber regressor.

In [None]:
# Estimators to compare: two RANSAC-wrapped linear models and three tree
# ensembles.
estim = [
    linear_model.RANSACRegressor(linear_model.LinearRegression()),
    linear_model.RANSACRegressor(linear_model.Ridge(alpha=100)),
    RandomForestRegressor(n_estimators=100, n_jobs=-1),
    ensemble.GradientBoostingRegressor(n_estimators=100),
    ensemble.ExtraTreesRegressor(n_estimators=100, n_jobs=-1),
]

for reg in estim:
    # NOTE(review): this first call uses cross_val_score's default scorer and
    # default cv, whereas the comparison metrics below pin cv=5 — confirm
    # that the difference is intended.
    cvscore = cross_val_score(reg, unitformula, vols)
    cvcosine = cross_val_score(reg, unitformula, vols, cv=5, scoring=getcosine)
    cvbray = cross_val_score(reg, unitformula, vols, cv=5, scoring=getbray)
    cvcanb = cross_val_score(reg, unitformula, vols, cv=5, scoring=getcanb)
    cvcorr = cross_val_score(reg, unitformula, vols, cv=5, scoring=getcorr)

    # A single pre-formatted string is passed to print() so the statements are
    # valid on both Python 2 and Python 3; the doubled spaces reproduce the
    # spacing the original print statements emitted.
    print("\nMy Mean:  %s STD:  %s" % (cvscore.mean(), cvscore.std()))
    print("\nCosine mean:  %s Standard deviation:  %s %s" % (cvcosine.mean(), cvcosine.std(), cvcosine))
    print("\nBray Similarity:  %s Standard deviation:  %s %s" % (cvbray.mean(), cvbray.std(), cvbray))
    print("\n Canberra Similarity:  %s Standard deviation:  %s %s" % (cvcanb.mean(), cvcanb.std(), cvcanb))
    print("\n Correlation:  %s Standard deviation:  %s %s" % (cvcorr.mean(), cvcorr.std(), cvcorr))

This is a loop to perform the same calculations but over a range of estimators.

### Graphs

In [None]:
# Estimators to compare: two RANSAC-wrapped linear models and three tree
# ensembles.
estim = [
    linear_model.RANSACRegressor(linear_model.LinearRegression()),
    linear_model.RANSACRegressor(linear_model.Ridge(alpha=100)),
    RandomForestRegressor(n_estimators=100, n_jobs=-1),
    ensemble.GradientBoostingRegressor(n_estimators=100),
    ensemble.ExtraTreesRegressor(n_estimators=100, n_jobs=-1),
]

for reg in estim:
    # Single-string print() is valid on both Python 2 and Python 3; the
    # doubled space reproduces what the original print statement emitted.
    print("\nUsing:  %s" % (reg,))

    # Only the out-of-fold predictions are needed for the plot; the extra
    # cross_val_score run whose result was never read has been removed.
    pred = cross_val_predict(reg, unitformula, vols)
    predDF = pd.DataFrame(pred, columns=["Pred Vol"])

    # Ratio of predicted to actual volume, computed once and reused for both
    # the histogram and the summary statistics.
    ratio = predDF["Pred Vol"].div(vols["Volume"])
    ratio.hist(bins=50)
    print(ratio.describe())
    plt.show()

This loop creates, for each estimator, a histogram of the ratio between its cross-validated predictions and the actual volumes in the CSD.

In [None]:
# Ratio of the Hofmann-predicted volume to the actual volume, per entry.
hoffDF = pd.DataFrame(hoffpred, columns=["Hoff Vol"])
hoff_ratio = hoffDF["Hoff Vol"].div(vols["Volume"])
hoff_ratio.hist(bins=50)
# Parenthesised single-argument print is valid on both Python 2 and Python 3.
print(hoff_ratio.describe())

# Title and axis labels must be set BEFORE plt.show(); setting them afterwards
# applies them to a new, empty figure instead of the histogram.
plt.title("Hoffman & CSD Volumes")
plt.xlabel("Volume ratio")
plt.ylabel("Count")
plt.show()
# Hofmann vs. original (CSD) volumes; seaborn plots were problematic here.

This is the same idea - creating graphs but this is for the ratio between the Hofmann predicted volumes & actual volumes from CSD.

In [None]:
# Estimators to compare: two RANSAC-wrapped linear models and three tree
# ensembles.
estim = [
    linear_model.RANSACRegressor(linear_model.LinearRegression()),
    linear_model.RANSACRegressor(linear_model.Ridge(alpha=100)),
    RandomForestRegressor(n_estimators=100, n_jobs=-1),
    ensemble.GradientBoostingRegressor(n_estimators=100),
    ensemble.ExtraTreesRegressor(n_estimators=100, n_jobs=-1),
]

for reg in estim:
    # Single-string print() is valid on both Python 2 and Python 3; the
    # doubled space reproduces what the original print statement emitted.
    print("\nUsing:  %s" % (reg,))
    pred = cross_val_predict(reg, unitformula, vols)
    predDF = pd.DataFrame(pred, columns=["Pred Vol"])

    # Ratio of the Hofmann prediction to this estimator's prediction,
    # computed once and reused for the histogram and the summary.
    ratio = hoffDF["Hoff Vol"].div(predDF["Pred Vol"])
    ratio.hist(bins=50)
    print(ratio.describe())

    # The title has to be set before plt.show(); otherwise it lands on a new
    # figure and the first histogram is displayed without one.
    plt.title("Hoffman & Huseyin")
    plt.show()

Again a graph is created, this time for the ratio between the Hofmann-predicted volumes and the volumes predicted by each regressor. This is a separate loop so that the graphs are not confusing to view.