### Data Preparation

In [11]:
#import programmes I need
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [15]:
training_data = pd.read_csv('data_for_training.csv')

def reindex_data(data):
    data.dropna(inplace = True)
    data["labels"] = data["Leg"].astype(str) + "-" + data["Site"].astype(str) + data["H"].astype(str) \
       + "-" + [f"{core:03d}" for core in data["Cor"]]+ data["T"].astype(str) + "-" + data["Sc"].astype(str) + "-" + (data["Top(cm)"]/100).astype(str)
    data = data.groupby(data["labels"],as_index=True).mean() #(group repeated indexes by averagoing them))
    data.drop(['Leg', 'Site', 'Cor', 'Top(cm)'], inplace=True, axis=1)
    return(data)

reindex_training_data = reindex_data(training_data) 



In [16]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(reindex_training_data, test_size=0.2)

final_train_data = train_data.drop(['CaCO3 (wt %)'], axis = 1)
final_train_labels = train_data['CaCO3 (wt %)'].copy()

final_test_data = test_data.drop(['CaCO3 (wt %)'], axis=1) # drop labels for training set
final_test_labels = test_data['CaCO3 (wt %)'].copy()

In [17]:
depth_ix, Corr_ix, L_ix, a_ix, b_ix = [
    list(final_train_data.columns).index(col)
    for col in ("Depth (mbsf)",  "Corr. Counts", "L*", "a*", "b*")]

In [21]:
def add_polar_colors(data):
    color_angle = np.arctan(data[:,b_ix] / (data[:,a_ix]+1E-5))
    color_distance = data[:,a_ix]/np.cos(color_angle)
    return np.c_[data, color_angle, color_distance]

In [26]:
#we now want to create a new pipeline to use add_polar coordinates to any dataframe
from sklearn.preprocessing import FunctionTransformer

attr_adder = FunctionTransformer(add_polar_colors)

# Now let's create a new, final dataframe called 'data_extra_attribs':
data_extra_attribs = attr_adder.fit_transform(final_train_data.values)

# Now let's make a new dataframe to print the data nicely

data_extra_attribs = pd.DataFrame(
    data_extra_attribs,
    columns=list(final_train_data.columns)+["color_angle", "color_distance"],
    index=final_train_data.index)
data_extra_attribs

Unnamed: 0_level_0,Depth (mbsf),Corr. Counts,Density (g/cc),L*,a*,b*,color_angle,color_distance
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
194-1193A-047X-1-0.75,253.85,16.29,1.671,53.91,0.03,10.21,1.567857,10.206642
194-1195B-045X-5-0.64,426.14,16.69,2.012,53.79,-1.27,8.02,-1.413747,-8.119995
194-1195A-002H-5-0.64,11.34,26.09,1.687,57.66,0.04,13.36,1.567802,13.356721
194-1193A-056X-CC-0.17,325.37,14.49,1.936,55.95,0.13,9.99,1.557783,9.990077
194-1193A-041X-CC-0.02,222.54,7.49,1.953,54.95,-0.53,4.64,-1.457067,-4.670258
...,...,...,...,...,...,...,...,...
194-1192A-007H-5-0.74,55.24,9.49,1.685,53.34,-0.30,11.50,-1.544716,-11.504296
194-1197A-004H-4-0.62,30.22,6.89,1.627,36.27,0.36,5.66,1.507276,5.671280
194-1193A-044X-3-0.1,237.10,14.89,1.824,46.67,-1.34,4.30,-1.268708,-4.503984
194-1194A-006H-4-0.7,47.90,14.09,1.677,50.94,-0.70,7.39,-1.476357,-7.423184


In [27]:
from sklearn.preprocessing import FunctionTransformer

attr_adder = FunctionTransformer(add_polar_colors)

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler #Normailsation scaling

prep_pipeline = Pipeline([
        ('attribs_adder', FunctionTransformer(add_polar_colors)),
        ('minmax_scaler', MinMaxScaler())
    ])

training_prep = prep_pipeline.fit_transform(final_train_data.values)
training_prep


array([[0.14705539, 0.17174609, 0.76234683, ..., 0.35231194, 0.99906415,
        0.55754131],
       [0.24773559, 0.17848123, 0.88897141, ..., 0.27674258, 0.04937922,
        0.14119221],
       [0.00534109, 0.33675703, 0.76828815, ..., 0.46100759, 0.99904648,
        0.62910558],
       ...,
       [0.13726728, 0.14817309, 0.81916079, ..., 0.14837819, 0.09557635,
        0.22334163],
       [0.02670547, 0.13470281, 0.76457482, ..., 0.25500345, 0.02943722,
        0.15702252],
       [0.09632084, 0.34685974, 0.77720015, ..., 0.11594203, 0.12751832,
        0.24270483]])

In [29]:
test_prep = prep_pipeline.fit_transform(final_test_data.values)
test_prep

array([[1.32473801e-01, 6.61753647e-02, 8.41011007e-01, 8.32227054e-01,
        9.38582677e-01, 6.96493982e-01, 9.29980457e-01, 8.26309192e-01],
       [9.48949406e-01, 6.02947812e-01, 9.09906237e-01, 1.13984894e-01,
        4.48818898e-01, 5.23286238e-04, 3.82126740e-01, 4.10049877e-01],
       [9.27697363e-02, 1.23326816e-01, 8.46718304e-01, 5.76791028e-01,
        3.44881890e-01, 3.76766091e-01, 3.30459330e-02, 1.94844483e-01],
       [1.62090761e-01, 6.91833358e-02, 6.58377497e-01, 8.33142596e-02,
        4.99212598e-01, 1.42857143e-01, 9.78164756e-01, 4.97883397e-01],
       [9.57823950e-01, 8.48097458e-01, 9.51080310e-01, 0.00000000e+00,
        4.31496063e-01, 5.80847724e-02, 6.39873737e-02, 3.78854426e-01],
       [9.53705927e-02, 9.92630471e-02, 8.63024868e-01, 7.96520943e-01,
        4.42519685e-01, 4.86656201e-01, 4.19696036e-03, 1.32816421e-01],
       [3.36061108e-02, 1.29342758e-01, 8.78108439e-01, 7.89425498e-01,
        3.05511811e-01, 4.54212454e-01, 3.62403707e-02, 1.

In [31]:
#we now need to save our data
np.savez('sess4_arr_2', training_prep=training_prep,test_prep=test_prep, train_labels=final_train_labels.values,test_labels=final_test_labels.values)


# the 'to_pickle' method on the dataframe takes a full path name for the file. Here we will save it in the same folder, under the name 'sess4_df' with
# a pickle extension ('.pkl')
data_extra_attribs.to_pickle('sess4_df_2.pkl')