In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

import pandas as pd
import numpy as np
from wine_analysis_hplc_uv import definitions
import seaborn as sns
import matplotlib.pyplot as plt

from wine_analysis_hplc_uv import definitions

idx = pd.IndexSlice

In [None]:
# Read the processed dataset from the parquet location referenced by
# `definitions.RW_CUP_450_PROCESSED`, then show it as the cell output.
data = pd.read_parquet(path=definitions.RW_CUP_450_PROCESSED)
data

In [None]:
# Build the sample matrix `d` from `data`:
#   1. keep only the columns whose third column-level label is "aligned" or "ref";
#   2. drop the "wine" and "state" column levels;
#   3. discard the existing row index in favour of a default integer index;
#   4. transpose, so the remaining column labels become the row index
#      (later cells treat that index as the samplecode labels).
selected_signals = data.loc[:, idx[:, :, ["aligned", "ref"]]]
d = (
    selected_signals.droplevel(["wine", "state"], axis="columns")
    .reset_index(drop=True)
    .transpose()
)
display(d)

In [None]:
# Replace the alphabetic samplecode labels in `d`'s index with numeric codes.
# NOTE: the list is still named `float_codes`, but the codes are integers and
# the index is cast to int below.

code_dict = {
    "crawford-cab": 1111,
    "mt-diff-bannockburn-pn": 2222,
    "st hugo gsm": 3333,
    "stoney-rise-pn": 4444,
    "torbreck-struie": 5555,
}

# Parallel views of the mapping, kept under their original names.
alpha_samplecodes = list(code_dict.keys())
float_codes = list(code_dict.values())

display(code_dict)

# replace() leaves labels that are not in the mapping untouched, so every
# remaining label must already be convertible to int for astype(int) to succeed
# (presumably the other samplecodes are numeric -- confirm against the data).
relabelled_index = d.index.to_series().replace(code_dict).astype(int)
d.index = relabelled_index
d.index

In [None]:
# Split `d` into the two arrays used by the cells below:
# X holds the row values, y holds the matching index labels.
X, y = d.values, d.index.values
y

In [None]:
"""
sklearn.model_selection.train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)

train_test_split wraps `next(ShuffleSplit().split(X, y))` to produce validated, randomized and proportioned training and test sets from an input.

`arrays`: sequence of indexables with same length/`shape[0]`, i.e. rows. Can accept lists, np.arrays, scipy sparse matrices, pd.DataFrames.

`test_size`: `float` or `int`, default=`None`. Size of the test set. An integer is the absolute number of samples in the test set; a float between 0 and 1 is the proportion of the dataset. If set to `None`, `test_size` will be the remainder of the samples after processing `train_size`. If `train_size` is also `None`, it defaults to 0.25.

`train_size`: `float` or `int`, default=`None`. Size of the training set. An integer is the absolute number of samples in the training set; a float between 0 and 1 is the proportion of the dataset. If set to `None`, `train_size` will be the remainder of the samples after processing `test_size`. If `test_size` is also `None`, it will effectively be 0.75 (the complement of the default `test_size` of 0.25).

`random_state`: `int`, `RandomState instance`, `None`, default=`None`. Seed for the randomized shuffling. Providing a value will enable reproducible results.

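`shuffle`: `bool`, default=`True`. Whether to shuffle the data before splitting. If `shuffle=False`, then `stratify` must be `None`.

`stratify`: array-like, default=`None`. If not `None`, the data is split in a stratified fashion using this array as the class labels. A stratified split produces sets with the same proportionate representation of each class as the input data.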

Returns a list of length 2*len(arrays)

Source: [sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
"""

# Fix the shuffle seed so the train/test split -- and therefore every result
# computed from it below -- is reproducible across kernel restarts. Without a
# `random_state`, each run draws a different random split.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels, then overwrite
# y_train with its encoded form (LabelEncoder assigns 0..n_classes-1 in sorted
# label order).
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_train

In [None]:
# Encode the test labels with the encoder already fitted on the training
# labels (`le`, created in the previous cell).
#
# Fitting a second LabelEncoder on y_test would number the classes by the
# sorted labels present in the test split alone. Whenever the two splits do not
# contain exactly the same set of labels, the same integer would then denote
# different samplecodes in y_train and y_test, silently corrupting the
# evaluation metric and the accuracy score.
#
# transform() raises ValueError if y_test contains a label that was absent from
# the training split, which makes that situation visible instead of hiding it.
# NOTE: y_test is overwritten here, so re-run the split cell before re-running
# this cell.
y_test = le.transform(y_test)
y_test


In [None]:
# Gradient-boosted tree classifier using the histogram-based ("hist") tree
# construction method.
clf = xgb.XGBClassifier(tree_method="hist")

# NOTE: no `early_stopping_rounds` is configured, so early stopping is NOT
# active. The test split passed as `eval_set` is only scored each boosting
# round; it does not shorten training.
evaluation_sets = [(X_test, y_test)]
clf.fit(X_train, y_train, eval_set=evaluation_sets)

In [None]:
from sklearn.metrics import accuracy_score

# Predict the encoded class of every test row, then report the fraction of
# predictions that match y_test, formatted to four decimal places.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print('XGBoost model accuracy score: {0:0.4f}'.format(test_accuracy))