In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from wine_analysis_hplc_uv import definitions

idx = pd.IndexSlice

In [None]:
# Load the processed dataset from the parquet path exposed by the project's
# `definitions` module.
data = pd.read_parquet(definitions.RW_CUP_450_PROCESSED)
# Bare expression: as the last statement of a notebook cell this shows the
# loaded frame as the cell output.
data

In [None]:
def classify_sample(
    data: pd.DataFrame, *, test_size=None, random_state=None
) -> None:
    """Fit an XGBoost classifier on the aligned signals and print its accuracy.

    The "aligned" and "ref" columns of ``data`` are selected, the "wine" and
    "state" column levels are dropped, and the frame is transposed so that
    every signal becomes one row. The remaining row label (the sample code,
    per the original author's note) is used as the class label.

    Args:
        data: Frame whose columns form a three-level MultiIndex that includes
            levels named "wine" and "state"; the third level must contain the
            values "aligned" and/or "ref".
        test_size: Forwarded to ``train_test_split``. ``None`` keeps the
            scikit-learn default split.
        random_state: Forwarded to ``train_test_split``. Pass an int to make
            the shuffle, and therefore the reported score, reproducible.

    Returns:
        None. The label mapping and the encoded train/test labels are shown
        with ``display`` and the accuracy score is printed.
    """
    # Reduce the dataset to the aligned/ref signals, one signal per row.
    signals = (
        data.loc[:, idx[:, :, ["aligned", "ref"]]]
        .droplevel(["wine", "state"], axis=1)
        .reset_index(drop=True)
        .T
    )

    # Alphabetic sample codes are swapped for arbitrary integer placeholders so
    # that the whole index can be cast to int below.
    # NOTE(review): presumably every other sample code is already numeric --
    # otherwise the astype(int) call raises; confirm against the dataset.
    code_dict = {
        "crawford-cab": 1111,
        "mt-diff-bannockburn-pn": 2222,
        "st hugo gsm": 3333,
        "stoney-rise-pn": 4444,
        "torbreck-struie": 5555,
    }
    display(code_dict)

    signals.index = signals.index.to_series().replace(code_dict).astype(int)

    # Encode the labels ONCE, before splitting. Fitting a separate encoder on
    # each subset (as was done previously) assigns different integers to the
    # same class whenever the subsets do not contain identical class sets,
    # which silently corrupts the accuracy score.
    # NOTE(review): if the random split leaves a class out of the training
    # rows, XGBoost is expected to reject the non-contiguous labels; consider
    # a stratified split once the per-class sample counts are known.
    y = LabelEncoder().fit_transform(signals.index.values)
    X = signals.values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    display(y_train, y_test)

    # XGBoost's scikit-learn style estimator; see
    # https://xgboost.readthedocs.io/en/stable/python/sklearn_estimator.html
    clf = xgb.XGBClassifier(tree_method="hist")

    # eval_set makes the booster compute validation metrics on the held-out
    # rows while training. No early-stopping setting is passed here, so the
    # test set does not influence when training stops.
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    y_pred = clf.predict(X_test)

    print(f"XGBoost model accuracy score: {accuracy_score(y_test, y_pred):0.4f}")


# Run the classification on the dataset loaded in the earlier cell.
classify_sample(data)