In [2]:
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import mean_squared_error, accuracy_score

import numpy as np
from tqdm import trange, tqdm
import flwr as fl
from flwr.common.typing import Parameters
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Tuple, Union
from flwr.common import NDArray, NDArrays
from matplotlib import pyplot as plt 

import torch
from torch.utils.data import DataLoader, Dataset, random_split

In [3]:
def construct_tree(
    # `dataset` is passed unchanged to `tree.fit`. The `Dataset` annotation is
    # kept from an earlier version in which this argument was a PyTorch Dataset.
    dataset: Dataset,
    label: NDArray,
    n_estimators,
    tree_type,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.8,
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    alpha=5,
    gamma=5,
    min_child_weight=1,
) -> Union[XGBClassifier, XGBRegressor]:
    """Build and fit an xgboost model on a tabular dataset.

    Args:
        dataset: Features, forwarded as-is to ``fit``.
        label: Targets, forwarded as-is to ``fit``. For ``"MULTICLASS"`` the
            number of distinct values in ``label`` is used as ``num_class``.
        n_estimators: Forwarded to the estimator constructor.
        tree_type: ``"MULTICLASS"`` builds an ``XGBClassifier`` with the
            ``multi:softprob`` objective; ``"REG"`` builds an ``XGBRegressor``
            with the ``reg:squarederror`` objective.
        learning_rate, max_depth, subsample, colsample_bylevel,
        colsample_bynode, colsample_bytree, alpha, gamma, min_child_weight:
            Hyperparameters forwarded to the estimator for both tree types.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``tree_type`` is neither ``"MULTICLASS"`` nor ``"REG"``.
    """
    # Reject unknown types up front; otherwise `tree` would never be bound and
    # the failure would surface as an UnboundLocalError at `tree.fit`.
    if tree_type not in ("MULTICLASS", "REG"):
        raise ValueError(
            f"Unsupported tree_type {tree_type!r}; expected 'MULTICLASS' or 'REG'."
        )

    # One shared set of hyperparameters so the regressor honours the caller's
    # arguments exactly like the classifier does. The defaults in the signature
    # equal the values the regressor previously had hard-coded.
    shared_params = dict(
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=n_estimators,
        subsample=subsample,
        colsample_bylevel=colsample_bylevel,
        colsample_bynode=colsample_bynode,
        colsample_bytree=colsample_bytree,
        alpha=alpha,
        gamma=gamma,
        num_parallel_tree=1,
        min_child_weight=min_child_weight,
    )

    if tree_type == "MULTICLASS":
        tree = XGBClassifier(
            objective="multi:softprob",
            num_class=len(np.unique(label)),  # Number of unique classes in the label
            **shared_params,
        )
    else:
        tree = XGBRegressor(objective="reg:squarederror", **shared_params)

    tree.fit(dataset, label)
    return tree

def construct_tree_from_loader(
    dataset_loader: DataLoader, n_estimators: int, tree_type: str
) -> Union[XGBClassifier, XGBRegressor]:
    """Construct an xgboost tree from a tabular dataset loader.

    The loader is iterated to exhaustion and only the LAST batch it yields is
    used for fitting; element ``0`` of that batch is taken as the features and
    element ``1`` as the labels.
    NOTE(review): presumably the loader is configured to yield the whole
    dataset as a single batch -- confirm against the caller.

    Args:
        dataset_loader: Iterable of batches indexable by ``0`` and ``1``.
        n_estimators: Forwarded to ``construct_tree``.
        tree_type: Forwarded to ``construct_tree``.

    Returns:
        The fitted model returned by ``construct_tree``.

    Raises:
        ValueError: If ``dataset_loader`` yields no batches.
    """
    last_batch = None
    for last_batch in dataset_loader:
        pass

    # Without this guard an empty loader would surface as an
    # UnboundLocalError on `data` / `label`.
    if last_batch is None:
        raise ValueError("dataset_loader yielded no batches; cannot construct a tree.")

    data, label = last_batch[0], last_batch[1]
    return construct_tree(data, label, n_estimators, tree_type)


def single_tree_prediction(
    tree: Union[XGBClassifier, XGBRegressor], n_tree: int, dataset: NDArray
) -> Optional[NDArray]:
    """Predict with a single boosting round of an xgboost ensemble.

    Args:
        tree: Fitted model exposing ``get_booster()`` and ``predict()``.
        n_tree: Zero-based index of the boosting round to evaluate.
        dataset: Features forwarded to ``tree.predict``.

    Returns:
        The result of ``tree.predict`` restricted to
        ``iteration_range=(n_tree, n_tree + 1)`` with ``output_margin=True``,
        or ``None`` (after printing a message) when ``n_tree`` is not below
        the number of boosted rounds.
    """
    # How to access a single tree
    # https://github.com/bmreiniger/datascience.stackexchange/blob/master/57905.ipynb
    #
    # Valid indices are 0 .. num_rounds - 1, so `n_tree == num_rounds` must be
    # rejected too (the previous `>` comparison let it through to `predict`).
    # Rounds are counted with `num_boosted_rounds()` because `iteration_range`
    # is expressed in boosting rounds.
    # NOTE(review): a multiclass booster is expected to dump more than one tree
    # per round, which would make `len(get_dump())` an over-count -- confirm.
    num_rounds = tree.get_booster().num_boosted_rounds()
    if n_tree >= num_rounds:
        print(
            "The tree index to be extracted is larger than the total number of trees."
        )
        return None

    return tree.predict(  # type: ignore
        dataset, iteration_range=(n_tree, n_tree + 1), output_margin=True
    )


def tree_encoding(  # pylint: disable=R0914
    trainloader: DataLoader,
    client_trees: Union[
        Tuple[XGBClassifier, int],
        Tuple[XGBRegressor, int],
        List[Union[Tuple[XGBClassifier, int], Tuple[XGBRegressor, int]]],
    ],
    client_tree_num: int,
    client_num: int,
) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
    """Encode a tabular dataset as per-tree predictions of the client ensembles.

    Only the LAST batch yielded by ``trainloader`` is encoded (element ``0`` is
    taken as features, element ``1`` as labels).
    NOTE(review): presumably the loader yields the whole dataset as one batch
    -- confirm against the caller.

    Args:
        trainloader: Iterable of batches, or ``None``.
        client_trees: Either a single ``(model, cid)`` tuple, or a list of such
            tuples. A single tuple, or a list whose length differs from
            ``client_num``, is expanded by repeating its first model
            ``client_num`` times. A list of exactly ``client_num`` entries is
            ordered by ascending ``cid``.
        client_tree_num: Number of boosting rounds evaluated per client model.
        client_num: Number of client models making up the encoding.

    Returns:
        ``None`` when ``trainloader`` is ``None``. Otherwise a pair of float32
        tensors: the encoded features with shape
        ``(n_samples, 1, client_num * client_tree_num)`` and the labels with a
        trailing axis of length 1 appended.

    Raises:
        ValueError: If ``trainloader`` yields no batches, or if a model cannot
            provide a prediction for one of the requested rounds.
    """
    if trainloader is None:
        return None

    local_dataset = None
    for local_dataset in trainloader:
        pass
    # Without this guard an empty loader would surface as an
    # UnboundLocalError on `x_train`.
    if local_dataset is None:
        raise ValueError("trainloader yielded no batches; nothing to encode.")
    x_train, y_train = local_dataset[0], local_dataset[1]

    # One column per (client, round) pair; np.zeros already returns a fresh
    # array, so no extra copy is needed.
    x_train_enc = np.zeros((x_train.shape[0], client_num * client_tree_num))

    temp_trees: Any = None
    if not isinstance(client_trees, list):
        temp_trees = [client_trees[0]] * client_num
    elif len(client_trees) != client_num:
        temp_trees = [client_trees[0][0]] * client_num
    else:
        cids = [entry[1] for entry in client_trees]  # type: ignore
        sorted_index = np.argsort(np.asarray(cids))
        # Reorder with plain list indexing so the models are never handed to
        # np.asarray, which would try to interpret them as array data.
        temp_trees = [client_trees[idx][0] for idx in sorted_index]  # type: ignore

    # NOTE(review): each prediction is stored in a single column; for a
    # multiclass model `output_margin=True` presumably yields one value per
    # class and would not fit -- confirm before using MULTICLASS trees here.
    for i, client_tree in enumerate(temp_trees):
        for j in range(client_tree_num):
            prediction = single_tree_prediction(client_tree, j, x_train)
            if prediction is None:
                # single_tree_prediction signals an out-of-range round with
                # None; assigning that into a float column fails cryptically.
                raise ValueError(
                    f"Model {i} has no tree at index {j}; "
                    f"client_tree_num={client_tree_num} exceeds its size."
                )
            x_train_enc[:, i * client_tree_num + j] = prediction

    x_train_enc32: Any = np.float32(x_train_enc)
    y_train32: Any = np.float32(y_train)

    return (
        torch.from_numpy(np.expand_dims(x_train_enc32, axis=1)),  # type: ignore
        torch.from_numpy(np.expand_dims(y_train32, axis=-1)),  # type: ignore
    )

In [13]:
import pandas as pd
from preprocessing import preprocess

# Machine-specific location of the per-client CSV files, kept in one place so
# it is easy to repoint. The resulting path strings are unchanged.
DATA_DIR = '/home/dnlab/Data-B/my_research/Geoscience_FL/data_well_log/cl_data'
LABEL_COLUMN = 'FORCE_2020_LITHOFACIES_LITHOLOGY'

train_csv = f'{DATA_DIR}/client_1_train.csv'
test_csv = f'{DATA_DIR}/client_1_test.csv'

train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

# Raw lithology codes -> contiguous class ids 0..11.
lithology_numbers = {30000: 0,
                        65030: 1,
                        65000: 2,
                        80000: 3,
                        74000: 4,
                        70000: 5,
                        70032: 6,
                        88000: 7,
                        86000: 8,
                        99000: 9,
                        90000: 10,
                        93000: 11}

# Map straight from the raw frames so re-running these lines never re-maps
# labels that were already converted.
lithology_train = train_data[LABEL_COLUMN].map(lithology_numbers)
lithology_test = test_data[LABEL_COLUMN].map(lithology_numbers)

# Series.map turns any code missing from the dict into NaN; fail here instead
# of letting NaN labels reach model fitting or evaluation.
assert lithology_train.notna().all(), "train labels contain codes missing from lithology_numbers"
assert lithology_test.notna().all(), "test labels contain codes missing from lithology_numbers"

# `preprocess` is the project-local helper imported above; its implementation
# is not part of this notebook.
train_dataset = preprocess(train_data)
test_dataset = preprocess(test_data)

Shape of concatenated dataframe before dropping columns: (635461, 29)
Shape of dataframe after dropping columns: (635461, 23)
Shape of dataframe after label encoding columns: (635461, 26)
Dataframe columns: Index(['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP',
       'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DCAL', 'DRHO',
       'MUDWEIGHT', 'RMIC', 'GROUP_encoded', 'FORMATION_encoded',
       'WELL_encoded'],
      dtype='object')
Shape of the dataset BEFORE augmentation: (635461, 23)
Shape of the dataset AFTER augmentation: (635461, 92)
Shape of concatenated dataframe before dropping columns: (69333, 28)
Shape of dataframe after dropping columns: (69333, 23)
Shape of dataframe after label encoding columns: (69333, 26)
Dataframe columns: Index(['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP',
       'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DCAL', 'DRHO',
       'MUDWEIGHT', 'RMIC', 'GROUP_encoded', 'FORMATION_

In [14]:
# Show which container types the features and the mapped labels arrive in.
print(type(train_dataset), type(lithology_train), sep="\n")

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [16]:
# Pull the underlying arrays out of the label Series and confirm their type.
train_labels, test_labels = lithology_train.values, lithology_test.values
print(type(train_labels), type(test_labels), sep="\n")

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [17]:
# Columns are features; rows are samples (len() of a 2-D array is its row count).
print("Feature dimension of the dataset:", train_dataset.shape[1])
print("Size of the trainset:", len(train_dataset))
print("Size of the testset:", len(test_dataset))

Feature dimension of the dataset: 92
Size of the trainset: 635461
Size of the testset: 69333


In [18]:
class TreeDataset(Dataset):
    """Dataset that pairs each row of a feature array with its label."""

    def __init__(self, data: NDArray, labels: NDArray) -> None:
        """Store the feature array and the label array without copying them."""
        self.data = data
        self.labels = labels

    def __len__(self) -> int:
        """Return the number of samples, taken from the label array."""
        return len(self.labels)

    def __getitem__(self, idx: int) -> Dict[int, NDArray]:
        """Return sample ``idx`` as ``{0: feature row, 1: label}``."""
        target = self.labels[idx]
        features = self.data[idx, :]
        return {0: features, 1: target}

In [None]:
# Wrap private copies of the arrays so the datasets do not alias the originals.
trainset = TreeDataset(data=np.copy(train_dataset), labels=np.copy(train_labels))
testset = TreeDataset(data=np.copy(test_dataset), labels=np.copy(test_labels))