In [1]:
import os
import dask
import dask_cudf
import xgboost as xgb
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, train_test_split, KFold, StratifiedKFold
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import dask.dataframe as dd
from dask_ml.metrics import accuracy_score

In [2]:
GPUs = ','.join([str(i) for i in range(0,2)])
os.environ['CUDA_VISIBLE_DEVICES'] = GPUs

In [3]:
def get_cluster():
    cluster = LocalCUDACluster()
    client = Client(cluster)
    return client

In [4]:
client = get_cluster()

2024-02-07 19:27:13,499 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-t79avbm0', purging
2024-02-07 19:27:13,500 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-buijbrtd', purging
2024-02-07 19:27:13,501 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2024-02-07 19:27:13,501 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2024-02-07 19:27:13,521 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2024-02-07 19:27:13,521 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


INFO:distributed.core:Event loop was unresponsive in Nanny for 6.98s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.

INFO:distributed.core:Event loop was unresponsive in Scheduler for 6.99s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.

INFO:distributed.core:Event loop was unresponsive in Nanny for 7.01s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.



In [5]:
X_aras = pd.read_parquet('../data/04_feature/featured_master_table', engine='pyarrow')

In [6]:
y_aras = X_aras['LABEL']
X_aras = X_aras.drop(columns=['LABEL'])

In [7]:
X_aras_train, X_aras_test, y_aras_train, y_aras_test = train_test_split(X_aras, y_aras, test_size = 0.3, random_state = 0, stratify = y_aras)

In [8]:
X_aras_train.shape, X_aras_test.shape, y_aras_train.shape, y_aras_test.shape

((1811203, 31), (776230, 31), (1811203,), (776230,))

## Experiments

### Cross Validation

### 1. Model with XGBoost and encoded activities in one label. No windowing.

In [9]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'learning_rate': hp.uniform ('learning_rate', 0,0.5),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0,
    }

In [10]:
def objective(space):
    
    xgb_parms = { 
        'n_estimators': space['n_estimators'],
        'max_depth': int(space['max_depth']), 
        'gamma': space['gamma'],
        'reg_alpha': int(space['reg_alpha']),
        'min_child_weight': int(space['min_child_weight']),
        'learning_rate': space['learning_rate'], 
        'colsample_bytree': int(space['colsample_bytree']), 
        'eval_metric': 'auc',
        'tree_method':'gpu_hist',
        'predictor':'gpu_predictor',
        'random_state': 0,
        'early_stopping_rounds': 10,
        }
    
    clf = xgb.dask.train(
        client,
        xgb_parms,
        dtrain,
        num_boost_round=100,
        evals=[(dtrain, "train")],
    )
    
    pred = xgb.dask.predict(client, clf, X_test)
    print(f"pred {pred.compute()}")
    return

    accuracy = accuracy_score(y_test.to_dask_array(), pred.to_dask_array())
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [11]:
skf = StratifiedKFold(n_splits=3)
best_hyperparams = []

for i, (train_index, test_index) in enumerate(skf.split(X_aras_train, y_aras_train)):
    print(f"Fold {i}:")

    X_train = X_aras_train.iloc[train_index]
    X_train = dd.from_pandas(X_train, npartitions=2)
    
    y_train = y_aras_train.iloc[train_index]
    y_train = dd.from_pandas(y_train, npartitions=2)
    

    X_test = X_aras_train.iloc[test_index]
    X_test = dd.from_pandas(X_test, npartitions=2)

    y_test = y_aras_train.iloc[test_index]
    y_test = dd.from_pandas(y_test, npartitions=2)

    dtrain = xgb.dask.DaskQuantileDMatrix(client, X_train, y_train)

    trials = Trials()
    best_hyperparams += [fmin(fn = objective,
                            space = space,
                            algo = tpe.suggest,
                            max_evals = 100,
                            trials = trials)]
    
    print(f"Params {best_hyperparams[-1]}")
    break

Fold 0:
  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

INFO:distributed.scheduler:Receive client connection: Client-worker-939dd47b-c5e6-11ee-968d-00155d625880

INFO:distributed.core:Starting established connection to tcp://127.0.0.1:51388

INFO:distributed.scheduler:Receive client connection: Client-worker-93a3e561-c5e6-11ee-968b-00155d625880

INFO:distributed.core:Starting established connection to tcp://127.0.0.1:51398



[19:27:40] task [xgboost.dask-0]:tcp://127.0.0.1:40993 got new rank 0
[19:27:40] task [xgboost.dask-1]:tcp://127.0.0.1:44079 got new rank 1
Parameters: { "early_stopping_rounds", "n_estimators" } are not used.

Parameters: { "early_stopping_rounds", "n_estimators" } are not used.











[0]	train-auc:nan                                      
[1]	train-auc:nan                                      
[2]	train-auc:nan                                      
[3]	train-auc:nan                                      
[4]	train-auc:nan                                      
[5]	train-auc:nan                                      
[6]	train-auc:nan                                      
[7]	train-auc:nan                                      
[8]	train-auc:nan                                      
[9]	train-auc:nan                                      
[10]	train-auc:nan                                     
[11]	train-auc:nan                                     
[12]	train-auc:nan                                     
[13]	train-auc:nan                                     
[14]	train-auc:nan                                     
[15]	train-auc:nan                                     
[16]	train-auc:nan                                     
[17]	train-auc:nan                              

pred 4          87.221710                              
6          87.221710
10         86.263725
14         86.263725
16         86.402061
             ...    
2587403    22.512377
2587412    22.512377
2587413    22.512377
2587414    22.512377
2587429    22.512377
Name: 0, Length: 603735, dtype: float32
  0%|          | 0/100 [00:07<?, ?trial/s, best loss=?]

ERROR:hyperopt.fmin:job exception: 'NoneType' object is not iterable



  0%|          | 0/100 [00:07<?, ?trial/s, best loss=?]


TypeError: 'NoneType' object is not iterable

In [None]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

### Not used ↓↓↓↓

In [None]:
X_aras_train = X_aras.loc[(
    (X_aras['DAY_26'] != 1) & 
    (X_aras['DAY_27'] != 1) & 
    (X_aras['DAY_28'] != 1) & 
    (X_aras['DAY_29'] != 1) &
    (X_aras['DAY_30'] != 1) 
)]

X_aras_test = X_aras.loc[(
    (X_aras['DAY_26'] == 1) | 
    (X_aras['DAY_27'] == 1) | 
    (X_aras['DAY_28'] == 1) | 
    (X_aras['DAY_29'] == 1) | 
    (X_aras['DAY_30'] == 1) 
)]

In [None]:
y_aras_train = X_aras_train['LABEL']
y_aras_test = X_aras_test['LABEL']


X_aras_train = X_aras_train.drop(columns=['LABEL'])
X_aras_test = X_aras_test.drop(columns=['LABEL'])

In [None]:
X_aras_train.shape, y_aras_train.shape, X_aras_test.shape, y_aras_test.shape

In [None]:
from sklearn.datasets import make_multilabel_classification

In [None]:
X, y = make_multilabel_classification(n_samples=3000, n_features=45, n_classes=20, n_labels=1,
                                      allow_unlabeled=False, random_state=42)

In [None]:
X