In [1]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score,balanced_accuracy_score
from sklearn.utils import shuffle

from plotly import express as px

#from UA_MDM_LDI_II.tutoriales.utils import plot_confusion_matrix
from utils import plot_confusion_matrix

import os

import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact

from joblib import load, dump


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Paths -- every location below is BASE_DIR joined with a fixed relative path
BASE_DIR = '../../'
PATH_TO_TRAIN, PATH_TO_MODELS, PATH_TO_TEMP_FILES, PATH_TO_OPTUNA_ARTIFACTS = (
    os.path.join(BASE_DIR, relative_path)
    for relative_path in (
        "input/petfinder-adoption-prediction/train/train.csv",
        "UA_MDM_LDI_II/work/models",
        "UA_MDM_LDI_II/work/optuna_temp_artifacts",
        "UA_MDM_LDI_II/work/optuna_artifacts",
    )
)

# Experiment-wide constants: random seed, batch size, hold-out fraction
SEED, BATCH_SIZE, TEST_SIZE = 42, 50, 0.2

In [3]:
# Tabular data: read the CSV at PATH_TO_TRAIN into a DataFrame
dataset = pd.read_csv(PATH_TO_TRAIN)

In [4]:
# Show the column names of the loaded table
dataset.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')

In [5]:
# Hold-out split of the full table, stratified on the AdoptionSpeed column
train, test = train_test_split(
    dataset,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=dataset["AdoptionSpeed"],
)

In [6]:
# Partition the columns by dtype: object ('O') columns versus all the others
_is_object_column = {column: dataset[column].dtype == 'O' for column in dataset.columns}
char_feats = [column for column, is_object in _is_object_column.items() if is_object]
numeric_feats = [column for column, is_object in _is_object_column.items() if not is_object]

In [7]:
# Columns fed to the models
features = [
    'Type', 'Age', 'Breed1', 'Breed2', 'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
    'Quantity', 'Fee', 'State',
    'VideoAmt', 'PhotoAmt',
]

# Target column
label = 'AdoptionSpeed'

In [8]:
# Feature matrix and target vector for each side of the split
X_train, y_train = train[features], train[label]
X_test, y_test = test[features], test[label]

In [9]:
# Distinct target values present in the training split
y_train.unique()

array([2, 3, 4, 1, 0], dtype=int64)

In [10]:
# Baseline model: 'multiclass' objective with one class per distinct training label;
# every other LightGBM setting is left at its default.
lgb_params = params = dict(
    objective='multiclass',
    num_class=len(y_train.unique()),
)

lgb_train_dataset = lgb.Dataset(X_train, label=y_train)

lgb_model = lgb.train(params=lgb_params, train_set=lgb_train_dataset)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001838 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 438
[LightGBM] [Info] Number of data points in the train set: 11994, number of used features: 19
[LightGBM] [Info] Start training from score -3.599148
[LightGBM] [Info] Start training from score -1.579379
[LightGBM] [Info] Start training from score -1.311924
[LightGBM] [Info] Start training from score -1.526206
[LightGBM] [Info] Start training from score -1.273359


In [11]:
# Predicted class = index of the largest score in each row of the prediction matrix
y_pred = np.argmax(lgb_model.predict(X_test), axis=1)

# Cell output: quadratic-weighted kappa of the predictions on the test split
cohen_kappa_score(y_test, y_pred, weights='quadratic')

0.31327315052086113

In [12]:
# Display what the plot_confusion_matrix helper builds for the test labels vs. the predictions
display(plot_confusion_matrix(y_test,y_pred))

In [13]:
# Reference point: score the test labels against themselves (perfect agreement)
cohen_kappa_score(y_test,y_test, weights = 'quadratic')

1.0

In [14]:
# Same helper, with the test labels compared against themselves
display(plot_confusion_matrix(y_test,y_test))

In [15]:

# Permuted copy of the test labels (fixed random_state)
y_shuffled = shuffle(y_test, random_state=42)

# "cerca" (near): every class is replaced by a class exactly one step away
dict_map_cerca = {0: 1, 1: 2, 2: 3, 3: 4, 4: 3}

# "lejos" (far): every class is replaced by a class at least two steps away
dict_map_lejos = {0: 4, 1: 4, 2: 0, 3: 0, 4: 0}

# Wrong-label versions of y_test produced by each mapping
y_cerca = [dict_map_cerca[true_class] for true_class in y_test]
y_lejos = [dict_map_lejos[true_class] for true_class in y_test]


In [16]:
# One uniform draw per test sample. At level i, a sample keeps its true label when its
# draw is below i/100, so the set of "known" samples only grows as i increases.
# The generator is seeded with SEED so the simulated curves are identical on every run.
rng = np.random.default_rng(SEED)
random_list = rng.random(len(y_test))

# Positional arrays of each label source, so every level is built with one vectorised select
y_true_values = y_test.to_numpy()
y_shuffled_values = y_shuffled.to_numpy()
y_cerca_values = np.asarray(y_cerca)
y_lejos_values = np.asarray(y_lejos)

progression_rows = []

for i in range(101):

    # True where the sample's label is treated as known at this level
    known = random_list < i/100

    # Unknown samples receive, respectively: a shuffled label, a "near" label, a "far" label
    y_simulado = np.where(known, y_true_values, y_shuffled_values)
    y_simulado_cerca = np.where(known, y_true_values, y_cerca_values)
    y_simulado_lejos = np.where(known, y_true_values, y_lejos_values)

    progression_rows.append({
        'Conocidos': i,
        'kappa': cohen_kappa_score(y_test, y_simulado, weights='quadratic'),
        'kappa_cerca': cohen_kappa_score(y_test, y_simulado_cerca, weights='quadratic'),
        'kappa_lejos': cohen_kappa_score(y_test, y_simulado_lejos, weights='quadratic'),
        'accuracy': accuracy_score(y_test, y_simulado),
        'balanced_accuracy': balanced_accuracy_score(y_test, y_simulado),
    })

# Build the frame once from the collected rows instead of concatenating inside the loop
kappa_progression = pd.DataFrame(progression_rows)

In [17]:
# One line per metric, plotted against the 'Conocidos' level
px.line(
    kappa_progression,
    x='Conocidos',
    y=['kappa', 'kappa_cerca', 'kappa_lejos', 'accuracy', 'balanced_accuracy'],
)

In [18]:
# 50% level with "cerca" labels: true label where the draw is below 0.5, "cerca" label otherwise
y_simulado_cerca = [
    y_test.iloc[position] if random_list[position] < 50/100 else y_cerca[position]
    for position in range(len(y_test))
]
kappa_cerca_title = "Kappa " + str(cohen_kappa_score(y_test, y_simulado_cerca, weights='quadratic'))
display(plot_confusion_matrix(y_test, y_simulado_cerca, title=kappa_cerca_title))


# Same level, with "lejos" labels for the samples above the threshold
y_simulado_lejos = [
    y_test.iloc[position] if random_list[position] < 50/100 else y_lejos[position]
    for position in range(len(y_test))
]
kappa_lejos_title = "Kappa " + str(cohen_kappa_score(y_test, y_simulado_lejos, weights='quadratic'))
display(plot_confusion_matrix(y_test, y_simulado_lejos, title=kappa_lejos_title))


In [19]:
# Same training as the baseline, but with the 'multiclassova' objective
lgb_params = params = dict(
    objective='multiclassova',
    num_class=len(y_train.unique()),
)

lgb_train_dataset = lgb.Dataset(X_train, label=y_train)

lgb_model = lgb.train(params=lgb_params, train_set=lgb_train_dataset)

[LightGBM] [Info] Number of positive: 328, number of negative: 11666
[LightGBM] [Info] Number of positive: 2472, number of negative: 9522
[LightGBM] [Info] Number of positive: 3230, number of negative: 8764
[LightGBM] [Info] Number of positive: 2607, number of negative: 9387
[LightGBM] [Info] Number of positive: 3357, number of negative: 8637
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001626 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 438
[LightGBM] [Info] Number of data points in the train set: 11994, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.027347 -> initscore=-3.571420
[LightGBM] [Info] Start training from score -3.571420
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206103 -> initscore=-1.348577
[LightGBM] [Info] Start training from score -1.348577
[LightGBM] [Info] [binary:BoostFrom

In [20]:

# Predicted class = index of the largest score in each row
y_pred = np.argmax(lgb_model.predict(X_test), axis=1)

display(plot_confusion_matrix(y_test, y_pred))

# Cell output: the three summary metrics on the test split
dict(
    kappa=cohen_kappa_score(y_test, y_pred, weights='quadratic'),
    accuracy=accuracy_score(y_test, y_pred),
    balanced_accuracy=balanced_accuracy_score(y_test, y_pred),
)




{'kappa': 0.33093403829656054,
 'accuracy': 0.3914638212737579,
 'balanced_accuracy': 0.32177012028422647}

In [21]:
def lgb_objective(trial):
    """Optuna objective: train one multiclass LightGBM model and score it.

    The model is fitted on the module-level ``X_train`` / ``y_train`` with the
    hyperparameters sampled from ``trial``. The value returned to Optuna is the
    quadratic-weighted Cohen's kappa measured on ``X_test`` / ``y_test``, i.e. the
    search is driven directly by the test split.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.

    Returns
    -------
    Quadratic-weighted kappa of the argmax predictions on the test split.
    """
    fixed_params = {
        'objective': 'multiclass',
        'verbosity': -1,
        'num_class': len(y_train.unique()),
    }

    # Search space: L1/L2 regularisation on a log scale, tree size, feature/row
    # sub-sampling and minimum samples per leaf.
    sampled_params = {
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    booster = lgb.train(
        {**fixed_params, **sampled_params},
        lgb.Dataset(data=X_train, label=y_train),
    )

    predicted_classes = booster.predict(X_test).argmax(axis=1)
    return cohen_kappa_score(y_test, predicted_classes, weights='quadratic')

In [22]:
# Create the study in the SQLite storage, or reopen it if that name already exists there,
# then run 100 trials of the hold-out objective.
study = optuna.create_study(
    study_name="04 - LGB Multiclass",
    storage="sqlite:///db.sqlite3",
    direction='maximize',
    load_if_exists=True,
)
study.optimize(lgb_objective, n_trials=100)

[I 2024-06-19 18:21:08,668] Using an existing study with name '04 - LGB Multiclass' instead of creating a new one.
[I 2024-06-19 18:21:11,093] Trial 100 finished with value: 0.31099857740724135 and parameters: {'lambda_l1': 0.7601403812211934, 'lambda_l2': 1.2664669824477846, 'num_leaves': 221, 'feature_fraction': 0.7252241416684759, 'bagging_fraction': 0.4535838845654929, 'bagging_freq': 1, 'min_child_samples': 80}. Best is trial 14 with value: 0.3285398292984908.
[I 2024-06-19 18:21:12,392] Trial 101 finished with value: 0.31244760951258477 and parameters: {'lambda_l1': 4.773483668150672, 'lambda_l2': 0.34465271339556247, 'num_leaves': 183, 'feature_fraction': 0.5375869277025375, 'bagging_fraction': 0.4269807627868867, 'bagging_freq': 2, 'min_child_samples': 97}. Best is trial 14 with value: 0.3285398292984908.
[I 2024-06-19 18:21:15,398] Trial 102 finished with value: 0.3076220466428642 and parameters: {'lambda_l1': 1.932933283868399e-05, 'lambda_l2': 0.10902085197660201, 'num_leave

In [23]:
# Fixed settings first; the study's best hyperparameters are layered on top of them
lgb_params = {
    'objective': 'multiclass',
    'verbosity': -1,
    'num_class': len(y_train.unique()),
    **study.best_params,
}

lgb_train_dataset = lgb.Dataset(X_train, label=y_train)

lgb_model = lgb.train(params=lgb_params, train_set=lgb_train_dataset)

# Compare the test labels with the argmax predictions of the refitted model
best_pred = lgb_model.predict(X_test).argmax(axis=1)
display(plot_confusion_matrix(y_test, best_pred))


In [24]:
def lgb_custom_metric_kappa(dy_pred, dy_true):
    """Custom LightGBM evaluation function: quadratic-weighted Cohen's kappa.

    Parameters
    ----------
    dy_pred : array of multiclass scores handed over by LightGBM. Either 2-D with
        one row per sample and one column per class, or the flattened 1-D layout
        used by LightGBM releases before 4.0 (grouped by class first, then by row).
    dy_true : object exposing ``get_label()`` (the evaluation dataset).

    Returns
    -------
    tuple ``(metric_name, value, is_higher_better)`` as expected for ``feval``.
    """
    labels = dy_true.get_label()
    scores = np.asarray(dy_pred)

    if scores.ndim == 1:
        # Flattened layout: element [class_id * num_data + row_id]. Rebuild the
        # (num_data, num_class) matrix so the per-row argmax below is valid.
        scores = scores.reshape(-1, len(labels)).T

    metric_name = 'kappa'
    value = cohen_kappa_score(labels, scores.argmax(axis=1), weights='quadratic')
    is_higher_better = True
    return (metric_name, value, is_higher_better)

def cv_es_lgb_objective(trial):
    """Optuna objective: 5-fold stratified CV with early stopping on each fold.

    For each fold a multiclass LightGBM model is trained on the in-fold rows of
    ``X_train`` / ``y_train`` with the out-of-fold rows as validation set, an
    ``early_stopping(10)`` callback and ``lgb_custom_metric_kappa`` as ``feval``.
    The out-of-fold quadratic kappa of every fold is averaged and returned to
    Optuna. Test-split predictions of the fold models are summed.

    Side effects, per trial:
      * the test frame with a ``pred`` column (summed class scores per row) is
        dumped with joblib into ``PATH_TO_TEMP_FILES`` and uploaded as an artifact;
      * the confusion-matrix figure for the test split is written as a ``.jpg``
        in the same directory and uploaded as an artifact;
      * the test-split kappa is stored in the ``test_score`` user attribute.

    Reads the module-level ``artifact_store`` at call time, so it must be defined
    before the study is optimised.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.

    Returns
    -------
    Mean out-of-fold quadratic-weighted kappa across the folds.
    """
    n_classes = len(y_train.unique())

    lgb_params = {
                        'objective': 'multiclass',
                        'verbosity':-1,
                        'num_class': n_classes,
                        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
                        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
                        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
                        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
                        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
                        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
                        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                        }

    # Running sum of the fold models' class scores on the test split
    scores_ensemble = np.zeros((len(y_test), n_classes))
    score_folds = 0
    n_splits = 5

    skf = StratifiedKFold(n_splits=n_splits)

    for if_index, oof_index in skf.split(X_train, y_train):

        X_oof = X_train.iloc[oof_index]
        y_oof = y_train.iloc[oof_index]

        lgb_if_dataset = lgb.Dataset(data=X_train.iloc[if_index],
                                     label=y_train.iloc[if_index],
                                     free_raw_data=False)

        lgb_oof_dataset = lgb.Dataset(data=X_oof,
                                      label=y_oof,
                                      free_raw_data=False)

        lgb_model = lgb.train(lgb_params,
                              lgb_if_dataset,
                              valid_sets=[lgb_oof_dataset],
                              callbacks=[lgb.early_stopping(10, verbose=False)],
                              feval=lgb_custom_metric_kappa)

        # Accumulate this fold's test-split scores into the ensemble
        scores_ensemble = scores_ensemble + lgb_model.predict(X_test)

        # Each fold contributes 1/n_splits of its out-of-fold kappa to the mean
        fold_kappa = cohen_kappa_score(y_oof,
                                       lgb_model.predict(X_oof).argmax(axis=1),
                                       weights='quadratic')
        score_folds = score_folds + fold_kappa/n_splits

    # The temp directory is not guaranteed to exist on a fresh checkout
    os.makedirs(PATH_TO_TEMP_FILES, exist_ok=True)

    # Ensemble class = argmax of the summed scores (same argmax as the averaged scores)
    test_pred = scores_ensemble.argmax(axis=1)

    predicted_filename = os.path.join(PATH_TO_TEMP_FILES,f'test_{trial.study.study_name}_{trial.number}.joblib')
    predicted_df = test.copy()
    predicted_df['pred'] = [scores_ensemble[p,:] for p in range(scores_ensemble.shape[0])]
    dump(predicted_df, predicted_filename)
    # Keyword arguments: accepted by the 3.x signature and required by the 4.x one
    upload_artifact(study_or_trial=trial, file_path=predicted_filename, artifact_store=artifact_store)

    cm_filename = os.path.join(PATH_TO_TEMP_FILES,f'cm_{trial.study.study_name}_{trial.number}.jpg')
    plot_confusion_matrix(y_test, test_pred).write_image(cm_filename)
    upload_artifact(study_or_trial=trial, file_path=cm_filename, artifact_store=artifact_store)

    test_score = cohen_kappa_score(y_test, test_pred, weights='quadratic')
    trial.set_user_attr("test_score", test_score)

    return score_folds

In [25]:
# Create the artifact directory up front so the store also works on a fresh checkout
os.makedirs(PATH_TO_OPTUNA_ARTIFACTS, exist_ok=True)
artifact_store = FileSystemArtifactStore(base_path=PATH_TO_OPTUNA_ARTIFACTS)

# Create the CV study in the SQLite storage, or reopen it if that name already exists there
study = optuna.create_study(direction='maximize',
                            storage="sqlite:///db.sqlite3",  # Specify the storage URL here.
                            study_name="04 - LGB Multiclass CV",
                            load_if_exists = True)

study.optimize(cv_es_lgb_objective, n_trials=100)


FileSystemArtifactStore is experimental (supported from v3.3.0). The interface can change in the future.

[I 2024-06-19 18:25:24,752] Using an existing study with name '04 - LGB Multiclass CV' instead of creating a new one.

upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.


upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.

[I 2024-06-19 18:25:40,358] Trial 2 finished with value: 0.3395239660335658 and parameters: {'lambda_l1': 8.868575952232886, 'lambda_l2': 1.983358513799663e-08, 'num_leaves': 4, 'feature_fraction': 0.9196583664867848, 'bagging_fraction': 0.5647838736487406, 'bagging_freq': 3, 'min_child_samples': 16}. Best is trial 2 with value: 0.3395239660335658.

upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.


upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.

[I 2024-06-19 18:25:50,157] 

In [27]:
# Launch optuna-dashboard on the SQLite storage file, with a relative artifact directory.
# NOTE(review): the recorded output is ^C, i.e. the command was interrupted by hand.
!optuna-dashboard sqlite:///db.sqlite3 --artifact-dir ../work/optuna_artifacts/

^C


In [None]:
!optuna-dashboard sqlite:///db.sqlite3 --artifact-dir C:\\\Users\\Usuario\\Documents\\Universidad\\austral\\2024\\lab2\\repos\\Personal\\UA_MDM_LDI_II\\work\\optuna_artifacts

In [27]:
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-win_amd64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-win_amd64.whl (65.9 MB)
   ---------------------------------------- 0.0/65.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/65.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/65.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/65.9 MB 262.6 kB/s eta 0:04:12
   ---------------------------------------- 0.0/65.9 MB 279.3 kB/s eta 0:03:56
   ---------------------------------------- 0.1/65.9 MB 847.9 kB/s eta 0:01:18
   ---------------------------------------- 0.4/65.9 MB 2.1 MB/s eta 0:00:32
    --------------------------------------- 1.0/65.9 MB 4.3 MB/s eta 0:00:16
   - -------------------------------------- 1.9/65.9 MB 6.8 MB/s eta 0:00:10
   -- ------------------------------------- 3.4/65.9 MB 10.2 MB/s eta 0:00:07
   -- ------------------------------------- 4.8/65.9 MB 13.3 MB/s eta 0:00:05
   --- -----