In [11]:
# Import Packages

import pandas as pd
import numpy as np



from sklearn.preprocessing import StandardScaler, MinMaxScaler,OneHotEncoder,LabelEncoder


from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

import optuna

from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score
# from catboost import CatBoostClassifier
from xgboost import XGBClassifier


In [2]:
# Read the data from the "Data" folder
train_data_label = pd.read_csv('Data/training_set_labels.csv')
train_data_values = pd.read_csv('Data/training_set_values.csv')


In [27]:
X_train = train_data_values.copy()
y_train = train_data_label[['status_group']].values.ravel()


### XGboost

In [28]:
#### ---------------------- complete steps to go through one iteration ---------------------- ####

# Encoding the target variable
target_encoder = LabelEncoder().fit(y_train)
y_train_encoded = target_encoder.transform(y_train)


# Columns to drop and to apply one-hot encoding
columns_to_drop = ['id','date_recorded']
columns_to_encode = ['funder','installer','wpt_name','basin','subvillage','region','lga','ward','public_meeting'
                     ,'recorded_by','scheme_management','scheme_name','permit','extraction_type','extraction_type_group'
                     ,'extraction_type_class','management','management_group','payment','payment_type','water_quality'
                     ,'quality_group','quantity','quantity_group','source','source_type','source_class','waterpoint_type'
                     ,'waterpoint_type_group']

# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('drop', 'drop', columns_to_drop),
        ('onehot', OneHotEncoder(), columns_to_encode)
    ],
    remainder='passthrough'  # keeps the columns not specified in transformers
)
# Define the CatBoost model
xg_model = XGBClassifier(verbose=False)
# Create pipeline
pipeline_1 = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', xg_model)])


# Perform cross-validation and then calculate scores
pipeline_1.fit(X_train, y_train_encoded)



Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [None]:
# # Make predictions
# predictions_encoded = pipeline_1.predict(X_test)

# predictions = target_encoder.inverse_transform(predictions_encoded)

In [38]:
####### ---------------------- optuna to tune ---------------------- #######

# Encoding the target variable
target_encoder = LabelEncoder().fit(y_train)
y_train_encoded = target_encoder.transform(y_train)
def objective(trial,X,y):
    # Hyperparameters to tune

    param = {
        'objective': 'multi:softprob',
        'num_class': 3,
        'eval_metric': 'mlogloss',  
        'n_estimators': trial.suggest_int("n_estimators", 100, 1000),  
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        'max_depth': trial.suggest_int("max_depth", 3, 10),  
        'min_child_weight': trial.suggest_int("min_child_weight", 1, 10),  
        'subsample': trial.suggest_float("subsample", 0.5, 1.0),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),  
        'gamma': trial.suggest_float("gamma", 0, 5),  
        'reg_lambda': trial.suggest_float("reg_lambda", 1e-2, 1.0, log=True),  
        'reg_alpha': trial.suggest_float("reg_alpha", 1e-2, 1.0, log=True)  
    }
    

    columns_to_drop = ['id','date_recorded']
    columns_to_encode = ['funder','installer','wpt_name','basin','subvillage','region','lga','ward','public_meeting'
                        ,'recorded_by','scheme_management','scheme_name','permit','extraction_type','extraction_type_group'
                        ,'extraction_type_class','management','management_group','payment','payment_type','water_quality'
                        ,'quality_group','quantity','quantity_group','source','source_type','source_class','waterpoint_type'
                        ,'waterpoint_type_group']
    # Column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('drop', 'drop', columns_to_drop),
            ('onehot', OneHotEncoder(handle_unknown='ignore'), columns_to_encode)

        ],
        remainder='passthrough'  # keeps the columns not specified in transformers
    )

    xg_model = XGBClassifier(**param)
    # Create pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', xg_model)])

    ########### CHANGE CV to higher number too - my laptop is crap ##################
    scores = cross_val_score(pipeline, X, y, cv=3, scoring="f1_macro")


    return np.mean(scores)


# Create a study object and specify the direction is 'maximize'.
study = optuna.create_study(direction='maximize')

# Start the optimization

########### CHANGE ntrails to higher number too - my laptop is crap ##################
study.optimize(lambda trial: objective(trial, X_train, y_train_encoded), n_trials=1,  gc_after_trial=True)


print(study.best_params)

[I 2024-01-12 12:28:08,446] A new study created in memory with name: no-name-5510c16e-3290-42d5-8113-39965277ea5d


[I 2024-01-12 12:33:26,562] Trial 0 finished with value: 0.4793813316645279 and parameters: {'n_estimators': 504, 'learning_rate': 0.0027992002193266406, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.829830554105345, 'colsample_bytree': 0.8558971605279979, 'gamma': 1.6857607369227356, 'reg_lambda': 0.9021657559978916, 'reg_alpha': 0.11894007301599628}. Best is trial 0 with value: 0.4793813316645279.


{'n_estimators': 504, 'learning_rate': 0.0027992002193266406, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.829830554105345, 'colsample_bytree': 0.8558971605279979, 'gamma': 1.6857607369227356, 'reg_lambda': 0.9021657559978916, 'reg_alpha': 0.11894007301599628}
