# Categorical Feature Encoding Challenge II

In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import category_encoders as ce

Function to load train and test datasets

In [None]:
def load_data():
    train = pd.read_csv("data/cfec_train.csv")
    test = pd.read_csv("data/cfec_test.csv")
    target_col = "target" if "target" in train.columns else train.columns[-1]
    return train, test, target_col


train, test, target_col = load_data()

In [16]:
categorical_cols = train.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = train.select_dtypes(include=["int64", "float64"]).columns.tolist()
numerical_cols.remove(target_col)

In [17]:
imputer = SimpleImputer(strategy="most_frequent")
train[categorical_cols] = imputer.fit_transform(train[categorical_cols])
test[categorical_cols] = imputer.transform(test[categorical_cols])

In [18]:
encoder = ce.TargetEncoder(cols=categorical_cols)
train[categorical_cols] = encoder.fit_transform(
    train[categorical_cols], train[target_col]
)
test[categorical_cols] = encoder.transform(test[categorical_cols])

In [19]:
scaler = StandardScaler()
train[numerical_cols] = scaler.fit_transform(train[numerical_cols])
test[numerical_cols] = scaler.transform(test[numerical_cols])

In [20]:
X = train.drop(columns=[target_col])
y = train[target_col]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Function to train and evaluate a model using accuracy score

In [9]:
def evaluate_model(model, X_train, X_val, y_train, y_val):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    print(f"Model: {model.__class__.__name__} | Accuracy: {score:.4f}")
    return score

Model Evaluation using accuracy score

In [22]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(eval_metric="logloss")
lgbm = LGBMClassifier()

evaluate_model(rf, X_train, X_val, y_train, y_val)
evaluate_model(xgb, X_train, X_val, y_train, y_val)
evaluate_model(lgbm, X_train, X_val, y_train, y_val)

Model: RandomForestClassifier | Accuracy: 0.8218
Model: XGBClassifier | Accuracy: 0.8255
[LightGBM] [Info] Number of positive: 89858, number of negative: 390142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1789
[LightGBM] [Info] Number of data points in the train set: 480000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187204 -> initscore=-1.468280
[LightGBM] [Info] Start training from score -1.468280
Model: LGBMClassifier | Accuracy: 0.8272


0.8272

In [23]:
param_grid_xgb = {
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.1],
    "n_estimators": [100, 200],
}

grid_xgb = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    param_grid_xgb,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)
grid_xgb.fit(X_train, y_train)
print(f"Best XGBoost params: {grid_xgb.best_params_}")

Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.



Best XGBoost params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}


LightGBM Hyperparameter Tuning

In [24]:
param_grid_lgbm = {
    "num_leaves": [20, 31],
    "learning_rate": [0.01, 0.05],
    "n_estimators": [100, 300],
}

random_lgbm = RandomizedSearchCV(
    LGBMClassifier(),
    param_grid_lgbm,
    n_iter=5,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)
random_lgbm.fit(X_train, y_train)
print(f"Best LightGBM params: {random_lgbm.best_params_}")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[LightGBM] [Info] Number of positive: 89858, number of negative: 390142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1789
[LightGBM] [Info] Number of data points in the train set: 480000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187204 -> initscore=-1.468280
[LightGBM] [Info] Start training from score -1.468280
Best LightGBM params: {'num_leaves': 20, 'n_estimators': 300, 'learning_rate': 0.05}


Final LightGBM Model Training and Evaluation

In [25]:
final_model = LGBMClassifier(**random_lgbm.best_params_)
final_model.fit(X_train, y_train)
y_final_pred = final_model.predict(X_val)

final_acc = accuracy_score(y_val, y_final_pred)
print(f"Final Model Accuracy: {final_acc:.4f}")

[LightGBM] [Info] Number of positive: 89858, number of negative: 390142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1789
[LightGBM] [Info] Number of data points in the train set: 480000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187204 -> initscore=-1.468280
[LightGBM] [Info] Start training from score -1.468280
Final Model Accuracy: 0.8274


# Regression with a Tabular Gemstone Price Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import category_encoders as ce

In [2]:
gemstone = pd.read_csv("data/gemstone.csv")

X = gemstone.drop(columns=["price"])
y = gemstone["price"]

Handling Missing Values in Categorical Columns

In [3]:
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

imputer = SimpleImputer(strategy="most_frequent")
X[categorical_cols] = imputer.fit_transform(X[categorical_cols])

In [4]:
encoder = ce.TargetEncoder(cols=categorical_cols)
X[categorical_cols] = encoder.fit_transform(X[categorical_cols], y)

In [5]:
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

Encoding Ordinal Categorical Features

In [6]:
X["cut"] = X["cut"].map({"Very Good": 4, "Good": 3, "Fair": 2, "Poor": 1})
X["color"] = X["color"].map({"D": 7, "E": 6, "F": 5, "G": 4, "H": 3, "I": 2, "J": 1})
X["clarity"] = X["clarity"].map(
    {"IF": 8, "VVS1": 7, "VVS2": 6, "VS1": 5, "VS2": 4, "SI1": 3, "SI2": 2, "I1": 1}
)

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Model Evaluation Function

In [8]:
def evaluate_model(model, X_train, X_val, y_train, y_val):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    print(f"Model: {model.__class__.__name__} | Accuracy: {score:.4f}")
    return score

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

rf = RandomForestRegressor(n_estimators=10, random_state=42)
xgb = XGBRegressor()
lgbm = LGBMRegressor()

evaluate_model(rf, X_train, X_val, y_train, y_val)
evaluate_model(xgb, X_train, X_val, y_train, y_val)
evaluate_model(lgbm, X_train, X_val, y_train, y_val)

In [None]:
param_grid_xgb = {
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.1],
    "n_estimators": [100, 200],
}

grid_xgb = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    param_grid_xgb,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)

In [None]:
param_grid_lgbm = {
    "num_leaves": [20, 31],
    "learning_rate": [0.01, 0.05],
    "n_estimators": [100, 300],
}

random_lgbm = RandomizedSearchCV(
    LGBMClassifier(),
    param_grid_lgbm,
    n_iter=5,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)

In [None]:
final_model = LGBMClassifier(**random_lgbm.best_params_)
final_model.fit(X_train, y_train)
y_final_pred = final_model.predict(X_val)

final_acc = accuracy_score(y_val, y_final_pred)
print(f"Final Model Accuracy: {final_acc:.4f}")