In [55]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import StratifiedKFold


X_data = pd.read_csv('X_data.csv')
y_data = pd.read_csv('y_data.csv')

In [56]:
fold_number = 5

In [58]:
def objective(trial):
    params = {
        "num_leaves" : trial.suggest_int("num_leaves", 300, 700),
        "max_depth" : trial.suggest_int("max_depth", 10, 60),
        "learning_rate" : trial.suggest_float("learning_rate", 0.001, 1.0, log=True),
        "n_estimators" : trial.suggest_int("n_estimators", 500, 1000),
        "subsample" : trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree" : trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha" : trial.suggest_float("reg_alpha", 1e-7, 0.1, log=True),
        "reg_lambda" : trial.suggest_float("reg_lambda", 1e-3, 1e1, log=True),
        "min_data_in_leaf" : trial.suggest_int("min_data_in_leaf", 40, 120),
        "feature_fraction" : trial.suggest_float("feature_fraction", 0.5, 1.0),
        "data_sample_strategy" : "bagging",
        "bagging_fraction" : trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq" : trial.suggest_int("bagging_freq", 1, 5),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1e2, log=True),
        'cat_smooth': trial.suggest_float('cat_smooth', 10, 100, log=True),

        "log_period" : 1000,
        "random_state" : 129,
        "verbose" : -1}
    train_predictions = np.zeros(188533)

    skf = StratifiedKFold(n_splits=fold_number, shuffle=True, random_state=42)

    for fold_idx, (train_indices, val_indices) in enumerate(skf.split(X_data.head(188533), y_data)):
        X_train_fold, y_train_fold = X_data.iloc[train_indices], y_data.iloc[train_indices]
        X_val_fold, y_val_fold = X_data.iloc[val_indices], y_data.iloc[val_indices]

        train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
        valid_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

        model = lgb.train(
            params,
            train_data,
            num_boost_round=params['n_estimators'],
            valid_sets=[valid_data],
            callbacks=[lgb.early_stopping(stopping_rounds=50)]
        )

        train_predictions[val_indices] = model.predict(X_val_fold, num_iteration=model.best_iteration)

    rmse = mean_squared_error(train_predictions, y_data) ** 0.5

    return(rmse)
sampler = TPESampler(seed=10)  # Optional: Set a seed for reproducibility
study = optuna.create_study(sampler=sampler, direction='minimize')
study.optimize(objective, n_trials = 500)

[I 2024-09-21 21:07:50,939] A new study created in memory with name: no-name-946c926b-69d8-4f9d-b58f-e3a2bc534b8c




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	valid_0's l2: 5.26088e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[38]	valid_0's l2: 5.25629e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	valid_0's l2: 5.24277e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[46]	valid_0's l2: 5.2184e+09




Training until validation scores don't improve for 50 rounds


[I 2024-09-21 21:08:05,799] Trial 0 finished with value: 72436.92451000314 and parameters: {'num_leaves': 609, 'max_depth': 11, 'learning_rate': 0.07960579883076778, 'n_estimators': 875, 'subsample': 0.7492535061512953, 'colsample_bytree': 0.6123983227654238, 'reg_alpha': 1.54304001490974e-06, 'reg_lambda': 1.1018509458263566, 'min_data_in_leaf': 53, 'feature_fraction': 0.5441699070870052, 'bagging_fraction': 0.8426799091838986, 'bagging_freq': 5, 'min_child_weight': 0.00010560624429728648, 'cat_smooth': 32.52312463051321}. Best is trial 0 with value: 72436.92451000314.


Early stopping, best iteration is:
[39]	valid_0's l2: 5.2572e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	valid_0's l2: 5.2795e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	valid_0's l2: 5.25646e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[14]	valid_0's l2: 5.25919e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[21]	valid_0's l2: 5.25018e+09




Training until validation scores don't improve for 50 rounds


In [51]:
fold_size = 188533 // fold_number
indices = np.arange(188533)
train_predictions = np.zeros(188533)
test_predictions = np.zeros(125690)
np.random.shuffle(indices)
for fold_idx in range(fold_number):
    val_indices = indices[fold_idx * fold_size:(fold_idx + 1) * fold_size]
    train_indices = np.concatenate([indices[:fold_idx * fold_size], indices[(fold_idx + 1) * fold_size:]])

    X_train_fold, y_train_fold = X_data.iloc[train_indices], y_data.iloc[train_indices]
    X_val_fold, y_val_fold = X_data.iloc[val_indices], y_data.iloc[val_indices]

    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    valid_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)
    model = lgb.train(
        params,
        train_data,
        num_boost_round=params['n_estimators'],
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )
    train_predictions[val_indices] = model.predict(X_val_fold, num_iteration=model.best_iteration)
    test_predictions += model.predict(X_data[188533 : (188533+125690)], num_iteration = model.best_iteration) * (1 / fold_number)
rmse = mean_squared_error(train_predictions, y_data)**(0.5)



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[320]	valid_0's l2: 5.11776e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[193]	valid_0's l2: 4.1753e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[219]	valid_0's l2: 5.50358e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[269]	valid_0's l2: 6.15682e+09




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[216]	valid_0's l2: 5.19613e+09


In [52]:
print(rmse)

72317.7630552436


In [54]:
y_pred = pd.Series(test_predictions)
for idx in range(len(y_pred)):
    if(y_pred[idx] < 9000):
        y_pred[idx] = 9000
id = np.array([idx for idx in range(188533, 314223)])
prediction = pd.DataFrame({'id' : id, 'price' : y_pred})
prediction.to_csv('09212024carsprediction1.csv', index = False)