In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import lightgbm as lgb

import optuna

In [2]:
# Load the data
train_data = pd.read_csv('internship_train.csv')
submission_data = pd.read_csv('internship_hidden_test.csv')

In [3]:
# Split the training data into features and target
X_train_all = train_data.drop('target', axis=1)
y_train_all = train_data['target']

In [4]:
# Split the test data into features
X_submission = submission_data

In [5]:
# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_all = scaler.fit_transform(X_train_all)
X_submission = scaler.transform(X_submission)

In [6]:
# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.2, random_state=42)

In [7]:
# Split the training data into training and tests sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [8]:
%%time
# Train and evaluate a Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)
mse_lr = mean_squared_error(y_val, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
print('Linear Regression RMSE on val data:', rmse_lr)

Linear Regression RMSE on val data: 29.018063147758934
CPU times: user 309 ms, sys: 63.8 ms, total: 372 ms
Wall time: 153 ms


In [9]:
%%time
# Train and evaluate a Random Forest Regressor model
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
y_pred_rfr = rfr.predict(X_val)
mse_rfr = mean_squared_error(y_val, y_pred_rfr)
rmse_rfr = np.sqrt(mse_rfr)
print('Random Forest Regressor RMSE on val data:', rmse_rfr)

Random Forest Regressor RMSE on val data: 0.004382381328900737
CPU times: user 5min 15s, sys: 582 ms, total: 5min 16s
Wall time: 5min 16s


In [10]:
%%time
# Train and evaluate a Gradient Boosting Regressor model
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_val)
mse_gb = mean_squared_error(y_val, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
print('Gradient Boosting Regressor RMSE on val data:', rmse_gb)

Gradient Boosting Regressor RMSE on val data: 0.4172525678617058
CPU times: user 2min 28s, sys: 20.8 ms, total: 2min 28s
Wall time: 2min 29s


In [11]:
%%time
# Train and evaluate a K-Nearest Neighbors Regression model
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred_knn))
print('KNN RMSE on val data:', rmse)

KNN RMSE on val data: 29.878087628237203
CPU times: user 28.6 s, sys: 9.95 s, total: 38.5 s
Wall time: 24 s


In [12]:
# Convert the data into LightGBM format
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Set the hyperparameters for the LightGBM model
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

lgbm_model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[val_data],
    early_stopping_rounds=50,
    verbose_eval=100
)

y_pred_lgbm = lgbm_model.predict(X_val, num_iteration=lgbm_model.best_iteration)

rmse_lgbm = np.sqrt(mean_squared_error(y_val, y_pred_lgbm))
print('LightGBM RMSE on val data:', rmse_lgbm)



You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.407449
[200]	valid_0's rmse: 0.11742
[300]	valid_0's rmse: 0.0855978
[400]	valid_0's rmse: 0.0765465
[500]	valid_0's rmse: 0.0726728
[600]	valid_0's rmse: 0.0705897
[700]	valid_0's rmse: 0.069589
[800]	valid_0's rmse: 0.06782
[900]	valid_0's rmse: 0.0669251
[1000]	valid_0's rmse: 0.0662841
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.0662841
LightGBM RMSE on val data: 0.06628414576913776


Найкраще себе показують lightgbm і RandomForestRegressor тому попробую потюнити їх гіперпараметри

In [13]:
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': trial.suggest_int('num_leaves', 25, 35),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.99, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.96, 0.98),
        'bagging_freq': trial.suggest_int('bagging_freq', 2, 5),
        'verbose': -1
    }

    lgbm_model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    y_pred = lgbm_model.predict(X_val, num_iteration=lgbm_model.best_iteration)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    return rmse


# Use Optuna to optimize the hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Train the LightGBM model on the optimized hyperparameters
params = study.best_params
params['objective'] = 'regression'
params['metric'] = 'rmse'
lgbm_model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[val_data],
    early_stopping_rounds=50,
    verbose_eval=100
)

y_pred_lgbm = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)
rmse_lgbm = np.sqrt(mean_squared_error(y_test, y_pred_lgbm))
print('Optuna LightGBM RMSE on test data:', rmse_lgbm)

[32m[I 2023-03-08 20:55:57,405][0m A new study created in memory with name: no-name-d67e9f2d-b677-43cd-a67f-c159f7fdabfe[0m
[32m[I 2023-03-08 20:56:23,191][0m Trial 0 finished with value: 0.01018897925828731 and parameters: {'num_leaves': 25, 'learning_rate': 0.014507911197283497, 'feature_fraction': 0.9949359934577625, 'bagging_fraction': 0.9756681726405361, 'bagging_freq': 2}. Best is trial 0 with value: 0.01018897925828731.[0m
[32m[I 2023-03-08 20:56:45,083][0m Trial 1 finished with value: 0.012911385414901794 and parameters: {'num_leaves': 35, 'learning_rate': 0.09811474709987661, 'feature_fraction': 0.9962058601148728, 'bagging_fraction': 0.9709958048160401, 'bagging_freq': 4}. Best is trial 0 with value: 0.01018897925828731.[0m
[32m[I 2023-03-08 20:57:04,636][0m Trial 2 finished with value: 0.010512423082444284 and parameters: {'num_leaves': 30, 'learning_rate': 0.05109208522663365, 'feature_fraction': 0.9957457217431407, 'bagging_fraction': 0.9633439427022815, 'baggin

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13192
[LightGBM] [Info] Number of data points in the train set: 64800, number of used features: 53
[LightGBM] [Info] Start training from score 50.124621
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 3.61838
[200]	valid_0's rmse: 0.472682
[300]	valid_0's rmse: 0.0705863
[400]	valid_0's rmse: 0.0168694
[500]	valid_0's rmse: 0.0108087
[600]	valid_0's rmse: 0.00957062
[700]	valid_0's rmse: 0.00907328
[800]	valid_0's rmse: 0.00892683
[900]	valid_0's rmse: 0.00881567
[1000]	valid_0's rmse: 0.00876344
Did not meet early stopping. Best iteration is:
[996]	valid_0's rmse: 0.00876314
Optuna LightGBM RMSE on test data: 0.008847919409873142


In [14]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 90, 110),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 4),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 2),
        'random_state': 42,
        'n_jobs': -1
    }

    rfr_model = RandomForestRegressor(**params)
    rfr_model.fit(X_train, y_train)
    y_pred = rfr_model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    return rmse

# Use Optuna to optimize the hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Train the Random Forest Regressor model on the optimized hyperparameters
params = study.best_params
params['random_state'] = 42
params['n_jobs'] = -1
rfr_model = RandomForestRegressor(**params)
rfr_model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_rfr = rfr_model.predict(X_test)

# Calculate the RMSE on the validation set
rmse_rfr = np.mean(mean_squared_error(y_test, y_pred_rfr))
print('Optuna Random Forest Regressor RMSE on test data:', rmse_rfr)

[32m[I 2023-03-08 21:38:05,357][0m A new study created in memory with name: no-name-95fc0e49-c3aa-410a-8b7e-51cec1cddfde[0m
[32m[I 2023-03-08 21:42:47,448][0m Trial 0 finished with value: 0.004377300606527633 and parameters: {'n_estimators': 90, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.004377300606527633.[0m
[32m[I 2023-03-08 21:47:59,423][0m Trial 1 finished with value: 0.004377071416344892 and parameters: {'n_estimators': 101, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.004377071416344892.[0m
[32m[I 2023-03-08 21:52:45,225][0m Trial 2 finished with value: 0.004334586591159707 and parameters: {'n_estimators': 95, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.004334586591159707.[0m
[32m[I 2023-03-08 21:58:05,263][0m Trial 3 finished with value: 0.004313231846170207 and parameters: {'n_estimators': 106, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 3 with 

Optuna Random Forest Regressor RMSE on test data: 1.7666407172451666e-05


Отже всіх переміг RandomForestRegressor, натреную його на всіх даних із гіперпараметрами з optuna
і зроблю предсказання на сабмішин дані(тобто на дані для яких немає target), я тут ввів поняття сабмішин
дані бо використовую після підбору гіперпараметрів на валідації окремо сформовану тестову вибірку щоб уникнути
перенавчання на валідації

In [15]:
rfr_final = RandomForestRegressor(**params)
rfr_final.fit(X_train_all, y_train_all)
y_pred_final = rfr.predict(X_submission)

In [16]:
pd.Series(y_pred_final).to_csv('submission.csv', index = False, header=False)