In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [40]:
# Create a DataFrame
np.random.seed(0)
n = 1000  # Number of samples
df = pd.DataFrame({
    'num1': np.random.randn(n),
    'num2': np.random.randn(n),
    'num3': np.random.randn(n),
    'num4': np.random.randn(n),
    'num5': np.random.randn(n),
    'cat1': np.random.choice(['A', 'B', 'C'], n),
    'cat2': np.random.choice(['D', 'E', 'F'], n),
    'cat3': np.random.choice(['G', 'H', 'I'], n),
    'cat4': np.random.choice(['J', 'K', 'L'], n),
    'cat5': np.random.choice(['M', 'N', 'O'], n),
    'target': np.random.randn(n)
})

In [41]:
# Split the DataFrame into training and testing sets
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
# Function to perform arithmetic operations
def arithmetic_operations(X):
    X_transformed = pd.DataFrame(X, columns=['num1', 'num2', 'num3', 'num4', 'num5'])
    X_transformed['num1_times_num2'] = X_transformed['num1'] * X_transformed['num2']
    return X_transformed.values

In [43]:
# Splitting the data
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [44]:
# Create pipelines
num_pipeline = Pipeline([
    ('arith', FunctionTransformer(arithmetic_operations, validate=False)),
    ('scaler', MinMaxScaler()),
])

cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [45]:
# Combine pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, ['num1', 'num2', 'num3', 'num4', 'num5']),
        ('cat', cat_pipeline, ['cat1', 'cat2', 'cat3', 'cat4', 'cat5']),
    ])

# Full pipeline
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_regressor', XGBRegressor(eval_metric='rmse')),
])

In [46]:
# Grid search setup
param_grid = {
    'xgb_regressor__n_estimators': [50, 100, 150],
    'xgb_regressor__learning_rate': [0.01, 0.1, 0.2],
}

In [47]:
grid_search = GridSearchCV(full_pipeline, param_grid, cv=KFold(n_splits=5), scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [48]:
# Get the best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

In [51]:
# Evaluate the model
mse = round(mean_squared_error(y_test, y_pred),3)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.886


> LGBM

In [14]:
# Use ColumnTransformer to apply the pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, ['num1', 'num2', 'num3', 'num4', 'num5']),
        ('cat', cat_pipeline, ['cat1', 'cat2', 'cat3', 'cat4', 'cat5'])
    ])

# Create a full pipeline with a LightGBM regressor
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lgbm_regressor', LGBMRegressor())
])

# Hyperparameter grid for LightGBM
param_grid = {
    'lgbm_regressor__n_estimators': [50, 100, 150],
    'lgbm_regressor__learning_rate': [0.01, 0.1, 0.2]
}

# Perform k-fold cross-validation and grid search
grid_search = GridSearchCV(full_pipeline, param_grid, cv=KFold(n_splits=5), scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000540 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1314
[LightGBM] [Info] Number of data points in the train set: 640, number of used features: 21
[LightGBM] [Info] Start training from score -0.003408
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1314
[LightGBM] [Info] Number of data points in the train set: 640, number of used features: 21
[LightGBM] [Info] Start training from score -0.024333
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000233 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1314
[LightGBM] [Info] Number of data points in the train set: 