In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [3]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.7.0-py3-none-any.whl.metadata (11 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-24.7.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.7.0 scikit-optimize-0.10.2


In [2]:
pip install scikit-learn



In [15]:
# Load the dataset
df = pd.read_csv('/content/TrainingSet.csv')

# Define target and features
target = 'TAVG'
X = df.drop(columns=[target])
y = df[target]

# Handling missing values using SimpleImputer
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(exclude=[np.number]).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [16]:
# Define the model
model = XGBRegressor(objective='reg:squarederror', random_state=42,tree_method='gpu_hist', predictor='gpu_predictor')

# Define hyperparameter space for Bayesian optimization
param_space = {
    'n_estimators': (100, 1000),
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.3),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'gamma': (0, 0.5),
    'reg_alpha': (0, 10),
    'reg_lambda': (0, 10)
}

In [17]:
# Define the Bayesian search
opt = BayesSearchCV(
    estimator=model,
    search_spaces=param_space,
    n_iter=50,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Create a pipeline that preprocesses data and applies the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('opt', opt)])

In [18]:

# Train the model with Bayesian optimization
pipeline.fit(X_train, y_train)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



In [19]:
# Make predictions and evaluate
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Absolute Error on test set: {mae}')

Mean Absolute Error on test set: 1.5525494784843632



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [20]:
# Load the new unseen dataset (make sure it has the same features as the training set)
new_data = pd.read_csv('/content/TestingSet.csv')

# Make predictions
predictions = pipeline.predict(new_data)


In [21]:
len(predictions)

203

In [22]:
predictions


array([ 1.9046397e+00,  7.4252682e+00,  9.9139233e+00,  1.0270355e+01,
        1.9727337e+01, -6.3935661e-01,  2.0529676e+01,  5.5298662e+00,
        8.2759018e+00,  2.1181988e+01,  8.0826635e+00,  9.1458807e+00,
        6.8780413e+00,  2.4506933e+01,  1.9900032e+01, -2.5185137e+00,
        1.8730412e+01,  2.7091372e+01,  7.1392822e-01,  4.5528164e+00,
        2.4522793e+01, -5.3180876e+00,  1.6450548e+00, -5.0626898e+00,
        1.7375690e+01, -7.5574303e+00,  1.7123137e+00,  2.5217867e+01,
        8.6674194e+00,  2.3125010e+00,  2.4708096e+01,  2.2509666e+01,
        2.0260834e+01,  2.2472752e+01,  4.1401768e-01,  2.0633717e+01,
        2.3258303e+01,  2.2709631e+01, -1.0485098e+01, -9.8411160e+00,
        2.0939611e+01, -7.0990086e+00, -7.8205738e+00,  2.1301380e+01,
        1.4674505e+01,  2.7245796e+01,  1.9043362e+01, -1.2780012e+01,
       -1.7640863e+01, -5.1162386e+00, -6.4334869e-03,  2.4012142e+01,
        2.3743061e+01, -1.2971697e+00,  4.0373764e+00,  1.9010889e+01,
      

In [23]:
import numpy as np
import pandas as pd



# Create a DataFrame
df = pd.DataFrame({
    'INDEX': np.arange(len(predictions)),  # Index column
    'TAVG': predictions                  # TAVG column
})

# Write to CSV file
df.to_csv('output102.csv', index=False)