In [None]:
"""
# Google Colab requirements..
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/pipe/mtuci-itprog-pipe
"""

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [12]:
df = pd.read_csv("Laptop_price.csv")

y = df[["Price"]]
X = df.drop("Price", axis=1)

splitted = train_test_split(X, y, test_size=0.33)
X_train, X_test, y_train, y_test = splitted

In [23]:
num_cols = list(X_train.select_dtypes(exclude=["object"]).columns)
cat_cols = list(X_train.select_dtypes(include=["object"]).columns)
print("Количественные колонки:", num_cols)
print("Категориальные колонки:", cat_cols)

Количественные колонки: ['Processor_Speed', 'RAM_Size', 'Storage_Capacity', 'Screen_Size', 'Weight']
Категориальные колонки: ['Brand']


In [27]:
numerical = Pipeline(steps=[
    ("simple_imputer", SimpleImputer()),  # default strategy="mean"
    ("power_transform", PowerTransformer()),
    ("Scaler", StandardScaler()),
])
categorcial = Pipeline(steps=[
    ("simple_imputer", SimpleImputer(strategy="most_frequent")),
    ("OneHotEncoder", OneHotEncoder(handle_unknown="ignore",
                                    sparse_output=False)),
])

In [28]:
ct = ColumnTransformer([
    ("numerical", numerical, num_cols),
    ("categorical", categorcial, cat_cols),
])
pipe = Pipeline(steps=[
    ("ct", ct),
    ("XGBRegressor", XGBRegressor()),
])

In [41]:
param_grid = {
    "XGBRegressor__n_estimators": [5, 10, 50],
    "XGBRegressor__learning_rate": [0, 1],
    "XGBRegressor__max_depth": [5],
    "XGBRegressor__gamma": [10.0, 5.0, 0.1, 0.01],
    "XGBRegressor__n_jobs": [1, 2],  # number of parallel threads used to xgboost
    "ct__numerical": [StandardScaler(), RobustScaler()],
}
clf = GridSearchCV(pipe, param_grid, verbose=0)
clf.fit(X_train, y_train)

clf.score(X_test, y_test)

print("\tBest estimator", clf.best_estimator_, sep="\n")
print("\tBest parameters", clf.best_params_, sep="\n")

	Best estimator
Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('numerical', StandardScaler(),
                                                  ['Processor_Speed',
                                                   'RAM_Size',
                                                   'Storage_Capacity',
                                                   'Screen_Size', 'Weight']),
                                                 ('categorical',
                                                  Pipeline(steps=[('simple_imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('OneHotEncoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                           