#  DT Regressor - One Block Code with Pipline

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib

import warnings # for ignoring unnecessary warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

# ------------------------ #

# 1) Load the dataset 
df = pd.read_excel('sample_data.xlsx') 
# Assume the target variable in the Excel file is named "Target"
# Other variable names are not important here.

# 2) Separate features and target variable from the dataset
X = df.drop(columns=["Target"])
y = df["Target"]

# Separate numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# 3) Create transformers for data preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)) ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# 4) Create a pipeline with a Decision Tree Regressor
base_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# 5) Perform a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6) Train and evaluate the Base Model
base_pipeline.fit(X_train, y_train)

# 7) Predictions and evaluation metrics
print("EVAL METRICS FOR BASE MODEL")
y_train_pred = base_pipeline.predict(X_train)
y_test_pred = base_pipeline.predict(X_test)

def evaluate_model(y_true, y_pred, data_type="Train"):
    print(f"Evaluation metrics for {data_type} data:")
    print(f"Mean Squared Error: {mean_squared_error(y_true, y_pred)}")
    print(f"R2 Score: {r2_score(y_true, y_pred)}")
    print("\n")

evaluate_model(y_train, y_train_pred, "Train")
evaluate_model(y_test, y_test_pred, "Test")

# 8) Hyperparameter Tuning
param_grid = {
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['sqrt', 'log2', None]
}

grid_search = GridSearchCV(base_pipeline, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")

# 9) Build and evaluate the Best Model
print("")
print("----EVAL METRICS FOR BEST MODEL----")
print("")

best_pipeline = grid_search.best_estimator_

y_train_best_pred = best_pipeline.predict(X_train)
y_test_best_pred = best_pipeline.predict(X_test)

evaluate_model(y_train_best_pred, y_train, "Train (Best Model)")
evaluate_model(y_test_best_pred, y_test, "Test (Best Model)")

# Compare metrics
metrics_comparison = pd.DataFrame({
    'Metric': ['Mean Squared Error', 'R2 Score'],
    'BaseModlTrain': [mean_squared_error(y_train, y_train_pred), r2_score(y_train, y_train_pred)],
    'BaseModlTest': [mean_squared_error(y_test, y_test_pred), r2_score(y_test, y_test_pred)],
    'BestModTrain': [mean_squared_error(y_train, y_train_best_pred), r2_score(y_train, y_train_best_pred)],
    'BestModlTest': [mean_squared_error(y_test, y_test_best_pred), r2_score(y_test, y_test_best_pred)]
})

print(metrics_comparison)

# 10) Train the Final Model on the entire dataset
final_model = best_pipeline.fit(X, y)

# 11) Save the Final Model
joblib.dump(final_model, 'final_DT_regression_model.pkl')

print(" ")
print("Final model saved as 'final_DT_regression_model.pkl'")


EVAL METRICS FOR BASE MODEL
Evaluation metrics for Train data:
Mean Squared Error: 0.004380208333333334
R2 Score: 0.9998343155835676


Evaluation metrics for Test data:
Mean Squared Error: 2.0806729508196726
R2 Score: 0.9096757041703469


Best Hyperparameters: {'regressor__max_depth': 10, 'regressor__max_features': None, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2}

----EVAL METRICS FOR BEST MODEL----

Evaluation metrics for Train (Best Model) data:
Mean Squared Error: 0.032808939133986935
R2 Score: 0.9987574370886227


Evaluation metrics for Test (Best Model) data:
Mean Squared Error: 2.093356527607604
R2 Score: 0.8882106122943842


               Metric  BaseModlTrain  BaseModlTest  BestModTrain  BestModlTest
0  Mean Squared Error       0.004380      2.080673      0.032809      2.093357
1            R2 Score       0.999834      0.909676      0.998759      0.909125
 
Final model saved as 'final_DT_regression_model.pkl'


In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib

import warnings  # for ignoring unnecessary warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

# ------------------------ #

# 1) Load the dataset 
df = pd.read_excel('sample_data.xlsx') 
# Assume the target variable in the Excel file is named "Target"
# Other variable names are not important here.

# 2) Separate features and target variable from the dataset
X = df.drop(columns=["Target"])
y = df["Target"]

# Separate numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# 3) Create transformers for data preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# 4) Create a pipeline with a KNeighbors Regressor
base_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor())
])

# 5) Perform a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6) Train and evaluate the Base Model
base_pipeline.fit(X_train, y_train)

# 7) Predictions and evaluation metrics
print("EVAL METRICS FOR BASE MODEL")
y_train_pred = base_pipeline.predict(X_train)
y_test_pred = base_pipeline.predict(X_test)

def evaluate_model(y_true, y_pred, data_type="Train"):
    print(f"Evaluation metrics for {data_type} data:")
    print(f"Mean Squared Error: {mean_squared_error(y_true, y_pred)}")
    print(f"R2 Score: {r2_score(y_true, y_pred)}")
    print("\n")

evaluate_model(y_train, y_train_pred, "Train")
evaluate_model(y_test, y_test_pred, "Test")

# 8) Hyperparameter Tuning
param_grid = {
    'regressor__n_neighbors': [3, 5, 7, 9],
    'regressor__weights': ['uniform', 'distance'],
    'regressor__metric': ['euclidean', 'manhattan']
}

grid_search = GridSearchCV(base_pipeline, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")

# 9) Build and evaluate the Best Model
print("")
print("----EVAL METRICS FOR BEST MODEL----")
print("")

best_pipeline = grid_search.best_estimator_

y_train_best_pred = best_pipeline.predict(X_train)
y_test_best_pred = best_pipeline.predict(X_test)

evaluate_model(y_train_best_pred, y_train, "Train (Best Model)")
evaluate_model(y_test_best_pred, y_test, "Test (Best Model)")

# Compare metrics
metrics_comparison = pd.DataFrame({
    'Metric': ['Mean Squared Error', 'R2 Score'],
    'BaseModlTrain': [mean_squared_error(y_train, y_train_pred), r2_score(y_train, y_train_pred)],
    'BaseModlTest': [mean_squared_error(y_test, y_test_pred), r2_score(y_test, y_test_pred)],
    'BestModTrain': [mean_squared_error(y_train, y_train_best_pred), r2_score(y_train, y_train_best_pred)],
    'BestModlTest': [mean_squared_error(y_test, y_test_best_pred), r2_score(y_test, y_test_best_pred)]
})

print(metrics_comparison)

# 10) Train the Final Model on the entire dataset
final_model = best_pipeline.fit(X, y)

# 11) Save the Final Model
joblib.dump(final_model, 'final_knn_regression_model.pkl')

print(" ")
print("Final model saved as 'final_knn_regression_model.pkl'")


EVAL METRICS FOR BASE MODEL
Evaluation metrics for Train data:
Mean Squared Error: 19.409700533333332
R2 Score: 0.26581462312646265


Evaluation metrics for Test data:
Mean Squared Error: 23.55598183606557
R2 Score: -0.022591018487807313


Best Hyperparameters: {'regressor__metric': 'manhattan', 'regressor__n_neighbors': 9, 'regressor__weights': 'distance'}

----EVAL METRICS FOR BEST MODEL----

Evaluation metrics for Train (Best Model) data:
Mean Squared Error: 0.004380208333333334
R2 Score: 0.9998342881276927


Evaluation metrics for Test (Best Model) data:
Mean Squared Error: 25.454692442613613
R2 Score: -1.5479686471776968


               Metric  BaseModlTrain  BaseModlTest  BestModTrain  BestModlTest
0  Mean Squared Error      19.409701     23.555982      0.004380     25.454692
1            R2 Score       0.265815     -0.022591      0.999834     -0.105016
 
Final model saved as 'final_knn_regression_model.pkl'
