In [1]:
import pandas
import numpy
import sklearn

from dataclasses import dataclass
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from typing import Protocol, Callable
from enum import Enum

In [2]:
double_dim_converter: Callable[[numpy.ndarray], numpy.ndarray] = lambda x: x.reshape(-1, 1) if x.ndim == 1 else x

In [3]:
@dataclass
class Metrics:
    "class that represents each model test error metrics"
    mse: float
    mae: float
    r2_score: float

class Model(Protocol):

    def fit(self, x: numpy.ndarray, y: numpy.ndarray) -> None:
        "represents fitting the model"
        raise NotImplementedError

    def transform(self, x: numpy.ndarray) -> numpy.ndarray:
        "represents the transform function"
        raise NotImplementedError

    def fit_transform(self, x: numpy.ndarray, y: numpy.ndarray) -> numpy.ndarray:
        "represents the fit and transform function"
        raise NotImplementedError
        

class ModelTypes(Enum):
    LINEAR = "Linear"
    POLYNOMIAL = "Polynomial"
    RANDOM_FORREST = "Random Forrest"
    RIDGE = "Ridge"
    LASSO = "Lasso"

class DataTransformer(Protocol):
    "Basic representation of the dataset normalizer"

    def fit(self, data: numpy.ndarray) -> None:
        "fit the transformer model"
        raise NotImplementedError

    def fit_transform(self, data: numpy.ndarray) -> numpy.ndarray:
        "fit the transformer model and return the normalized dataset"
        raise NotImplementedError

    def transform(self, data: numpy.ndarray) -> numpy.ndarray:
        "transforms the input data"
        raise NotImplementedError
    
    def inv_transform(self, data: numpy.ndarray) -> numpy.ndarray:
        "inversely transforms the data"
        raise NotImplementedError

class STDScaler:

    def __init__(self) -> None:
        self._transformer: sklearn.preprocessing._data.StandardScaler = StandardScaler()

    def fit(self, data: numpy.ndarray) -> None:
        self._transformer.fit(data)

    def fit_transform(self, data: numpy.ndarray) -> numpy.ndarray:
        return self._transformer.fit_transform(data)

    def transform(self, data: numpy.ndarray) -> numpy.ndarray:
        return self._transformer.transform(data)
    
    def inv_transform(self, data: numpy.ndarray) -> numpy.ndarray:
        return self._transformer.inverse_transform(data)

class ModelPipeline(Protocol):
    "represents an end to end pipeline of a model"

    def build_fit(self, feature: numpy.ndarray, target: numpy.ndarray, feature_scaler: DataTransformer, target_scaler: DataTransformer) -> None:
        "build and fits the pipeline"
        raise NotImplementedError
    
    def forward(self, feature: numpy.ndarray) -> numpy.ndarray:
        "calculates the forward path"
        raise NotImplementedError

class LinearModel:

    def __init__(self, hparams: dict[str, int|float]=dict()) -> None:
        self._model: Model = LinearRegression(**hparams)
        self._pipeline: Callable[[numpy.ndarray], numpy.ndarray] 
    
    def build_fit(self, feature: numpy.ndarray, target: numpy.ndarray, feature_scaler: DataTransformer, target_scaler: DataTransformer) -> None:
        self._model.fit(feature_scaler.transform(feature), target_scaler.transform(target))
        self._pipeline = lambda x: target_scaler.inv_transform(double_dim_converter(self._model.predict(feature_scaler.transform(x))))
    
    def forward(self, feature: numpy.ndarray) -> numpy.ndarray:
        return self._pipeline(feature)

class PolynomialModel:

    def __init__(self, hparams: dict[str, int|float]=dict()) -> None:
        self._model: Model = LinearRegression()
        self._pipeline: Callable[[numpy.ndarray], numpy.ndarray] 
        self._poly_creator = PolynomialFeatures(**hparams)
    
    def build_fit(self, feature: numpy.ndarray, target: numpy.ndarray, feature_scaler: DataTransformer, target_scaler: DataTransformer) -> None:
        poly_train_features = self._poly_creator.fit_transform(feature)
        poly_feature_scaler = STDScaler()

        poly_feature_scaler.fit(poly_train_features)
        self._model.fit(poly_feature_scaler.transform(poly_train_features), target_scaler.transform(target))
        self._pipeline = lambda x: target_scaler.inv_transform(double_dim_converter(self._model.predict(poly_feature_scaler.transform(self._poly_creator.transform(x)))))
    
    def forward(self, feature: numpy.ndarray) -> numpy.ndarray:
        return self._pipeline(feature)

class RandomForrestModel:

    def __init__(self, hparams: dict[str, int|float]=dict()) -> None:
        self._model: Model = RandomForestRegressor(**hparams)
        self._pipeline: Callable[[numpy.ndarray], numpy.ndarray] 
    
    def build_fit(self, feature: numpy.ndarray, target: numpy.ndarray, feature_scaler: DataTransformer, target_scaler: DataTransformer) -> None:
        self._model.fit(feature_scaler.transform(feature), target_scaler.transform(target))
        self._pipeline = lambda x: target_scaler.inv_transform(double_dim_converter(self._model.predict(feature_scaler.transform(x))))
    
    def forward(self, feature: numpy.ndarray) -> numpy.ndarray:
        return self._pipeline(feature)

class RidgeModel:

    def __init__(self, hparams: dict[str, int|float]=dict()) -> None:
        self._model: Model = Ridge(**hparams)
        self._pipeline: Callable[[numpy.ndarray], numpy.ndarray] 
    
    def build_fit(self, feature: numpy.ndarray, target: numpy.ndarray, feature_scaler: DataTransformer, target_scaler: DataTransformer) -> None:
        self._model.fit(feature_scaler.transform(feature), target_scaler.transform(target))
        self._pipeline = lambda x: target_scaler.inv_transform(double_dim_converter(self._model.predict(feature_scaler.transform(x))))
    
    def forward(self, feature: numpy.ndarray) -> numpy.ndarray:
        return self._pipeline(feature)

class LassoModel:

    def __init__(self, hparams: dict[str, int|float]=dict()) -> None:
        self._model: Model = Lasso(**hparams)
        self._pipeline: Callable[[numpy.ndarray], numpy.ndarray] 
    
    def build_fit(self, feature: numpy.ndarray, target: numpy.ndarray, feature_scaler: DataTransformer, target_scaler: DataTransformer) -> None:
        self._model.fit(feature_scaler.transform(feature), target_scaler.transform(target))
        self._pipeline = lambda x: target_scaler.inv_transform(double_dim_converter(self._model.predict(feature_scaler.transform(x))))
    
    def forward(self, feature: numpy.ndarray) -> numpy.ndarray:
        return self._pipeline(feature)

In [4]:
DS_PATH = "1.xlsx"
TARGET = "Vs"

TEST_SIZE = 0.2
SEED = 0
POLYNOMIAL_DEGREE = 2
RANDOM_FORREST_MAX_DEPTH = 5
RIDGE_ALPHA = 1
LASSO_ALPHA = 0.1

with open(DS_PATH, "rb") as afile:
    df = pandas.read_excel(afile)

In [5]:
def calculate_err_metrics(y_true: numpy.ndarray, y_pred: numpy.ndarray):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2_metric = r2_score(y_true, y_pred)
    return Metrics(mse, mae, r2_metric)

def data_train_test_split(df: pandas.core.frame.DataFrame, test_size: float, seed: int) -> tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]:

    train_df, test_df = train_test_split(df, test_size=test_size, shuffle=True, random_state=seed)
    train_features = double_dim_converter(train_df.drop(TARGET, axis=1).to_numpy())
    train_targets = double_dim_converter(train_df[TARGET].to_numpy())
    test_features = double_dim_converter(test_df.drop(TARGET, axis=1).to_numpy())
    test_targets = double_dim_converter(test_df[TARGET].to_numpy())

    return train_features, train_targets, test_features, test_targets

In [6]:
train_features, train_targets, test_features, test_targets = data_train_test_split(df, TEST_SIZE, SEED)

feature_scaler = STDScaler()
target_scaler = STDScaler()

feature_scaler.fit(train_features)
target_scaler.fit(train_targets)

model_metrics: dict[ModelTypes, Metrics] = {}

In [13]:
# linear
linear_pipeline = LinearModel()
linear_pipeline.build_fit(train_features, train_targets, feature_scaler, target_scaler)
metrics = calculate_err_metrics(test_targets, linear_pipeline.forward(test_features))
model_metrics[ModelTypes.LINEAR] = metrics
print(f"Test error metrics of {ModelTypes.LINEAR} regression model:", metrics, "\n", sep="\n")

# polynomial
polynomial_pipeline = PolynomialModel()
polynomial_pipeline.build_fit(train_features, train_targets, feature_scaler, target_scaler)
metrics = calculate_err_metrics(test_targets, polynomial_pipeline.forward(test_features))
model_metrics[ModelTypes.POLYNOMIAL] = metrics
print(f"Test error metrics of {ModelTypes.POLYNOMIAL} regression model:", metrics, "\n", sep="\n")

# random forrest
random_forrest_pipeline = RandomForrestModel()
random_forrest_pipeline.build_fit(train_features, train_targets, feature_scaler, target_scaler)
metrics = calculate_err_metrics(test_targets, random_forrest_pipeline.forward(test_features))
model_metrics[ModelTypes.RANDOM_FORREST] = metrics
print(f"Test error metrics of {ModelTypes.RANDOM_FORREST} regression model:", metrics, "\n", sep="\n")

# ridge
ridge_pipeline = RidgeModel()
ridge_pipeline.build_fit(train_features, train_targets, feature_scaler, target_scaler)
metrics = calculate_err_metrics(test_targets, ridge_pipeline.forward(test_features))
model_metrics[ModelTypes.RIDGE] = metrics
print(f"Test error metrics of {ModelTypes.RIDGE} regression model:", metrics, "\n", sep="\n")

# lasso
lasso_pipeline = LassoModel()
lasso_pipeline.build_fit(train_features, train_targets, feature_scaler, target_scaler)
metrics = calculate_err_metrics(test_targets, lasso_pipeline.forward(test_features))
model_metrics[ModelTypes.LASSO] = metrics
print(f"Test error metrics of {ModelTypes.LASSO} regression model:", metrics, "\n", sep="\n")

Test error metrics of ModelTypes.LINEAR regression model:
Metrics(mse=0.0005052634987468958, mae=0.014597020597226925, r2_score=0.9901937660350374)


Test error metrics of ModelTypes.POLYNOMIAL regression model:
Metrics(mse=3.3779620004102484e-06, mae=0.0011082465714446516, r2_score=0.9999344399787775)




  self._model.fit(feature_scaler.transform(feature), target_scaler.transform(target))


Test error metrics of ModelTypes.RANDOM_FORREST regression model:
Metrics(mse=1.1364874043831596e-05, mae=0.001341003696686165, r2_score=0.9997794287255412)


Test error metrics of ModelTypes.RIDGE regression model:
Metrics(mse=0.0005156030873131474, mae=0.014842693222569443, r2_score=0.9899930936634262)


Test error metrics of LASSO regression model:
Metrics(mse=0.05182340641895126, mae=0.17948229600945165, r2_score=-0.0057968752263852785)




In [8]:
# # linear regression
# linear_model = LinearRegression().fit(scaled_train_features, scaled_train_targets)

# linear_fwd_pipeline: Callable[[numpy.ndarray], numpy.ndarray] = lambda x: target_scaler.inv_transform(linear_model.predict(feature_scaler.transform(x)))

# metrics = calculate_err_metrics(test_targets, linear_fwd_pipeline(test_features))
# model_metrics[ModelTypes.LINEAR] = metrics
# print("Test error metrics of linear regression model:", metrics, sep="\n")

In [9]:
# # polynomial regression
# # Creating the higher degree features and interactions
# poly_creator = PolynomialFeatures(degree=POLYNOMIAL_DEGREE)
# poly_train_features = poly_creator.fit_transform(train_features)

# poly_feature_scaler = STDScaler()
# poly_target_scaler = STDScaler()

# poly_scaled_train_features = poly_feature_scaler.fit_transform(poly_train_features)
# poly_scaled_train_targets = poly_target_scaler.fit_transform(train_targets)

# polynomial_model = LinearRegression().fit(poly_scaled_train_features, poly_scaled_train_targets)

# poly_fwd_pipeline: Callable[[numpy.ndarray], numpy.ndarray] = lambda x: poly_target_scaler.inv_transform(polynomial_model.predict(poly_feature_scaler.transform(poly_creator.transform(x))))

# metrics = calculate_err_metrics(test_targets, poly_fwd_pipeline(test_features))
# model_metrics[ModelTypes.POLYNOMIAL] = metrics
# print("Test error metrics of polynomial regression model:", metrics, sep="\n")


In [10]:
# # random forrest regression
# random_forrest_model = RandomForestRegressor(max_depth=RANDOM_FORREST_MAX_DEPTH, random_state=SEED).fit(scaled_train_features, scaled_train_targets)

# r_forrest_fwd_pipeline: Callable[[numpy.ndarray], numpy.ndarray] = lambda x: target_scaler.inv_transform(double_dim_converter(random_forrest_model.predict(feature_scaler.transform(x))))

# metrics = calculate_err_metrics(test_targets, r_forrest_fwd_pipeline(test_features))
# model_metrics[ModelTypes.RANDOM_FORREST] = metrics
# print("Test error metrics of random forrest regression model:", metrics, sep="\n")

In [11]:
# # ridge regression
# ridge_model = Ridge(alpha=RIDGE_ALPHA).fit(scaled_train_features, scaled_train_targets)

# ridge_fwd_pipeline: Callable[[numpy.ndarray], numpy.ndarray] = lambda x: target_scaler.inv_transform(ridge_model.predict(feature_scaler.transform(x)))

# metrics = calculate_err_metrics(test_targets, ridge_fwd_pipeline(test_features))
# model_metrics[ModelTypes.RIDGE] = metrics
# print("Test error metrics of ridge regression model:", metrics, sep="\n")

In [12]:
# lasso regression
# lasso_model = Lasso(alpha=LASSO_ALPHA).fit(scaled_train_features, scaled_train_targets)

# lasso_fwd_pipeline: Callable[[numpy.ndarray], numpy.ndarray] = lambda x: target_scaler.inv_transform(double_dim_converter(lasso_model.predict(feature_scaler.transform(x))))

# metrics = calculate_err_metrics(test_targets, lasso_fwd_pipeline(test_features))
# model_metrics[ModelTypes.LASSO] = metrics
# print("Test error metrics of lasso regression model:", metrics, sep="\n")