In [1]:
import numpy as np

class RegularizedLinearRegression:
    """
    A class for performing Regularized Linear Regression.
    
    Parameters:
    ----------
    reg_lambda: float (default=0.0)
        The regularization term to avoid overfitting.
        
    Attributes:
    ----------
    weights: ndarray
        The weights learned by the model during training.
    """
    
    def __init__(self, reg_lambda=0.0):
        self.reg_lambda = reg_lambda
        self.weights = None

    def fit(self, X, y):
        """
        Train the Regularized Linear Regression model on the input data X and target variable y.
        
        Parameters:
        ----------
        X: ndarray, shape (n_samples, n_features)
            The input training data.
        y: ndarray, shape (n_samples,)
            The target variable.
            
        Returns:
        -------
        self: object
            Returns self.
        """
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        
        identity = np.identity(n_features)
        identity[0, 0] = 0  # Set the first element to zero to exclude the bias term from regularization
        
        X_transpose = np.transpose(X)
        regularization = self.reg_lambda * identity
        
        inverse = np.linalg.inv(np.dot(X_transpose, X) + regularization)
        self.weights = np.dot(np.dot(inverse, X_transpose), y)
        
        return self

    def predict(self, X):
        """
        Predict the target variable for a new set of input data X.
        
        Parameters:
        ----------
        X: ndarray, shape (n_samples, n_features)
            The input data to make predictions on.
            
        Returns:
        -------
        y_pred: ndarray, shape (n_samples,)
            The predicted target variable.
        """
        y_pred = np.dot(X, self.weights)
        return y_pred


In [32]:
import numpy as np
from numpy.linalg import inv

class RegularizedLinearRegressionEncoder:
    
    """A class for encoding categorical variables using a regularized linear regression model.
    
    Attributes:
        alpha (float, optional): The regularization strength; must be a positive float. The smaller the value, the stronger the regularization. Defaults to 0.1.
        l1_ratio (float, optional): The balance between L1 and L2 regularization. Must be between 0 and 1. The closer to 0, the more L2 regularization is applied. The closer to 1, the more L1 regularization is applied. Defaults to 0.5.
        categories_ (List[str]): A list of the categories in the data. Set after calling the `fit` method.
        encoded_columns_ (np.ndarray): A 2D numpy array with the encoded data. Set after calling the `transform` method.
        weights_ (np.ndarray): A 1D numpy array with the weights for each category. Set after calling the `fit` method.
        intercept_ (float): The intercept value for the linear regression model. Set after calling the `fit` method.
    """
    def __init__(self, alpha=0.1, l1_ratio=0.5):
        
        """Initialize the RegularizedLinearRegressionEncoder object.
        
        Args:
            alpha (float, optional): The regularization strength; must be a positive float. The smaller the value, the stronger the regularization. Defaults to 0.1.
            l1_ratio (float, optional): The balance between L1 and L2 regularization. Must be between 0 and 1. The closer to 0, the more L2 regularization is applied. The closer to 1, the more L1 regularization is applied. Defaults to 0.5.
        """
        
        
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.categories_ = None
        self.encoded_columns_ = None
        self.weights_ = None
        self.intercept_ = None
        
        
        
        
        
    def fit(self, X, y):
        """
        Fit the encoder to the input data.

        Parameters
        ----------
        X : array-like, shape (n_samples,)
            The categorical data to be encoded.
        y : array-like, shape (n_samples,)
            The target values.

        Returns
        -------
        None
        """
            
        # Get the unique categories in the input data
        self.categories_ = list(set(X))
        
        # Encode the categories as one-hot vectors
        X_encoded = self.onehot_encode(X)
        
        # Add a column of ones to X_encoded to account for the intercept term
        X_encoded = np.hstack((np.ones((X_encoded.shape[0], 1)), X_encoded))
        
        # Calculate the regularization term
        reg_term = self.alpha * self.l1_ratio * np.ones(X_encoded.shape[1])
        
        # Use ridge regression to calculate the weights
        identity = np.identity(X_encoded.shape[1])
        identity[0, 0] = 0 # Do not regularize the intercept term
        self.weights_ = np.dot(inv(np.dot(X_encoded.T, X_encoded) + np.dot(reg_term, reg_term.T)), np.dot(X_encoded.T, y))
        
        # Save the encoded column names for later use
        self.encoded_columns_ = ["x_{}_{}".format(i, c) for i, c in enumerate(self.categories_)]
        
        
        
    def transform(self, X):
        """
        Encode the input categorical data.

        Parameters
        ----------
        X : array-like, shape (n_samples,)
            The categorical data to be encoded.

        Returns
        -------
        encoded : array-like, shape (n_samples,)
            The encoded target values.
        """
            
        # Encode the categories as one-hot vectors
        X_encoded = self.onehot_encode(X)
        
        # Add a column of ones to X_encoded to account for the intercept term
        X_encoded = np.hstack((np.ones((X_encoded.shape[0], 1)), X_encoded))
        
        # Calculate the predicted target values
        return np.dot(X_encoded, self.weights_)
    
    def onehot_encode(self, X):
        """
        Encode categorical data as one-hot vectors.
        
        Parameters
        ----------
        X : array-like, shape (n_samples,)
            Input data, where n_samples is the number of samples.
        
        Returns
        -------
        X_encoded : array-like, shape (n_samples, n_features)
            Encoded input data, where n_features is the number of features.
        """
        X_encoded = np.zeros((len(X), len(self.categories_)))
        for i, x in enumerate(X):
            X_encoded[i, self.categories_.index(x)] = 1
        return X_encoded
    
    def fit_transform(self, X, y):
        """
        Fit the encoding model to the input data and return the transformed data.
        
        Parameters
        ----------
        X : array-like, shape (n_samples,)
            Input data, where n_samples is the number of samples.
        y : array-like, shape (n_samples,)
            Target values.
        
        Returns
        -------
        X_encoded : array-like, shape (n_samples, n_features)
            Encoded input data, where n_features is the number of features.
        """
        self.fit(X, y)
        return self.transform(X)


In [33]:
import numpy as np

# Generate sample data
np.random.seed(0)
X = ["A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D"] * 3
y = np.random.normal(10, 1, len(X)) + [1, 2, 3, 4] * 9

# Fit the encoder to the data
encoder = RegularizedLinearRegressionEncoder()
encoder.fit(X, y)

# Transform the data
encoded_y = encoder.transform(X)

# Print the original data and the encoded data
print("Original data:")
print(X)
print("\nEncoded data:")
print(encoded_y)


Original data:
['A', 'B', 'C', 'D', 'A', 'B', 'C', 'D', 'A', 'B', 'C', 'D', 'A', 'B', 'C', 'D', 'A', 'B', 'C', 'D', 'A', 'B', 'C', 'D', 'A', 'B', 'C', 'D', 'A', 'B', 'C', 'D', 'A', 'B', 'C', 'D']

Encoded data:
[11.6828074  11.82642331 13.39411454 14.29206119 11.6828074  11.82642331
 13.39411454 14.29206119 11.6828074  11.82642331 13.39411454 14.29206119
 11.6828074  11.82642331 13.39411454 14.29206119 11.6828074  11.82642331
 13.39411454 14.29206119 11.6828074  11.82642331 13.39411454 14.29206119
 11.6828074  11.82642331 13.39411454 14.29206119 11.6828074  11.82642331
 13.39411454 14.29206119 11.6828074  11.82642331 13.39411454 14.29206119]
