In this document I collect the different kinds of transformations that I have coded up as functions for easy access and reusability.

In [None]:
import pandas as pd
import sklearn

## Imputing Data
- Simple Imputer

In [None]:
# Simple Imputer
from sklearn.impute import SimpleImputer

def fit_imputer(data, strategy='median'):
    '''
    Fit a SimpleImputer to the data.
    parameters:
        data: pd.DataFrame
            The data to fit the imputer on.
        strategy: str
            Imputation strategy forwarded to SimpleImputer.
            Defaults to 'median', the value that used to be hard-coded.
    return: 
        SimpleImputer
            The fitted imputer. The learned fill values are available
            afterwards through its `statistics_` attribute.
    '''
    imputer = SimpleImputer(strategy=strategy)
    imputer.fit(data)
    return imputer
def transform_imputer(imputer, data):
    '''
    Apply an already fitted imputer to the data.
    parameters:
        imputer: SimpleImputer
            The fitted imputer.
        data: pd.DataFrame
            The data to transform.
    return:
        Whatever `imputer.transform` returns, passed through unchanged.
        NOTE(review): with scikit-learn's default output configuration this
        is presumably a NumPy array rather than a DataFrame - confirm.
    '''
    imputed = imputer.transform(data)
    return imputed

## Encoding Categorical Data
- Ordinal Encoding
- One Hot Encoding

In [None]:
# Ordinal Encoding of Categorical data


from sklearn.preprocessing import OrdinalEncoder

def train_ordinal_encoder(data):
    '''
    Fit a fresh OrdinalEncoder on the data and encode it in the same step.
    ---
    parameters:
        data:
            training data holding the categorical features
    return:
        (encoder, encoded)
            the fitted encoder and the output of its `fit_transform`
    '''
    ordinal_encoder = OrdinalEncoder()
    return ordinal_encoder, ordinal_encoder.fit_transform(data)
    
def transform_ordinal_encoder(data, encoder):
    '''
    Encode the data with an already fitted encoder.
    ---
    parameters:
        data:
            data to be encoded
        encoder:
            fitted encoder exposing a `transform` method
    return:
        the output of `encoder.transform(data)`
    '''
    encoded = encoder.transform(data)
    return encoded

In [None]:
# One hot encoding of Categorical data


from sklearn.preprocessing import OneHotEncoder

def train_1hot_encoder(data):
    '''
        Fit a OneHotEncoder on the training data and return it together with
        the encoded training data.
        The resulting encoder will encode new data based on the categories of the training data
        ---
        data: DataFrame 
            training DataFrame
        return: 
            encoder
                fitted encoder
            encoded
                DataFrame of encoded data, one column per (feature, category)
                pair, labelled with the encoder's output feature names
    '''
    encoder = OneHotEncoder()
    sparse_matrix = encoder.fit_transform(data)
    # `categories_` is a list with one array per *input* feature, so it cannot
    # label the output columns; `get_feature_names_out()` yields exactly one
    # name per encoded column.
    feature_names = encoder.get_feature_names_out()
    df_encoded = pd.DataFrame(sparse_matrix.toarray(), columns=feature_names)
    return encoder, df_encoded

def fit_1hot_encoder(encoder, data):
    '''
        Encode new data with a pre-trained encoder.
        Despite the name, nothing is re-fitted here: only `encoder.transform`
        is called.
        ---
        parameters:
            encoder: OneHotEncoder
                fitted encoder
            data: DataFrame 
                DataFrame to be encoded
        return: 
            df_encoded
                DataFrame of encoded data, one column per (feature, category)
                pair, labelled with the encoder's output feature names
    '''
    transformed = encoder.transform(data)
    # Sparse results are densified; an encoder configured for dense output
    # already returns an array and is used as is.
    encoded = transformed.toarray() if hasattr(transformed, 'toarray') else transformed
    # `categories_` is a list with one array per *input* feature, so it cannot
    # label the output columns; `get_feature_names_out()` yields exactly one
    # name per encoded column.
    feature_names = encoder.get_feature_names_out()
    df_encoded = pd.DataFrame(encoded, columns=feature_names)
    return df_encoded

## Scaling
- min-max scaling
- standard scaler
- z score scaling

In [None]:
# minmax scaler


from sklearn.preprocessing import MinMaxScaler

def train_minmax_scaler(data, min=0, max=1, clip=False):
    '''
    Train a minmax scaler on the data and return it with the scaled data.
    ---
    parameters:
        data:
            training data (DataFrame or array-like)
        min, max:
            bounds of the target feature range; the names shadow the builtins
            but are kept because callers may pass them by keyword
        clip: bool
            forwarded to MinMaxScaler
    return:
        scaler
            fitted MinMaxScaler
        df_scaled
            DataFrame of scaled data with a fresh default index
    '''
    scaler = MinMaxScaler(feature_range=(min, max), clip=clip)
    scaled = scaler.fit_transform(data)
    # `feature_names_in_` only exists when the scaler was fitted on input with
    # all-string column names; fall back to the input's own columns, and
    # finally to pandas' default integer labels.
    columns = getattr(scaler, 'feature_names_in_', None)
    if columns is None:
        columns = getattr(data, 'columns', None)
    df_scaled = pd.DataFrame(scaled, columns=columns)
    return scaler, df_scaled

def transform_minmax_scaler(data, scaler):
    '''
    Transform the data using the fitted scaler.
    ---
    parameters:
        data:
            data to scale (DataFrame or array-like)
        scaler:
            fitted scaler exposing `transform`
    return:
        df_scaled
            DataFrame of scaled data with a fresh default index
    '''
    scaled = scaler.transform(data)
    # `feature_names_in_` only exists when the scaler was fitted on input with
    # all-string column names; fall back to the input's own columns, and
    # finally to pandas' default integer labels.
    columns = getattr(scaler, 'feature_names_in_', None)
    if columns is None:
        columns = getattr(data, 'columns', None)
    df_scaled = pd.DataFrame(scaled, columns=columns)
    return df_scaled


In [None]:
# standard scaler

from sklearn.preprocessing import StandardScaler

def train_standard_scaler(data):
    '''
    Train a standard scaler on the data and return it with the scaled data.
    ---
    parameters:
        data:
            training data (DataFrame or array-like)
    return:
        scaler
            fitted StandardScaler
        df_scaled
            DataFrame of scaled data with a fresh default index
    '''
    scaler = StandardScaler()
    scaled = scaler.fit_transform(data)
    # `feature_names_in_` only exists when the scaler was fitted on input with
    # all-string column names; fall back to the input's own columns, and
    # finally to pandas' default integer labels.
    columns = getattr(scaler, 'feature_names_in_', None)
    if columns is None:
        columns = getattr(data, 'columns', None)
    df_scaled = pd.DataFrame(scaled, columns=columns)
    return scaler, df_scaled
 
def transform_standard_scaler(data, scaler):
    '''
    Transform the data using the fitted scaler.
    ---
    parameters:
        data:
            data to scale (DataFrame or array-like)
        scaler:
            fitted scaler exposing `transform`
    return:
        df_scaled
            DataFrame of scaled data with a fresh default index
    '''
    scaled = scaler.transform(data)
    # `feature_names_in_` only exists when the scaler was fitted on input with
    # all-string column names; fall back to the input's own columns, and
    # finally to pandas' default integer labels.
    columns = getattr(scaler, 'feature_names_in_', None)
    if columns is None:
        columns = getattr(data, 'columns', None)
    df_scaled = pd.DataFrame(scaled, columns=columns)
    return df_scaled


In [None]:
# Z-score standardization implementation
import numpy as np

def z_score_standardization(col):
    '''
    Express every value of a column as its distance from the column mean,
    measured in standard deviations (z-score).
    ---
    parameters:
        col:
            numeric column (e.g. pd.Series or np.ndarray)
    return:
        (col - mean) / std, with mean and std taken from np.mean and np.std.
        No guard is applied: a constant column has zero standard deviation,
        so the division is by zero.
    '''
    centre = np.mean(col)
    spread = np.std(col)  # a standard deviation, not a variance
    centred = col - centre
    return centred / spread

## Transformations
- log transform

In [None]:
# Transformation
from sklearn.preprocessing import FunctionTransformer

def l1p(data):
    '''
    Log1p transformation of the data, i.e. log(1 + x) element-wise.
    This transformation can be more beneficial than the standard log transformation 
    as it handles zero values.
    ---
    params: data: pd.Series, pd.DataFrame or np.ndarray of non-negative numbers
    return: np.log1p(data), same container type as the input
    raises: ValueError if any value is negative
    '''
    # The previous check was inverted (it asserted that the minimum *is*
    # negative) and relied on `assert`, which is stripped under `python -O`.
    # An element-wise comparison also works for DataFrames, where
    # `data.min()` yields one value per column, and NaNs never count as
    # negative.
    if (np.asarray(data) < 0).any():
        raise ValueError('data contains negative values at log1p transform')
    return np.log1p(data)

# Ready-made transformer wrapping `l1p`, so the log1p step can be used where a
# scikit-learn transformer object is expected (e.g. inside a pipeline).
# NOTE(review): validate=False presumably hands the input to `l1p` without
# scikit-learn's input validation - confirm against the docs.
log1p_transformer = FunctionTransformer(func=l1p, validate=False)


In [None]:
# clustering and plotting similarities in longitudes and latitudes


from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    '''
    Transformer that clusters the training samples with KMeans and then
    describes each sample by its RBF-kernel similarity to every cluster centre.
    ---
    parameters:
        n_clusters: int
            number of KMeans clusters (and of output features)
        gamma: float
            gamma passed to `rbf_kernel`
        random_state:
            random state passed to KMeans
    '''

    def __init__(self, n_clusters=10, gamma=0.1, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        '''Fit KMeans on X (optionally weighted); `y` is ignored. Returns self.'''
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        '''Return the RBF similarity of every row of X to each fitted cluster centre.'''
        centres = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centres, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        '''Return one label per cluster; `names` is accepted but not used.'''
        labels = []
        for idx in range(self.n_clusters):
            labels.append(f'Cluster {idx} similarity')
        return labels
    
# usage: either by explicit calls or in pipeline
#   cluster_similarity = ClusterSimilarity(n_clusters=10, gamma=0.1, random_state=42)
#   X = df_train_num[['latitude', 'longitude']]
#   weight = df_train_num['median_house_value']
#   cluster_similarity.fit(X, sample_weight=weight)  # fit returns the same instance; do not rebind the class name
#   similarities = cluster_similarity.transform(X)
#   centroids = cluster_similarity.kmeans_.cluster_centers_

# corresponding graph
from matplotlib import pyplot as plt
def plot_similarities(data, similarities, centroids):
    '''
    Scatter-plot the samples coloured by their strongest cluster similarity
    and mark the cluster centres with black crosses.
    ---
    parameters:
        data:
            2-column array-like or DataFrame of coordinates.
            NOTE(review): assumes column 0 is latitude and column 1 is
            longitude, matching how the centroids are plotted - confirm.
        similarities:
            2-D array-like, one row per sample and one column per cluster
        centroids:
            2-D array-like of cluster centres, same column order as `data`
    return:
        None, the figure is shown with plt.show()
    '''
    # `data[:0]` / `data[:1]` sliced rows (nothing / the first row) instead of
    # selecting columns. Convert to arrays so column indexing also works for
    # DataFrames.
    points = np.asarray(data)
    centers = np.asarray(centroids)
    strongest = np.max(np.asarray(similarities), axis=1)
    # x = column 1, y = column 0, consistent with the centroid scatter and
    # the axis labels below.
    plt.scatter(points[:, 1], points[:, 0], c=strongest, cmap='jet', s=20)
    plt.scatter(centers[:, 1], centers[:, 0], c='black', marker='x', s=200)
    plt.title('Cluster Centers and Clusters')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.show()