In [None]:
#| default_exp utils.functions

# Utility functions

> More on this soon

In [None]:
#| hide
# Runs nbdev's export step from inside the notebook.
# NOTE(review): presumably this writes the `#| export` cells to the module named by `default_exp` — confirm against the nbdev docs.
import nbdev; nbdev.nbdev_export()

In [None]:
#| export
from fastcore.utils import *
import numpy as np
import pandas as pd
import gc
import numba



In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| export

def calculate_entropy(y):
    """
    Shannon entropy, in bits, of a sequence of non-negative integer labels.

    Labels are counted with `np.bincount`; classes that never occur
    contribute nothing to the sum.
    """
    counts = np.bincount(y)
    probs = counts / np.sum(counts)
    # Drop zero-probability classes so log2 is never evaluated at 0
    observed = probs[probs > 0]
    return -np.sum(observed * np.log2(observed))


def calculate_gini(y):
    """
    Gini impurity of a sequence of non-negative integer labels:
    one minus the sum of squared class proportions.
    """
    counts = np.bincount(y)
    total = np.sum(counts)
    squared_shares = ((c / total) ** 2 for c in counts)
    return 1 - sum(squared_shares)

def divide_on_feature(X, feature_i, threshold):
    """
    Split dataset X in two according to the values of a single feature.

    The split rule depends on the type of `threshold`:
      - a number (Python or NumPy scalar): feature value >= threshold
      - a list: feature value is one of the listed values
      - anything else: feature value == threshold

    `X[feature_i]` must yield the feature values and `X[mask]` must select
    rows; the list rule additionally calls `.isin` on the feature values,
    so it needs a pandas-like column.

    Returns the tuple (X_1, X_2): the rows matching the rule, then the rest.
    """
    feature = X[feature_i]
    # NumPy integer scalars (e.g. np.int64) are not `int` subclasses, so they
    # are listed explicitly to get the same `>=` rule as Python numbers.
    if isinstance(threshold, (int, float, np.integer, np.floating)):
        mask = feature >= threshold
    elif isinstance(threshold, list):
        mask = feature.isin(threshold)
    else:
        mask = feature == threshold

    return X[mask], X[~mask]
    
def get_sorted_cats(x,ascending=True):
    """
    Return the distinct values of x ordered by how often they occur:
    rarest first when `ascending` is true, most frequent first otherwise.
    """
    levels, freqs = np.unique(x, return_counts=True)
    # Negating the counts flips the argsort into descending-frequency order
    sort_key = freqs if ascending else -freqs
    return levels[np.argsort(sort_key)]


def calculate_mse(y):
    """ Returns the mean squared deviation of y from its own mean """
    # The original referenced an undefined `y_true`; the only input is y
    mse = np.mean(np.power(y - np.mean(y), 2))
    return mse


def calculate_mae(y):
    """ Returns the mean absolute deviation of y from its own mean """
    # The original referenced an undefined `y_true`; the only input is y
    mae = np.mean(np.abs(y - np.mean(y)))
    return mae

def calculate_rss(y):
    """ Returns the sum of squared deviations of y from its own mean """
    residuals = y - np.mean(y)
    return np.sum(np.square(residuals))


def calculate_variance(X):
    """
    Return the population variance (divisor n_samples) of each feature in X.

    X is treated as (n_samples, n_features) and the result has one entry per
    feature. A 1-D X is treated as a single feature and yields a scalar.
    """
    X = np.asarray(X)
    n_samples = np.shape(X)[0]
    centered = X - X.mean(axis=0)
    # Sum the squared deviations per column directly, rather than forming the
    # full (n_features x n_features) product only to read its diagonal.
    variance = (1 / n_samples) * np.sum(centered ** 2, axis=0)

    return variance


def calculate_std_dev(X):
    """ Return the square root of calculate_variance(X), one value per feature of X """
    return np.sqrt(calculate_variance(X))


def euclidean_distance(x1, x2):
    """ Calculates the l2 (straight-line) distance between two vectors """
    # One squared difference per coordinate of x1, looked up by index in both
    squared_diffs = [np.power(x1[k] - x2[k], 2) for k in range(len(x1))]
    return np.sqrt(sum(squared_diffs))



def calculate_covariance_matrix(X, Y=None):
    """
    Calculate the sample covariance matrix (divisor n_samples - 1) between
    the columns of X and the columns of Y; Y defaults to X itself.
    """
    other = X if Y is None else Y
    scale = 1 / (np.shape(X)[0] - 1)
    centered_x = X - X.mean(axis=0)
    centered_other = other - other.mean(axis=0)
    return np.array(scale * centered_x.T.dot(centered_other), dtype=float)
 

def calculate_correlation_matrix(X, Y=None):
    """
    Calculate the correlation matrix between the columns of X and the
    columns of Y; Y defaults to X itself.

    Each covariance entry (divisor n_samples) is divided by the product of
    the two columns' standard deviations from calculate_std_dev.
    """
    other = X if Y is None else Y
    scale = 1 / np.shape(X)[0]
    cross = scale * (X - X.mean(0)).T.dot(other - other.mean(0))
    # Column vectors so that sd_x . sd_other^T gives every pairwise product
    sd_x = np.expand_dims(calculate_std_dev(X), 1)
    sd_other = np.expand_dims(calculate_std_dev(other), 1)
    normalised = np.divide(cross, sd_x.dot(sd_other.T))

    return np.array(normalised, dtype=float)

