In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

In [24]:
df = pd.read_csv('PreProcessingAnomaly/df_train.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,-1.139558,-2.663727,3.371479,4.353534,1.482218,-0.579998,-0.614653
1,-1.008677,-1.475652,-0.15421,-0.111862,-0.067099,-0.579998,-0.614653
2,-0.877796,-1.428876,-0.193579,-0.111862,-0.067099,-0.579998,-0.614653
3,-0.746916,-1.397691,-0.269529,-0.111862,-0.067099,-0.579998,-0.614653
4,-0.616035,-1.392406,-0.236393,-0.111862,-0.067099,-0.579998,-0.614653


In [25]:
def split_k_folds(df, k):
    k_size = int(len(df)/k)

    k_folds = []
    for i in range(k):
        k_folds.append(df[i*k_size:(i+1)*k_size])

    # Append any remaining rows to the last fold
    k_folds[-1] = k_folds[-1].append(df[k*k_size:])

    return k_folds

k_folds = split_k_folds(df, 3)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [61]:
# Models used:
# 1. Isolation Forest
# 2. Local Outlier Factor
# 3. One Class SVM
# 4. Robust Covariance Estimation

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope

def create_models(df, k_folds, method):
    """
    Creates a parent model using the algorithm in method and k-child models using the same algorithm.
    Parent model is trained on all data except.
    Child models are trained on each k-fold of data.

    Parameters:
    df: Pandas DataFrame
        The entire dataset
    k_folds: List
        List of k-folds of the dataset
    method: sklearn model
        The model to be used

    Returns:
    parent_model: sklearn model
        The parent model
    child_models: List
        List of k child models
    """
    parent_model = method
    parent_model.fit(df)

    child_models = []
    for i in range(len(k_folds)):
        child_models.append(method)
        child_models[i].fit(k_folds[i])

    return parent_model, child_models

In [105]:
class Models:
    def __init__(self, df, k_folds, methods):
        self.df = df
        self.k_folds = k_folds
        self.methods = methods

    def instantiate_models(self):
        """ 
        Instantiates all models and child models
        """
        models = {}
        for method in self.methods:
            # Create string name of model
            model_name = str(method).split('(')[0]
            parent_model, child_models = create_models(self.df, self.k_folds, method)
            models[model_name] = {
                'parent_model': parent_model,
                'child_models': child_models,
                'weights': 1
            }
        self.models = models


Determinant has increased; this should not happen: log(det) > log(previous_det) (-185.375622163604703 > -185.840273408883661). You may want to try with a higher value of support_fraction (current value: 0.898).


Determinant has increased; this should not happen: log(det) > log(previous_det) (-185.375622163604703 > -187.833162049463084). You may want to try with a higher value of support_fraction (current value: 0.898).


Determinant has increased; this should not happen: log(det) > log(previous_det) (-185.375622163604703 > -185.793870461872359). You may want to try with a higher value of support_fraction (current value: 0.898).


Determinant has increased; this should not happen: log(det) > log(previous_det) (-185.375622163604703 > -189.760122787115023). You may want to try with a higher value of support_fraction (current value: 0.898).



The CICS is the weighted average of the Internal Consensus Score (ICS) as expressed:

$F_x^c = \frac{1}{M}\sum_{i=1}^M I_x^i * w_i$

where $M$ is the number of models, $I_x^i$ is the ICS of model $i$ for the $x$-th data point, and $w_i$ is the weight of model $i$.

The ICS expressed is inspired by Bagging approach in machine learning. The training data is split randomly into k-folds. A k-child models are created for each model in the ensemble. The k-child models are trained each with one separate fold out of the k-fold training data. The votes a data point receives from the k-child models are termed as the Internal Consensus Vote (ICV). A data point is considered an inlier by a model if it has 1 or more ICV.

$I_x = \frac{1}{k}\sum_{i=1}^k v$

where $v$ is the number of votes x recieved as an inlier from the k-child models.

In [166]:
# Compute internal consensus score
def compute_icv_ics(child_models, data_point):
    """
    Computes the internal consensus vote (ICV) and internal consensus score (ICS) for each data point in the training data.

    Parameters:
    child_models: List
        List of child models
    data_point: Pandas DataFrame row
        A single row of the training data

    Returns:
    icv: int

    """
    votes = []
    for child_model in child_models:
        votes.append(child_model.predict([data_point]))

    # Compute internal consensus score
    ics = sum(votes) / len(votes)

    if ics >= 0:
        icv = 1
    else:
        icv = 0

    return icv, ics


def compute_cics(models, data_point):
    """
    Computes the consensus internal consensus score (CICS) for each data point in the training data.

    Parameters:
    models: List
        List of models
    data_point: Pandas DataFrame row
        A single row of the training data

    Returns:
    ICV: List
        List of internal consensus votes
    CICS: float
        The consensus internal consensus score
    """
    # Compute internal consensus score for each child model
    combined_score = []
    ICV = []
    total_weights = 0
    for model in models:
        icv, ics = compute_icv_ics(models[model]['child_models'], data_point)
        ICV.append(icv)
        combined_score.append(ics * models[model]['weights'])
        total_weights += models[model]['weights']

    CICS = sum(combined_score) / len(models)

    return ICV, CICS


def compute_cecs(models, data_point):
    """
    Computes the consensus external consensus score (CECS) for each data point in the training data.

    Parameters:
    models: List
        List of models
    data_point: Pandas DataFrame row
        A single row of the training data

    Returns:
    CECV: int
        The consensus external consensus vote
    CECS: float
        The consensus external consensus score
    """
    # Compute external consensus score for each model
    combined_score = []
    for model in models:
        # Predict data point using parent model
        ecs = models[model]['parent_model'].predict([data_point])

        combined_score.append(ecs)

    CECS = sum(combined_score) / len(models)

    if CECS >= 0:
        CECV = 1
    else:
        CECV = 0

    return CECV, CECS


def calculate_weights(CECV_all, ICV_all, models):
    """
    Calculates the weights for each model.

    Parameters:
    CECV_all: List
        List of consensus external consensus votes
    ICV_all: List of Lists
        List of internal consensus votes
    models: Models object
        Models object
    
    Returns:
    models: Models object
        Updated models object
    """
    model_names = list(models.models.keys())

    errors = np.zeros(len(model_names))

    for xi in range(len(CECV_all)):
        Vi = CECV_all[xi]
        ICV = ICV_all[xi]
        for vi in ICV:
            if vi != Vi:
                errors[ICV.index(vi)] += 1
    print('\n --------------------- \nUpdating weights\n ---------------------')

    for model in model_names:
        # w_f = w_i * (e / n) * w_i
        weight_i = models.models[model]['weights']
        error_i = errors[model_names.index(model)]
        n = len(CECV_all)
        weight_f = weight_i - (error_i / n) * weight_i
        
        print('Model {} performance: {}/{}. Weight: {} -> {}'.format(model, n - error_i, n, weight_i, weight_f))

        models.models[model]['weights'] = weight_f


    return models
    

def train_ensemble(models, df_train):
    """
    Iterates through the training data and compute the CICS, CECS and ICV for each data point. 
    The weights for each model are then updated.
    """
    print('Training ensemble...')
    ICV_all = []
    CECV_all = []

    for i in range(len(df_train)):
    # for i in range(20):
        print('Training data point: {}/{}'.format(i+1, len(df_train)), end='\r')

        data_point = df_train.iloc[i]
        ICV, CICS = compute_cics(models.models, data_point)
        ICV_all.append(ICV)
        CECV, CECS = compute_cecs(models.models, data_point)
        CECV_all.append(CECV)

    # Update weights
    models = calculate_weights(CECV_all, ICV_all, models)
    print('Training complete. \n ------------------')
    return models


def perform_CNDE(models, df_train):
    """
    Performs the CNDE algorithm on the training data.

    Parameters:
    models: List
        List of models
    df_train: Pandas DataFrame
        Training data

    Returns:
    models: List
        List of models
    """
    # Train ensemble
    models = train_ensemble(models, df_train)

    normality_scores = []
    all_CICS = []
    all_CECS = []
    # Calculate normality score
    for i in range(len(df_train)):
    # for i in range(20):
        print('Calculating normality score: {}/{}'.format(i+1, len(df_train)), end='\r')
        data_point = df_train.iloc[i]
        ICV, CICS = compute_cics(models.models, data_point)
        CECV, CECS = compute_cecs(models.models, data_point)

        all_CICS.append(CICS)
        all_CECS.append(CECS)

        # Calculate normality score
        N = (CICS + CECS) / 2
        normality_scores.append(N)

    models.normality_scores = normality_scores
    models.all_CICS = all_CICS
    models.all_CECS = all_CECS
    models.weights = [models.models[model]['weights'] for model in models.models]
    return models

In [194]:
methods = [
    IsolationForest(contamination=0.1),
    LocalOutlierFactor(novelty=True, contamination=0.1),
    OneClassSVM(nu=0.1),
    EllipticEnvelope(support_fraction=0.95, contamination=0.1)
]
models = Models(df, k_folds, methods)
models.instantiate_models()

models = perform_CNDE(models, df)

Training ensemble...
Training data point: 1477/1477
 --------------------- 
Updating weights
 ---------------------
Model IsolationForest performance: 1193.0/1477. Weight: 1 -> 0.8077183480027081
Model LocalOutlierFactor performance: 1346.0/1477. Weight: 1 -> 0.9113067027758971
Model OneClassSVM performance: 1419.0/1477. Weight: 1 -> 0.960731211916046
Model EllipticEnvelope performance: 1443.0/1477. Weight: 1 -> 0.976980365605958
Training complete. 
 ------------------
Calculating normality score: 1477/1477

In [195]:
# List of arrays to single array
def flatten_list(l):
    """
    Flattens a list of arrays into a single array.

    Parameters:
    l: List
        List of arrays

    Returns:
    flat_list: List
        Single array
    """
    flat_list = []
    for sublist in l:
        for item in sublist:
            flat_list.append(item)
    return flat_list

normality_scores = flatten_list(models.normality_scores)
# Scale normality scores to range 0-1
normality_scores = (normality_scores - min(normality_scores)) / (max(normality_scores) - min(normality_scores))
print('Normality scores: {}'.format(normality_scores))

Normality scores: [0. 1. 1. ... 1. 1. 0.]


In [197]:
df_norm = pd.DataFrame(normality_scores, columns=['Normality score'])

In [200]:
import plotly.graph_objects as go

# Plot normality scores
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_norm.index,
    y=df_norm['Normality score'],
    mode='markers',
    name='Normality Scores',
    marker=dict(
        size=3,
    )
))

mean = np.mean(df_norm['Normality score'])
std = np.std(df_norm['Normality score'])
# Plot standard deviation lines
fig.add_trace(go.Scatter(
    x=[0, len(df_norm)],
    y=[mean - std, mean - std],
    mode='lines',
    name='Mean - std',
    line=dict(
        color='orange',
        width=1,
        dash='dash'
    )
))

fig.add_trace(go.Scatter(
    x=[0, len(df_norm)],
    y=[mean - 2* std, mean -2* std],
    mode='lines',
    name='Mean + std',
    line=dict(
        color='red',
        width=1,
        dash='dash'
    )
))

# Update layout
fig.update_layout(
    title='Normality scores',
    xaxis_title='Data point',
    yaxis_title='Normality score',
    font=dict(
        color="#7f7f7f"
    )
)

fig.show()