In [6]:
from CNDE import Models
from CNDE import perform_CNDE, test_ensemble

# Silence every warning category (not only UserWarning) to keep the cell output readable
import warnings
warnings.filterwarnings('ignore')

train_path = 'df_train_other.csv'
test_path = 'df_test_other.csv'
contamination = 0.05

# Printed between consecutive runs so their logs are easy to tell apart.
_RUN_SEPARATOR = '____________________________________\n'


def _train_and_test(label, k):
    """Create one ensemble, run it through the CNDE pipeline and return it.

    The same recipe used to be copy-pasted once per model; only the label
    and ``k`` differed between the copies.

    Parameters
    ----------
    label : str
        Name shown in the progress banner (e.g. ``'Model1'``).
    k : int
        Forwarded to ``Models(..., k=k)``.
        NOTE(review): its exact meaning is defined in the CNDE module.

    Returns
    -------
    The object returned by ``test_ensemble`` for this ensemble.
    """
    print(f'{label} begins: (k={k}, contamination={contamination})')
    ensemble = Models(train_path, k=k, contamination=contamination)
    ensemble.instantiate_models()
    ensemble = perform_CNDE(ensemble)
    return test_ensemble(ensemble, test_path)


# Later cells refer to Model1..Model4 by name, so the four bindings are kept.
Model1 = _train_and_test('Model1', k=3)
print(_RUN_SEPARATOR)
Model2 = _train_and_test('Model2', k=3)
print(_RUN_SEPARATOR)
Model3 = _train_and_test('Model3', k=5)
print(_RUN_SEPARATOR)
Model4 = _train_and_test('Model4', k=5)


Model1 begins: (k=3, contamination=0.05)
Training ensemble...
Training data point: 1440/1440
 --------------------- 
Updating weights
 ---------------------
Model IsolationForest performance: 1373.0/1440. Weight: 1 -> 0.9534722222222223
Model LocalOutlierFactor performance: 1395.0/1440. Weight: 1 -> 0.96875
Model OneClassSVM performance: 1407.0/1440. Weight: 1 -> 0.9770833333333333
Model EllipticEnvelope performance: 1428.0/1440. Weight: 1 -> 0.9916666666666667
Training complete. 
------------------
------------------ 

Model IsolationForest ECS: [0.71105555 0.81758435 0.89847582 ... 0.55392409 0.88794223 0.79268454]
Model IsolationForest CICS: [0.71105555 0.81758435 0.89847582 ... 0.55392409 0.88794223 0.79268454]
Model LocalOutlierFactor ECS: [0.99275841 0.9946252  0.99539594 ... 0.97122767 0.99449988 0.99016022]
Model LocalOutlierFactor CICS: [0.99275841 0.9946252  0.99539594 ... 0.97122767 0.99449988 0.99016022]
Model OneClassSVM ECS: [0.83092198 0.84052955 0.84519783 ... 0.8601192

In [26]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd

def compile_and_save(models, description, type):
    """Average the models' normality scores, attach them to the unscaled data and save.

    The per-model ``normality_scores`` are averaged element-wise and min-max
    scaled to [0, 1]. The result is stored in a ``normality_score`` column of
    the unscaled train and test frames concatenated in that order, indexed by
    the parsed ``Start time`` column, and written to
    ``Results/df_{type}_{description}.csv``.

    Parameters
    ----------
    models : sequence
        Objects exposing a ``normality_scores`` attribute. Every attribute
        must have one entry per row of the concatenated train + test data.
    description : str
        Suffix used in the output file name.
    type : str
        Dataset tag used in the input and output file names. The name shadows
        the builtin inside this function; it is kept so existing callers
        (positional or keyword) continue to work.

    Returns
    -------
    tuple
        ``(df_unscaled_both, train_scores)`` where ``train_scores`` holds the
        normalized scores of the training rows only.
    """
    import os  # local import: the notebook has no shared import cell for it

    scores_list = [np.array(model.normality_scores) for model in models]

    # Element-wise mean across all models.
    scores = np.zeros(len(scores_list[0]))
    for model_scores in scores_list:
        scores += model_scores
    scores = scores / len(scores_list)

    # Min-max normalize. A zero spread (all scores equal) would divide by zero
    # and fill the column with NaN, so that case maps to 0 instead.
    lowest = np.min(scores)
    spread = np.max(scores) - lowest
    if spread == 0:
        scores = np.zeros_like(scores)
    else:
        scores = (scores - lowest) / spread

    df_unscaled_train = pd.read_csv(f'Unscaled/df_train_{type}.csv')
    df_unscaled_test = pd.read_csv(f'Unscaled/df_test_{type}.csv')
    train_scores = scores[:len(df_unscaled_train)]

    # Train rows first, then test rows, matching the order of `scores`.
    df_unscaled_both = pd.concat([df_unscaled_train, df_unscaled_test],
                                 axis=0, ignore_index=True)
    df_unscaled_both['normality_score'] = scores

    df_unscaled_both['Start time'] = pd.to_datetime(df_unscaled_both['Start time'])
    df_unscaled_both = df_unscaled_both.set_index('Start time')

    # The timestamps now live in the index, so the index must be written out;
    # saving with index=False dropped the 'Start time' column from the file.
    os.makedirs('Results', exist_ok=True)
    df_unscaled_both.to_csv(f'Results/df_{type}_{description}.csv')

    return df_unscaled_both, train_scores


In [30]:
df_unscaled_both, train_scores = compile_and_save([Model1, Model2, Model3, Model4], 'c005', 'other')

import plotly.express as px

scores = df_unscaled_both['normality_score']

# Reference statistics are taken from the training rows only.
mean = np.mean(train_scores)
std = np.std(train_scores)

# Normality score of every row against its timestamp; hovering a point shows
# all columns of df_unscaled_both.
fig = px.scatter(df_unscaled_both,
                 x=df_unscaled_both.index,
                 y=df_unscaled_both['normality_score'],
                 hover_data=df_unscaled_both.columns)

# Shrink the scatter markers before the reference lines are added.
fig.update_traces(marker=dict(size=3))

fig.update_layout(title='Normality Scores', xaxis_title='Time', yaxis_title='Normality Score')

# Mean, -1 std, -2 std and -3 std reference lines.
# They must share the scatter's datetime x values: integer positions on a date
# axis are read as milliseconds since 1970, which places the lines far outside
# the visible range set below.
time_axis = df_unscaled_both.index
for n_sigma, line_name in ((0, 'Mean'), (1, '-Sigma'), (2, '-2 Sigma'), (3, '-3 Sigma')):
    level = mean - n_sigma * std
    fig.add_trace(go.Scatter(x=time_axis, y=[level] * len(time_axis),
                             mode='lines', name=line_name))

# Pad the visible time range by one day on each side.
lower_x = df_unscaled_both.index[0] - pd.Timedelta(days=1)
upper_x = df_unscaled_both.index[-1] + pd.Timedelta(days=1)
fig.update_xaxes(range=[lower_x, upper_x])


fig.show()

In [75]:
from plotly.subplots import make_subplots

# Unscaled training data; later code in this notebook uses its row count to
# split score arrays into a training part and a test part.
df_unscaled_train = pd.read_csv('Unscaled/df_train_other.csv')

def plot_scores(Model1, score_type='ecs', n_train=None):
    """Plot per-model train/test histograms of one score type.

    One subplot is drawn per entry of ``Model1.models``, titled with the model
    name and its weight (two decimals). Training scores are blue and test
    scores are red. Values below ``mean - 4 * std`` of the training scores are
    raised to that floor for display only.

    Parameters
    ----------
    Model1 : object
        Ensemble whose ``models`` mapping holds, per model name, the keys
        ``'ecs'``, ``'cics'`` and ``'weights'``.
    score_type : {'ecs', 'cics'}
        Which stored score array to plot.
    n_train : int, optional
        Number of leading entries that belong to the training data. Defaults
        to ``len(df_unscaled_train)`` (the notebook-level frame).

    Raises
    ------
    ValueError
        If ``score_type`` is not ``'ecs'`` or ``'cics'``.
    """
    if score_type not in ('ecs', 'cics'):
        # Without this check an unknown value hit a NameError on the first
        # model, or silently reused the previous iteration's arrays.
        raise ValueError(f"score_type must be 'ecs' or 'cics', got {score_type!r}")
    if n_train is None:
        n_train = len(df_unscaled_train)

    model_names = list(Model1.models)

    # Only show model weights to 2 decimal places
    plot_names = [f"{name} (w={Model1.models[name]['weights']:.2f})" for name in model_names]

    # Two columns; enough rows to hold every model (2x2 for four models).
    n_cols = 2
    n_rows = max(1, -(-len(model_names) // n_cols))
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=plot_names)

    for i, name in enumerate(model_names):
        model_scores = np.asarray(Model1.models[name][score_type])
        train_scores = model_scores[:n_train]
        test_scores = model_scores[n_train:]

        # mean and std of the training part define the display floor
        mean = np.mean(train_scores)
        std = np.std(train_scores)
        floor = mean - 4 * std

        # np.maximum returns new arrays. The slices above are views of the
        # model's stored scores, so clipping them in place would overwrite
        # the model's own data on every plot call.
        train_scores = np.maximum(train_scores, floor)
        test_scores = np.maximum(test_scores, floor)

        row, col = i // n_cols + 1, i % n_cols + 1
        fig.add_trace(go.Histogram(x=train_scores, nbinsx=100, marker=dict(color='blue'),
                                   showlegend=False, name='Train Scores'), row=row, col=col)
        fig.add_trace(go.Histogram(x=test_scores, nbinsx=100, marker=dict(color='red'),
                                   showlegend=False, name='Test Scores'), row=row, col=col)

    # The ECS and CICS figures are otherwise indistinguishable.
    fig.update_layout(title_text=f'{score_type.upper()} score distributions (train: blue, test: red)')
    fig.show()


In [79]:
# Draw the per-model histograms for Model1, first for ECS and then for CICS.
for score_kind in ('ecs', 'cics'):
    plot_scores(Model1, score_type=score_kind)

In [80]:
# Histograms of Model1's normality scores: training rows in blue, test rows in red
n_train_rows = len(df_unscaled_train)
train_scores = Model1.normality_scores[:n_train_rows]
test_scores = Model1.normality_scores[n_train_rows:]

fig = go.Figure()
for trace_name, colour, values in (('Train Scores', 'blue', train_scores),
                                   ('Test Scores', 'red', test_scores)):
    fig.add_trace(go.Histogram(x=values, nbinsx=100, marker={'color': colour},
                               showlegend=False, name=trace_name))
fig.update_layout(title='Distribution of Normality Scores',
                  xaxis_title='Normality Score',
                  yaxis_title='Count')
fig.show()

In [50]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope

from plotly.subplots import make_subplots

train = pd.read_csv('df_train_other.csv')
test = pd.read_csv('df_test_other.csv')
contamination = 0.05
# Fixed seed so the stochastic estimators give the same figure on every run.
random_state = 42

# The four stand-alone detectors, each fitted on the training data below.
clfs = [
    LocalOutlierFactor(n_neighbors=20, contamination=contamination, novelty=True),
    IsolationForest(contamination=contamination, random_state=random_state),
    OneClassSVM(nu=contamination),
    EllipticEnvelope(support_fraction=0.95, contamination=contamination,
                     random_state=random_state),
]

# Titles come from the estimators themselves so they cannot drift out of sync
# with the list above.
fig = make_subplots(rows=2, cols=2, subplot_titles=[type(clf).__name__ for clf in clfs])

for i, clf in enumerate(clfs):
    # Fit the model
    clf.fit(train)

    # Decision-function scores for the train and test data
    train_scores = clf.decision_function(train)
    test_scores = clf.decision_function(test)

    # mean and std of the training scores define the display floor
    mean = np.mean(train_scores)
    std = np.std(train_scores)
    floor = mean - 4 * std

    # Any values below -4 std are set to -4 std
    train_scores = np.maximum(train_scores, floor)
    test_scores = np.maximum(test_scores, floor)

    row, col = i // 2 + 1, i % 2 + 1
    fig.add_trace(go.Histogram(x=train_scores, nbinsx=100, marker=dict(color='blue'), showlegend=False,
                               name='Train Scores'), row=row, col=col)
    fig.add_trace(go.Histogram(x=test_scores, nbinsx=100, marker=dict(color='red'), showlegend=False,
                               name='Test Scores'), row=row, col=col)
fig.show()

# Save figure
fig.write_image('Results/demo.png')




n_neighbors (20) is greater than the total number of samples (12). n_neighbors will be set to (n_samples - 1) for estimation.


n_neighbors (20) is greater than the total number of samples (12). n_neighbors will be set to (n_samples - 1) for estimation.


n_neighbors (20) is greater than the total number of samples (13). n_neighbors will be set to (n_samples - 1) for estimation.



Training ensemble...
Training data point: 37/37
 --------------------- 
Updating weights
 ---------------------
Model IsolationForest performance: 34.0/37. Weight: 1 -> 0.9189189189189189
Model LocalOutlierFactor performance: 36.0/37. Weight: 1 -> 0.972972972972973
Model OneClassSVM performance: 14.0/37. Weight: 1 -> 0.3783783783783784
Model EllipticEnvelope performance: 34.0/37. Weight: 1 -> 0.9189189189189189
Training complete. 
 ------------------
Calculating normality score: 37/37


n_neighbors (20) is greater than the total number of samples (7). n_neighbors will be set to (n_samples - 1) for estimation.


n_neighbors (20) is greater than the total number of samples (7). n_neighbors will be set to (n_samples - 1) for estimation.


n_neighbors (20) is greater than the total number of samples (7). n_neighbors will be set to (n_samples - 1) for estimation.


n_neighbors (20) is greater than the total number of samples (7). n_neighbors will be set to (n_samples - 1) for estimation.


n_neighbors (20) is greater than the total number of samples (9). n_neighbors will be set to (n_samples - 1) for estimation.



Training ensemble...
Training data point: 37/37
 --------------------- 
Updating weights
 ---------------------
Model IsolationForest performance: 24.0/37. Weight: 1 -> 0.6486486486486487
Model LocalOutlierFactor performance: 31.0/37. Weight: 1 -> 0.8378378378378378
Model OneClassSVM performance: 21.0/37. Weight: 1 -> 0.5675675675675675
Model EllipticEnvelope performance: 35.0/37. Weight: 1 -> 0.9459459459459459
Training complete. 
 ------------------
Calculating normality score: 37/37

In [None]:
# Keep the first component of every entry in each sleep ensemble's scores.
scores1 = np.array([entry[0] for entry in sleep1.normality_scores])
scores2 = np.array([entry[0] for entry in sleep2.normality_scores])
scores3 = np.array([entry[0] for entry in sleep3.normality_scores])

# Element-wise average of the three ensembles (no min-max normalization is applied here).
scores = (scores1 + scores2 + scores3) / 3

# Mean and std of the averaged scores
mean = np.mean(scores)
std = np.std(scores)

# Scatter of the averaged scores with mean and -1/-2/-3 sigma reference lines
sample_axis = list(range(len(scores)))
fig = go.Figure()
fig.add_trace(go.Scatter(x=sample_axis, y=scores, mode='markers', name='Normality Scores'))
for trace_name, level in (('Mean', mean),
                          ('-Sigma', mean - std),
                          ('-2 Sigma', mean - 2 * std),
                          ('-3 Sigma', mean - 3 * std)):
    fig.add_trace(go.Scatter(x=sample_axis, y=[level] * len(scores),
                             mode='lines', name=trace_name))

fig.update_layout(title='Normality Scores', xaxis_title='Time', yaxis_title='Normality Score')

fig.show()

