In [72]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from CNDE import Models
from CNDE import perform_CNDE, test_ensemble
import pandas as pd
import numpy as np
import pickle

# Turn off user warnings
import warnings
warnings.filterwarnings('ignore')

# Paths for the frequency dataset.
# NOTE(review): both names are rebound to the sleep files inside the loop
# before any active code reads them, so these two assignments currently have
# no effect. If the commented-out Model_frequency lines are re-enabled, the
# second pass of the loop would already see the sleep paths here.
train_path = 'df_train_frequency.csv'
test_path = 'df_test_frequency.csv'
contamination = 0.05
# Two repetitions (i = 1, 2); each one fits a fresh ensemble and pickles it.
for i in range(1, 3):
    # Disabled: the equivalent fit for the frequency dataset.
    # print(f'Model_frequency begins: (k=3, contamination={contamination})')
    # Model_frequency = Models(train_path, k=3, contamination=contamination)
    # Model_frequency.instantiate_models()
    # Model_frequency = perform_CNDE(Model_frequency)
    # Model_frequency = test_ensemble(Model_frequency, test_path)

    # Switch to the sleep dataset (same values on every pass).
    train_path = 'df_train_sleep.csv'
    test_path = 'df_test_sleep.csv'
    contamination = 0.05

    # Build the ensemble on the training file, run perform_CNDE on it, then
    # pass it through test_ensemble together with the test file.
    print(f'Model_sleep begins: (k=3, contamination={contamination})')
    Model_sleep = Models(train_path, k=3, contamination=contamination)
    Model_sleep.instantiate_models()
    Model_sleep = perform_CNDE(Model_sleep)
    Model_sleep = test_ensemble(Model_sleep, test_path)

    # Save the models
    # with open(f'Model_frequency_{i}.pkl', 'wb') as f:
    #     pickle.dump(Model_frequency, f)

    # One pickle per repetition, written to the current working directory.
    with open(f'Model_sleep_{i}.pkl', 'wb') as f:
        pickle.dump(Model_sleep, f)


Model_sleep begins: (k=3, contamination=0.05)
Training ensemble...
Training data point: 35/35
 --------------------- 
Updating weights
 ---------------------
Model IsolationForest performance: 33.0/35. Weight: 1 -> 0.9428571428571428
Model LocalOutlierFactor performance: 34.0/35. Weight: 1 -> 0.9714285714285714
Model OneClassSVM performance: 22.0/35. Weight: 1 -> 0.6285714285714286
Model EllipticEnvelope performance: 35.0/35. Weight: 1 -> 1.0
Training complete. 
------------------
------------------ 

Model IsolationForest ECS: [ 1.11739486  1.33739024  0.27519457 -2.15762287 -0.82732228  0.08725251
 -0.2528654   1.34051498  0.46149468  0.89056272 -0.70874236 -1.30667663
 -0.63901716 -0.64615913  0.74919218  0.7045495   0.25342738 -1.23640748
 -0.8389187   0.86844467  0.12742604  0.69239047 -0.75913612 -0.8332485
  0.7038561  -1.80645045  0.62982102 -0.38250629  0.91196611  1.11602439
  0.76970049  1.17363435  0.73531035  1.43556189  1.37366643  1.19344918
  0.91494541 -0.94833092 -1.3

58

In [103]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from CNDE import Models
import pickle
from CNDE import perform_CNDE, test_ensemble
import pandas as pd
import numpy as np

def plot_scores(Model1, Model2, score_type='ecs', normalise=True, name=None):
    """Plot per-model train/test score histograms pooled over two ensemble runs.

    For every entry of ``Model1.models`` the ``'ecs'`` arrays of both runs are
    split into a training part (the first ``n`` values, where ``n`` is the row
    count of ``Unscaled/df_train_<dataset>.csv``) and a test part (the rest).
    The two runs are concatenated and drawn as a blue (train) and a red (test)
    histogram in one panel of a 2x2 grid. Each panel title shows the model key
    and the mean of the two runs' ``'weights'`` values rounded to 2 decimals.

    The figure is shown and also written to ``Results/model_dists_<name>.png``.

    Parameters
    ----------
    Model1, Model2 : objects exposing ``models``, a mapping from model key to
        a mapping that contains ``'ecs'`` and ``'weights'``. ``Model2`` must
        contain every key present in ``Model1``.
    score_type : str
        Only ``'ecs'`` is supported.
    normalise : bool
        If true, the scores of each model are standardised with the mean and
        standard deviation of that model's scores pooled over both runs.
    name : str
        Label of the form ``'<prefix>_<dataset>'`` (e.g.
        ``'normalised_frequency'``). The second ``_``-separated token selects
        the training CSV; the full string is used in the output file name.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``score_type`` is not ``'ecs'`` or ``name`` has no dataset token.
    """
    # Fail early: the original fell through to undefined score variables for
    # any other score type.
    if score_type != 'ecs':
        raise ValueError(f"Unsupported score_type {score_type!r}; only 'ecs' is available")

    name_parts = name.split('_') if isinstance(name, str) else []
    if len(name_parts) < 2:
        raise ValueError("name must look like '<prefix>_<dataset>', e.g. 'normalised_frequency'")
    dataset = name_parts[1]

    # Only the number of training rows is needed to split train from test.
    n_train = len(pd.read_csv(f'Unscaled/df_train_{dataset}.csv'))

    model_names = list(Model1.models)
    weights = [
        (Model1.models[model]['weights'] + Model2.models[model]['weights'])/2
        for model in model_names
    ]

    # Only record model weights to 2 decimal places
    plot_names = [
        f'{model} (weighting: {round(weight, 2)})'
        for model, weight in zip(model_names, weights)
    ]

    # The grid is fixed at 2x2, so at most four models can be drawn.
    fig = make_subplots(rows=2, cols=2, subplot_titles=plot_names, vertical_spacing=0.2, horizontal_spacing=0.1)

    for i, model in enumerate(model_names):
        scores1 = np.asarray(Model1.models[model]['ecs'])
        scores2 = np.asarray(Model2.models[model]['ecs'])

        if normalise:
            # Standardise with statistics pooled over both runs (train + test).
            pooled = np.concatenate((scores1, scores2))
            centre = np.mean(pooled)
            spread = np.std(pooled)
            scores1 = (scores1 - centre)/spread
            scores2 = (scores2 - centre)/spread

        train_scores = np.concatenate((scores1[:n_train], scores2[:n_train]))
        test_scores = np.concatenate((scores1[n_train:], scores2[n_train:]))

        # One legend entry per colour is enough, so only the first panel adds them.
        legend = (i == 0)
        row = i//2 + 1
        col = i%2 + 1

        fig.add_trace(go.Histogram(x=train_scores, nbinsx=25, marker=dict(color='blue'), showlegend=legend,
                                   name='Train Scores'), row=row, col=col)

        fig.add_trace(go.Histogram(x=test_scores, nbinsx=25, marker=dict(color='red'), showlegend=legend,
                                   name='Test Scores'), row=row, col=col)

        # axis labels
        fig.update_xaxes(title_text='Normality Score', row=row, col=col)
        fig.update_yaxes(title_text='Frequency', row=row, col=col)

    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.07,
        xanchor="right",
        x=1
    ))
    fig.update_layout(margin=dict(t=20, r=20))

    # Share the x axis across all panels (the y axes stay independent).
    fig.update_xaxes(matches='x')

    fig.show()
    fig.write_image(f'Results/model_dists_{name}.png', width=1200, height=500, scale=4)
    

# Load the two pickled runs per dataset. Each file is opened in a `with`
# block so its handle is closed as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open('Models/Model_frequency_1.pkl', 'rb') as f:
    Model_freq_1 = pickle.load(f)
with open('Models/Model_frequency_2.pkl', 'rb') as f:
    Model_freq_2 = pickle.load(f)
# plot_scores has no return statement, so norm_scores_freq is bound to None.
norm_scores_freq = plot_scores(Model_freq_1, Model_freq_2, score_type='ecs', normalise=True, name='normalised_frequency')

with open('Models/Model_sleep_1.pkl', 'rb') as f:
    Model_sleep_1 = pickle.load(f)
with open('Models/Model_sleep_2.pkl', 'rb') as f:
    Model_sleep_2 = pickle.load(f)
# As above: norm_scores_sleep is bound to None.
norm_scores_sleep = plot_scores(Model_sleep_1, Model_sleep_2, score_type='ecs', normalise=True, name='normalised_sleep')

In [102]:
def plot_dists(Model1, Model2, type):
    """Plot the pooled, standardised normality-score histograms of two runs.

    The first ``n`` entries of each model's ``normality_scores`` are treated
    as training scores and the remainder as test scores, where ``n`` is the
    row count of ``Unscaled/df_train_<type>.csv``. Scores from both runs are
    concatenated, standardised with the mean and standard deviation of all
    pooled scores, and drawn as a blue (train) and a red (test) histogram.

    The figure is shown and also written to ``Results/model_dists_<type>.png``.

    Parameters
    ----------
    Model1, Model2 : objects exposing a sliceable ``normality_scores`` sequence.
    type : str
        Dataset label used in the input and output file names.

    Returns
    -------
    None
    """
    # Only the number of training rows is needed to split train from test.
    train_length = len(pd.read_csv(f'Unscaled/df_train_{type}.csv'))

    scores1 = Model1.normality_scores
    scores2 = Model2.normality_scores

    train_scores = np.concatenate((scores1[:train_length], scores2[:train_length]))
    test_scores = np.concatenate((scores1[train_length:], scores2[train_length:]))
    all_scores = np.concatenate((train_scores, test_scores))

    # Standardise scores, computing the pooled statistics once.
    centre = np.mean(all_scores)
    spread = np.std(all_scores)
    train_scores = (train_scores - centre)/spread
    test_scores = (test_scores - centre)/spread

    legend_layout = dict(
        orientation="h",
        yanchor="bottom",
        y=1.01,
        xanchor="right",
        x=1
    )

    fig = go.Figure()
    fig.add_trace(go.Histogram(x=train_scores, nbinsx=65, marker=dict(color='blue'), showlegend=True,
                               name='Train Scores'))
    fig.add_trace(go.Histogram(x=test_scores, nbinsx=65, marker=dict(color='red'), showlegend=True,
                               name='Test Scores'))
    fig.update_layout(xaxis_title='Normality Score', yaxis_title='Frequency', legend=legend_layout)
    fig.update_layout(margin=dict(t=20, r=20))
    fig.show()

    fig.write_image(f'Results/model_dists_{type}.png', width=1200, height=400, scale=4)

# Draw the pooled normality-score histogram for each dataset; plot_dists also
# saves each figure under Results/. Relies on the Model_freq_* / Model_sleep_*
# objects already being present in the kernel.
plot_dists(Model_freq_1, Model_freq_2, type='frequency')
plot_dists(Model_sleep_1, Model_sleep_2, type='sleep')

In [94]:
# Measure skew and kurtosis
import scipy.stats as stats

# Average the normality scores of the two runs for each dataset.
# (presumably NumPy arrays, so that `+` adds element-wise rather than
# concatenating — TODO confirm)
combined_norm_freq = (Model_freq_1.normality_scores + Model_freq_2.normality_scores)/2
combined_norm_sleep = (Model_sleep_1.normality_scores + Model_sleep_2.normality_scores)/2

# Sleep data: skewness and kurtosis
print('Skewness of normality scores for sleep data:', stats.skew(combined_norm_sleep))
print('Kurtosis of normality scores for sleep data:', stats.kurtosis(combined_norm_sleep))

# Frequency data: skewness and kurtosis
print('Skewness of normality scores for frequency data:', stats.skew(combined_norm_freq))
print('Kurtosis of normality scores for frequency data:', stats.kurtosis(combined_norm_freq))

Skewness of normality scores for sleep data: -3.786973846384544
Kurtosis of normality scores for sleep data: 16.240067500230662
Skewness of normality scores for frequency data: -1.8702072304945343
Kurtosis of normality scores for frequency data: 5.125378985994155


In [59]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import pickle

# Turn off user warnings
import warnings
warnings.filterwarnings('ignore')


# Unscaled training frame for the 'other' dataset, kept as a kernel global.
# NOTE(review): none of the visible cells read this global — every function
# that uses the name df_unscaled_train binds its own local of that name.
# Confirm it is still needed.
df_unscaled_train = pd.read_csv('Unscaled/df_train_other.csv')

def compile_and_save(model, type):
    """Attach a model's normality scores to the unscaled data and save it.

    Reads ``Unscaled/df_train_<type>.csv`` and ``Unscaled/df_test_<type>.csv``,
    stacks them (train rows first), adds ``model.normality_scores`` as the
    column ``'normality_score'``, parses ``'Start time'`` as datetimes and
    makes it the index. The result is written to ``Results/df_<type>.csv``.

    Parameters
    ----------
    model : object exposing ``normality_scores``, one score per row of the
        stacked train+test data, in that order.
    type : str
        Dataset label used in the input and output file names.

    Returns
    -------
    (pandas.DataFrame, sequence)
        The combined frame indexed by ``'Start time'``, and the leading slice
        of the scores that corresponds to the training rows.
    """
    scores = model.normality_scores
    # Progress/debug output kept from the original: number of scores.
    print(len(scores))

    df_unscaled_train = pd.read_csv(f'Unscaled/df_train_{type}.csv')
    df_unscaled_test = pd.read_csv(f'Unscaled/df_test_{type}.csv')
    train_scores = scores[:len(df_unscaled_train)]

    # Stack train on top of test with a fresh 0..n-1 index so the scores are
    # attached by position and never meet duplicated row labels.
    df_unscaled_both = pd.concat([df_unscaled_train, df_unscaled_test], axis=0, ignore_index=True)
    df_unscaled_both['normality_score'] = scores

    df_unscaled_both['Start time'] = pd.to_datetime(df_unscaled_both['Start time'])
    df_unscaled_both = df_unscaled_both.set_index('Start time')

    # 'Start time' now lives in the index, so the index must be written out;
    # with index=False the timestamps were dropped from the saved file.
    df_unscaled_both.to_csv(f'Results/df_{type}.csv', index=True)

    return df_unscaled_both, train_scores

# df_unscaled_both, train_scores1 = compile_and_save(Model_other, 'other')

In [101]:
import plotly.express as px
import plotly.graph_objects as go

def plot_all_scores(type, scale):
    """Scatter the run-averaged normality scores over time and mark outliers.

    Loads ``Model_<type>_0.pkl``, ``Model_<type>_1.pkl`` and
    ``Model_<type>_2.pkl`` from the working directory, passes each through
    ``compile_and_save`` and averages the three ``'normality_score'`` columns
    and the three training-score arrays. Thresholds are derived from the
    averaged training scores: their median, and
    ``mad = scale * 1.48 * median(|train - median|)``.

    Test-set scores that fall below ``median - mad`` are pushed down by a
    further ``mad`` before plotting; training scores are left untouched.
    Rows whose (adjusted) score is below ``median - 3*mad`` are circled.

    Side effects: run 1's model object gets the attributes ``df`` (the
    combined frame with ``'normality_score'`` and ``'Data'`` columns),
    ``train_scores`` (the averaged training scores) and
    ``abnormal_observations`` (rows below ``median - 3*mad``).

    Parameters
    ----------
    type : str
        Dataset label used in the pickle and CSV file names; ``'sleep'``
        selects a slightly taller outlier circle.
    scale : number
        Multiplier applied to the MAD.

    Returns
    -------
    (plotly.graph_objects.Figure, object)
        The figure and the annotated run-1 model object.
    """
    def _load_run(run_index):
        # Close the file as soon as the object is read.
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(f'Model_{type}_{run_index}.pkl', 'rb') as handle:
            return pickle.load(handle)

    Model0 = _load_run(0)
    df_unscaled_both0, train_scores0 = compile_and_save(Model0, type)
    Model1 = _load_run(1)
    df_unscaled_both, train_scores1 = compile_and_save(Model1, type)
    Model2 = _load_run(2)
    df_unscaled_both2, train_scores2 = compile_and_save(Model2, type)

    # Average scores
    df_unscaled_both['normality_score'] = (df_unscaled_both['normality_score'] + df_unscaled_both2['normality_score'] + df_unscaled_both0['normality_score'])/3
    train_scores = (train_scores1 + train_scores2 + train_scores0)/3
    # compile_and_save slices the training scores to the training row count,
    # so their length marks the train/test boundary without re-reading the CSV.
    n_train = len(train_scores)

    Model1.df = df_unscaled_both
    Model1.train_scores = train_scores

    plotly_colors = px.colors.qualitative.Plotly
    blue = plotly_colors[0]
    red = plotly_colors[1]

    # Label rows by position. The frame is indexed by timestamps, so an
    # integer slice through .loc is not valid on pandas >= 2.0.
    is_train = np.arange(len(df_unscaled_both)) < n_train
    df_unscaled_both['Data'] = np.where(is_train, 'Train', 'Test')

    median = np.median(train_scores)
    mad = scale * 1.48 * np.median(np.abs(train_scores - median))

    # Work on a plain array so positions, not timestamp labels, are used.
    scores = df_unscaled_both['normality_score'].to_numpy(dtype=float, copy=True)
    test_part = scores[n_train:]  # view into `scores`
    # Test scores already below median - MAD are lowered by one more MAD.
    test_part[test_part < median - mad] -= mad

    df_unscaled_both['normality_score'] = scores

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=df_unscaled_both.index, y=df_unscaled_both['normality_score'], mode='markers', showlegend=False,
                             marker=dict(color=df_unscaled_both['Data'].map({'Train': blue, 'Test': red}))))

    # Horizontal guides at the median and 1, 2 and 3 MADs below it.
    line_types = ['solid', 'dash', 'dashdot', 'dot']
    y_vals = [median, median-mad, median-2*mad, median-3*mad]
    for y_val, dash in zip(y_vals, line_types):
        fig.add_shape(type='line', x0=df_unscaled_both.index[0], y0=y_val, x1=df_unscaled_both.index[-1], y1=y_val,
                      line=dict(color='grey', width=1, dash=dash), opacity=0.8)

    # Circle every observation below median - 3 MAD.
    abnormal = df_unscaled_both[df_unscaled_both['normality_score'] < median-3*mad]
    Model1.abnormal_observations = abnormal

    half_width = pd.Timedelta(days=0.75)
    half_height = mad/2 if type == 'sleep' else mad/2.5
    for when, score in zip(abnormal.index, abnormal['normality_score']):
        fig.add_shape(type="circle",
            xref="x", yref="y",
            x0=when - half_width, y0=score + half_height,
            x1=when + half_width, y1=score - half_height,
            line=dict(
                color="LightSeaGreen", width=1.5
            ),
        )

    # Pad the x range by a day on either side of the data.
    lower_x = df_unscaled_both.index[0] - pd.Timedelta(days=1)
    upper_x = df_unscaled_both.index[-1] + pd.Timedelta(days=1)
    fig.update_xaxes(range=[lower_x, upper_x])

    fig.update_yaxes(tickvals=[median-3*mad, median-2*mad, median-mad, median],
                     ticktext=['$-3 MAD$', '$-2 MAD$', '$- MAD$', '$Median$'])

    # Dummy trace to add legend
    fig.add_trace(go.Scatter(x=[0], y=[0], mode='markers', marker=dict(color=blue), name='Training data'))
    fig.add_trace(go.Scatter(x=[0], y=[0], mode='markers', marker=dict(color=red), name='Testing data'))
    fig.update_layout(margin=dict(r=20, t=20))

    # Marker size
    fig.update_traces(marker=dict(size=8))

    fig.update_layout(coloraxis_showscale=False)
    # Legend horizontal orientation and top right position
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ))

    return fig, Model1

from plotly.subplots import make_subplots

# Build the sleep figure from the three pickled runs. `Model` is the run-1
# object returned by plot_all_scores, carrying the df / train_scores /
# abnormal_observations attributes set there.
fig, Model = plot_all_scores('sleep', scale=1)

# Add axis titles
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Normality score')
fig.show()

# Save figure
fig.write_image("Results/anomalous_sleep.png", width=1200, height=500, scale=4)

59
59
59


In [90]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from CNDE import Models
from CNDE import perform_CNDE, test_ensemble
import pandas as pd
import numpy as np
import pickle

# Turn off user warnings
import warnings
warnings.filterwarnings('ignore')

# Configuration shared by every repetition of the sleep-ensemble fit.
train_path, test_path = 'df_train_sleep.csv', 'df_test_sleep.csv'
contamination = 0.05

# Three repetitions (i = 0, 1, 2), each pickled to its own file in the
# current working directory.
for i in range(3):
    print(f'Model_sleep begins: (k=3, contamination={contamination})')

    # Build the ensemble on the training file, run perform_CNDE on it, then
    # pass it through test_ensemble together with the test file.
    Model_sleep = Models(train_path, k=3, contamination=contamination)
    Model_sleep.instantiate_models()
    Model_sleep = test_ensemble(perform_CNDE(Model_sleep), test_path)

    with open(f'Model_sleep_{i}.pkl', 'wb') as f:
        pickle.dump(Model_sleep, f)


Model_sleep begins: (k=3, contamination=0.05)
Training ensemble...
Training data point: 37/37
 --------------------- 
Updating weights
 ---------------------
Model IsolationForest performance: 31.0/37. Weight: 1 -> 0.8378378378378378
Model LocalOutlierFactor performance: 37.0/37. Weight: 1 -> 1.0
Model OneClassSVM performance: 26.0/37. Weight: 1 -> 0.7027027027027026
Model EllipticEnvelope performance: 37.0/37. Weight: 1 -> 1.0
Training complete. 
------------------
------------------ 

Model IsolationForest ECS: [ 7.66070390e-01  1.30309708e+00 -2.52562767e-01  7.98488814e-01
 -6.65212762e-01 -1.68247103e+00  7.81451310e-02  5.29021292e-01
 -1.47753308e+00  1.12063480e+00  1.10562736e+00  9.12291422e-01
  1.84531454e-01 -1.54155088e+00 -1.74553072e+00 -1.81230243e+00
  1.66756762e-01  1.12647930e+00  8.26844971e-01 -1.07060234e+00
 -3.29585370e-01  9.43780693e-01  6.88929605e-01  2.87913602e-01
 -1.81230243e+00 -1.79639108e+00 -1.83277904e-02 -1.73992976e+00
  7.29440672e-01  9.168915

In [93]:
# Add 'Abnormal' column to models.df
# Flag every row of Model.df whose index label appears among the abnormal
# observations recorded on the model.
Model.df['Abnormal'] = Model.df.index.isin(Model.abnormal_observations.index)

# Keep only the flagged rows and drop the helper columns. drop() returns a new
# frame, so nothing is modified in place on a filtered selection of Model.df.
Model.df_abnormal = Model.df.loc[Model.df['Abnormal']].drop(columns=['Start time short', 'Data'])

# The index is written too (to_csv default).
Model.df_abnormal.to_csv('Results/anomalous_sleep.csv')

# df_abnormal = pd.read_csv('Results/anomalous_sleep.csv', index_col=0)
# # Calculate False Positive Rate
# # False positive rate = number of false positives / number of data points
# false_positives = df_abnormal[df_abnormal['False Positive'] == True]
# false_positive_rate = len(false_positives) / len(df_abnormal)
# print('False positive rate: ', false_positive_rate)

PermissionError: [Errno 13] Permission denied: 'Results/anomalous_sleep.csv'

In [175]:
import plotly.express as px


def plot_all_scores(Model1, scale, activity='Entertain_Guests'):
    """Scatter the normality scores of one activity against MAD thresholds.

    NOTE(review): this definition reuses the name ``plot_all_scores`` with a
    different signature from the ``(type, scale)`` version defined in an
    earlier cell; whichever cell ran last wins in the kernel.

    Thresholds come from ``Model1.train_scores``: their median, and
    ``mad = scale * 1.48 * median(|train - median|)``. Before plotting,
    scores at positions past the training scores that fall below
    ``median - mad`` are lowered by a further ``mad``, and every score is then
    floored at ``median - 5*mad``. The rows of the selected activity are
    filtered *after* these adjustments so the plot reflects them.

    NOTE(review): the adjusted scores are written back to
    ``Model1.df['normality_score']``, as the original code did. Calling this
    function twice on the same model therefore applies the adjustment twice.

    Parameters
    ----------
    Model1 : object exposing ``df`` (a frame with ``'Activity'``,
        ``'normality_score'`` and ``'Data'`` columns; presumably indexed by
        timestamps, since the x range is padded with Timedeltas — confirm)
        and ``train_scores``.
    scale : number
        Multiplier applied to the MAD.
    activity : str
        Value of the ``'Activity'`` column to plot.

    Returns
    -------
    (plotly.graph_objects.Figure, object)
        The figure and ``Model1``.

    Raises
    ------
    ValueError
        If no row of ``Model1.df`` has the requested activity.
    """
    df = Model1.df
    train_scores = Model1.train_scores
    n_train = len(train_scores)

    plotly_colors = px.colors.qualitative.Plotly
    blue = plotly_colors[0]
    red = plotly_colors[1]

    median = np.median(train_scores)
    mad = scale * 1.48 * np.median(np.abs(train_scores - median))

    # Work on a plain array so positions, not index labels, are used.
    scores = df['normality_score'].to_numpy(dtype=float, copy=True)
    later_part = scores[n_train:]  # view into `scores`
    later_part[later_part < median - mad] -= mad

    # Scores below -5 mad are set to -5 mad
    floor = median - 5*mad
    scores[scores < floor] = floor

    df['normality_score'] = scores

    # Select the activity only now, so the adjusted scores are the ones drawn.
    dff = df[df['Activity'] == activity]
    if dff.empty:
        raise ValueError(f'No rows with Activity == {activity!r} to plot')

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=dff.index, y=dff['normality_score'], mode='markers', showlegend=False,
                             marker=dict(color=dff['Data'].map({'Train': blue, 'Test': red}))))

    # Horizontal guides at the median and 1, 2 and 3 MADs below it.
    line_types = ['solid', 'dash', 'dashdot', 'dot']
    y_vals = [median, median-mad, median-2*mad, median-3*mad]
    for y_val, dash in zip(y_vals, line_types):
        fig.add_shape(type='line', x0=dff.index[0], y0=y_val, x1=dff.index[-1], y1=y_val,
                      line=dict(color='grey', width=1, dash=dash), opacity=0.8)

    # Pad the x range by a day on either side of the data.
    lower_x = dff.index[0] - pd.Timedelta(days=1)
    upper_x = dff.index[-1] + pd.Timedelta(days=1)
    fig.update_xaxes(range=[lower_x, upper_x])

    fig.update_yaxes(tickvals=[median-3*mad, median-2*mad, median-mad, median],
                     ticktext=['$-3 MAD$', '$-2 MAD$', '$- MAD$', '$Median$'])

    # Marker size
    fig.update_traces(marker=dict(size=5))

    return fig, Model1

from plotly.subplots import make_subplots
# NOTE(review): this two-row subplot figure, and the axis titles set on it
# below, are discarded at the end of the cell, where `fig` is rebound to the
# figure returned by plot_all_scores. The displayed figure has neither the
# subplot titles nor these axis titles.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1, subplot_titles=('Combined activities', 'Sleep'))

# Add axis titles
fig.update_xaxes(title_text='Date', col=1, row=2)
fig.update_yaxes(title_text='Normality score', col=1, row=1)
fig.update_yaxes(title_text='Normality score', col=1, row=2)


# NOTE(review): Model_combined is not defined in any visible cell; it must
# already exist in the kernel for this call to run.
fig, Model = plot_all_scores(Model_combined, 3)
fig.show()
