In [140]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from CNDE import Models
from CNDE import perform_CNDE, test_ensemble
import pandas as pd
import numpy as np
import pickle

# Turn off user warnings
import warnings
warnings.filterwarnings('ignore')

# Train, evaluate and pickle one CNDE ensemble per dataset, repeated for two
# independent runs (i = 1, 2).
#
# Fix: the paths used to be set once before the loop and then reassigned inside
# it, so from the second run onwards `Model_other` was trained and tested on the
# *combined* CSVs left over from the previous iteration. The paths are now
# derived from the dataset name on every pass.
contamination = 0.05
dataset_names = ('other', 'sleep', 'combined')
for i in range(1, 3):
    run_models = {}
    for dataset_name in dataset_names:
        train_path = f'df_train_{dataset_name}.csv'
        test_path = f'df_test_{dataset_name}.csv'

        print(f'Model_{dataset_name} begins: (k=3, contamination={contamination})')
        model = Models(train_path, k=3, contamination=contamination)
        model.instantiate_models()
        model = perform_CNDE(model)
        model = test_ensemble(model, test_path)
        run_models[dataset_name] = model

    # Keep the notebook-level names that later cells refer to.
    Model_other = run_models['other']
    Model_sleep = run_models['sleep']
    Model_combined = run_models['combined']

    # Save the models of this run; the run number is part of the file name.
    for dataset_name, model in run_models.items():
        with open(f'Model_{dataset_name}_{i}.pkl', 'wb') as f:
            pickle.dump(model, f)

Model_other begins: (k=3, contamination=0.05)
Training ensemble...
Training data point: 1440/1440
 --------------------- 
Updating weights
 ---------------------
Model IsolationForest performance: 1379.0/1440. Weight: 1 -> 0.9576388888888889
Model LocalOutlierFactor performance: 1411.0/1440. Weight: 1 -> 0.9798611111111111
Model OneClassSVM performance: 1377.0/1440. Weight: 1 -> 0.95625
Model EllipticEnvelope performance: 1411.0/1440. Weight: 1 -> 0.9798611111111111
Training complete. 
------------------
------------------ 

Model IsolationForest ECS: [ 0.38671586  0.69760751  1.22588973 ... -1.31563031 -0.62893136
 -0.39871422]
Model IsolationForest CICS: [ 0.38671586  0.69760751  1.22588973 ... -1.31563031 -0.62893136
 -0.39871422]
Model LocalOutlierFactor ECS: [0.1421564  0.14944597 0.15436342 ... 0.05439281 0.11163821 0.12394938]
Model LocalOutlierFactor CICS: [0.1421564  0.14944597 0.15436342 ... 0.05439281 0.11163821 0.12394938]
Model OneClassSVM ECS: [ 0.1421245   0.26843415  0.

In [42]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from CNDE import Models
from CNDE import perform_CNDE, test_ensemble
import pandas as pd
import numpy as np

def plot_scores(Model1, score_type='ecs', normalise=True, name=None):
    """Plot train/test score histograms for every model of an ensemble.

    One histogram pair (train in blue, test in red) is drawn per entry of
    ``Model1.models`` on a 2x2 subplot grid. The figure is shown and written
    to ``Results/model_dists_{name}.png``.

    Parameters
    ----------
    Model1 : object
        Must expose ``models``, a mapping from model name to a dict holding
        the score arrays under the keys ``'ecs'`` and ``'cics'``.
    score_type : {'ecs', 'cics'}
        Which of the two stored score arrays to plot.
    normalise : bool
        If true, standardise the scores with the mean and standard deviation
        of the concatenated train and test scores.
    name : str
        Output label of the form ``'<prefix>_<dataset>'``. The part after the
        first underscore selects ``Unscaled/df_train_<dataset>.csv``, whose
        row count is used to split the scores into train and test.

    Raises
    ------
    ValueError
        If ``score_type`` is not ``'ecs'`` or ``'cics'``, or if ``name`` is
        missing or contains no underscore.
    """
    if score_type not in ('ecs', 'cics'):
        raise ValueError(f"score_type must be 'ecs' or 'cics', got {score_type!r}")
    if not name or '_' not in name:
        raise ValueError("name must look like '<prefix>_<dataset>', e.g. 'normalised_other'")

    dataset = name.split('_')[1]
    n_train = len(pd.read_csv(f'Unscaled/df_train_{dataset}.csv'))

    model_names = list(Model1.models)

    # The grid is fixed at 2x2, so up to four models can be shown.
    fig = make_subplots(rows=2, cols=2, subplot_titles=model_names, vertical_spacing=0.2, horizontal_spacing=0.1)

    for i, model in enumerate(model_names):
        model_scores = np.asarray(Model1.models[model][score_type], dtype=float)
        train_scores = model_scores[:n_train]
        test_scores = model_scores[n_train:]

        if normalise:
            # Standardise both splits against the pooled train + test statistics
            all_scores = np.concatenate((train_scores, test_scores))
            pooled_mean = np.mean(all_scores)
            pooled_std = np.std(all_scores)
            train_scores = (train_scores - pooled_mean) / pooled_std
            test_scores = (test_scores - pooled_mean) / pooled_std

        # Floor everything at 4 train standard deviations below the train mean.
        # np.maximum returns new arrays, so the scores stored on the model are
        # never modified (the previous in-place masking wrote through the slice
        # views whenever normalise was False).
        floor = np.mean(train_scores) - 4*np.std(train_scores)
        train_scores = np.maximum(train_scores, floor)
        test_scores = np.maximum(test_scores, floor)

        # Only the first subplot contributes legend entries
        legend = (i == 0)
        grid_row, grid_col = i//2 + 1, i % 2 + 1

        fig.add_trace(go.Histogram(x=train_scores, nbinsx=100, marker=dict(color='blue'), showlegend=legend,
                                   name='Train Scores'), row=grid_row, col=grid_col)
        fig.add_trace(go.Histogram(x=test_scores, nbinsx=100, marker=dict(color='red'), showlegend=legend,
                                   name='Test Scores'), row=grid_row, col=grid_col)

        # axis labels
        fig.update_xaxes(title_text='Normality Score', row=grid_row, col=grid_col)
        fig.update_yaxes(title_text='Frequency', row=grid_row, col=grid_col)

    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.07,
        xanchor="right",
        x=1
    ))

    fig.show()
    fig.write_image(f'Results/model_dists_{name}.png', width=1200, height=600, scale=4)

# Histogram the standardised ECS scores of the 'other' ensemble. The part of
# `name` after the underscore ('other') is what plot_scores uses to locate
# Unscaled/df_train_other.csv, and the full name ends up in the PNG file name.
plot_scores(Model_other, score_type='ecs', normalise=True, name='normalised_other')

# plot_scores(Model_sleep, score_type='ecs', normalise=True, name='normalised_sleep')
# plot_scores(Model_combined, score_type='ecs', normalise=True, name='normalised_combined')

In [46]:
# Histogram of the ensemble-level normality scores of the 'other' model,
# standardised and split into the train and test portions.
df_unscaled_train = pd.read_csv('Unscaled/df_train_other.csv')
n_train = len(df_unscaled_train)

# Scores up to the training row count are treated as train, the rest as test
train_scores = Model_other.normality_scores[:n_train]
test_scores = Model_other.normality_scores[n_train:]
all_scores = np.concatenate((train_scores, test_scores))

# z-score both splits with the pooled mean / standard deviation
pooled_mean = np.mean(all_scores)
pooled_std = np.std(all_scores)
train_scores = (train_scores - pooled_mean) / pooled_std
test_scores = (test_scores - pooled_mean) / pooled_std

# Statistics of the standardised training scores
mean = np.mean(train_scores)
std = np.std(train_scores)

# Floor both splits at 4 train standard deviations below the train mean
lower_bound = mean - 4*std
train_scores[train_scores < lower_bound] = lower_bound
test_scores[test_scores < lower_bound] = lower_bound

fig = go.Figure()
for split_scores, split_colour, split_label in (
    (train_scores, 'blue', 'Train Scores'),
    (test_scores, 'red', 'Test Scores'),
):
    fig.add_trace(go.Histogram(x=split_scores, nbinsx=100, marker=dict(color=split_colour),
                               showlegend=True, name=split_label))

# Horizontal legend just above the plot area, right-aligned
legend_style = dict(orientation="h", yanchor="bottom", y=1.01, xanchor="right", x=1)
fig.update_layout(xaxis_title='Normality Score', yaxis_title='Frequency', legend=legend_style)

fig.show()
fig.write_image('Results/model_dists_other.png', width=1200, height=400, scale=4)

In [7]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import pickle

# Turn off user warnings
import warnings
warnings.filterwarnings('ignore')


# Load the pickled 'other' ensemble. The context manager closes the file
# handle, which the previous bare open() call left dangling.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('Model_other.pkl', 'rb') as f:
    Model_other = pickle.load(f)
df_unscaled_train = pd.read_csv('Unscaled/df_train_other.csv')

def compile_and_save(model, type):
    """Attach a model's normality scores to the unscaled train+test data and save it.

    Reads ``Unscaled/df_train_{type}.csv`` and ``Unscaled/df_test_{type}.csv``,
    stacks them (train rows first), adds ``model.normality_scores`` as the
    ``normality_score`` column, indexes the frame by the parsed ``Start time``
    column and writes it to ``Results/df_{type}.csv``.

    Parameters
    ----------
    model : object
        Must expose ``normality_scores`` with one score per stacked row.
    type : str
        Dataset name used in the input and output file names.

    Returns
    -------
    tuple
        ``(df_unscaled_both, train_scores)``: the combined, time-indexed frame
        and the leading slice of the scores that covers the training rows.
    """
    scores = model.normality_scores
    print(len(scores))

    df_unscaled_train = pd.read_csv(f'Unscaled/df_train_{type}.csv')
    df_unscaled_test = pd.read_csv(f'Unscaled/df_test_{type}.csv')
    train_scores = scores[:len(df_unscaled_train)]

    # Renumber the rows while stacking so the scores are attached to a unique
    # 0..n-1 index rather than to the repeated row labels of the two files.
    df_unscaled_both = pd.concat([df_unscaled_train, df_unscaled_test], axis=0, ignore_index=True)
    df_unscaled_both['normality_score'] = scores

    df_unscaled_both['Start time'] = pd.to_datetime(df_unscaled_both['Start time'])
    df_unscaled_both = df_unscaled_both.set_index('Start time')

    # Write the index as well: 'Start time' lives in the index at this point,
    # so saving with index=False silently dropped the timestamps from the file.
    df_unscaled_both.to_csv(f'Results/df_{type}.csv')

    return df_unscaled_both, train_scores

# Build the combined train+test frame for the 'other' dataset; as a side effect
# compile_and_save also writes it to Results/df_other.csv.
df_unscaled_both, train_scores = compile_and_save(Model_other, 'other')

2573


In [96]:
# Inspect the columns of the combined frame.
# NOTE(review): the recorded output lists a 'color' column that no cell shown
# here creates - presumably left over from an earlier kernel state; confirm
# after a fresh Restart & Run All.
df_unscaled_both.columns

Index(['Activity', 'Duration', 'Number of interruptions',
       'Duration of interruptions', 'Day of activity', 'Weekend or weekday',
       'Start time short', 'normality_score', 'color'],
      dtype='object')

In [224]:
import plotly.express as px


def plot_all_scores(fig, type, scale, row):
    """Average three pickled runs of one ensemble and draw the scores on ``fig``.

    Loads ``Model_{type}.pkl``, ``Model_{type}_1.pkl`` and ``Model_{type}_2.pkl``,
    averages their normality scores, lowers test scores that fall more than
    one scaled MAD below the training median by a further MAD, floors all
    scores at ``median - 5*mad`` and plots them as a scatter in subplot
    ``row`` (column 1). Reference lines mark the training median and one to
    three scaled MADs below it; observations below ``median - 3*mad`` are
    circled.

    Parameters
    ----------
    fig : plotly figure
        Figure with a subplot grid (the ``row``/``col`` arguments are used).
    type : str
        Dataset name used in the pickle and CSV file names.
    scale : float
        Multiplier applied to ``1.48 * MAD`` of the averaged training scores.
    row : int
        Subplot row to draw into.

    Returns
    -------
    tuple
        ``(fig, Model1)`` where ``Model1`` is the first loaded model with the
        attributes ``df``, ``train_scores`` and ``abnormal_observations`` set.
    """
    run_models, run_frames, run_train_scores = [], [], []
    for suffix in ('', '_1', '_2'):
        # Close each pickle file deterministically.
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        with open(f'Model_{type}{suffix}.pkl', 'rb') as f:
            loaded = pickle.load(f)
        frame, frame_train_scores = compile_and_save(loaded, type)
        run_models.append(loaded)
        run_frames.append(frame)
        run_train_scores.append(frame_train_scores)

    Model1 = run_models[0]
    df_unscaled_both = run_frames[0]

    # Average the three runs row by row. Working on plain arrays keeps this
    # purely positional, independent of the timestamp index.
    per_run = [frame['normality_score'].to_numpy(dtype=float) for frame in run_frames]
    scores = (per_run[0] + per_run[1] + per_run[2])/3
    train_scores = (run_train_scores[0] + run_train_scores[1] + run_train_scores[2])/3
    n_train = len(train_scores)

    Model1.df = df_unscaled_both
    Model1.train_scores = train_scores

    plotly_colors = px.colors.qualitative.Plotly
    blue = plotly_colors[0]
    red = plotly_colors[1]

    # Label rows by position: the first n_train rows are the training data.
    # (An integer .loc slice is not a valid label slice on a datetime index.)
    is_test = np.arange(len(df_unscaled_both)) >= n_train
    df_unscaled_both['Data'] = np.where(is_test, 'Test', 'Train')

    median = np.median(train_scores)
    mad = scale * 1.48 * np.median(np.abs(train_scores - median))

    # Push test scores that are already more than one MAD below the median a
    # further MAD down; training scores are left untouched.
    penalised = is_test & (scores < median - mad)
    scores[penalised] -= mad

    # Scores below -5 mad are set to -5 mad
    scores = np.maximum(scores, median - 5*mad)

    df_unscaled_both['normality_score'] = scores

    fig.add_trace(go.Scatter(x=df_unscaled_both.index, y=df_unscaled_both['normality_score'], mode='markers', showlegend=False,
                             marker=dict(color=df_unscaled_both['Data'].map({'Train': blue, 'Test': red}))), col=1, row=row)

    # Reference lines at the median and 1, 2, 3 MADs below it
    line_types = ['solid', 'dash', 'dashdot', 'dot']
    y_vals = [median, median-mad, median-2*mad, median-3*mad]
    for line_type, y_val in zip(line_types, y_vals):
        fig.add_shape(type='line', x0=df_unscaled_both.index[0], y0=y_val, x1=df_unscaled_both.index[-1], y1=y_val,
                      line=dict(color='grey', width=1, dash=line_type), opacity=0.8, col=1, row=row)

    # Everything below -3 MAD counts as abnormal. A boolean mask selects each
    # such row exactly once, even when timestamps repeat in the index.
    abnormal_mask = scores < median - 3*mad
    Model1.abnormal_observations = df_unscaled_both[abnormal_mask]

    # Circle every abnormal observation
    half_width = pd.Timedelta(days=1)
    height = mad/2 if type == 'sleep' else mad/2.5
    for timestamp, score in zip(df_unscaled_both.index[abnormal_mask], scores[abnormal_mask]):
        fig.add_shape(type="circle",
            xref="x", yref="y",
            x0=timestamp - half_width, y0=score + height, x1=timestamp + half_width, y1=score - height,
            line=dict(
                color="LightSeaGreen", width=1.5
            ),
            row=row, col=1
        )

    # Pad the x range by one day on each side
    lower_x = df_unscaled_both.index[0] - pd.Timedelta(days=1)
    upper_x = df_unscaled_both.index[-1] + pd.Timedelta(days=1)
    fig.update_xaxes(range=[lower_x, upper_x])

    fig.update_yaxes(tickvals=[median-3*mad, median-2*mad, median-mad, median],
                     ticktext=['$-3 MAD$', '$-2 MAD$', '$- MAD$', '$Median$'], col=1, row=row)

    # Marker size (applies to every trace already on the figure)
    fig.update_traces(marker=dict(size=6))

    fig.update_layout(coloraxis_showscale=False)
    # Legend horizontal orientation and top right position
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ))

    return fig, Model1

from plotly.subplots import make_subplots
# Two stacked panels sharing the date axis: combined activities on top,
# sleep underneath.
panel_titles = ('Combined activities', 'Sleep')
fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.1,
    subplot_titles=panel_titles,
)

# Axis titles: the date label only on the bottom panel, the score label on both
fig.update_xaxes(title_text='Date', row=2, col=1)
for panel_row in (1, 2):
    fig.update_yaxes(title_text='Normality score', row=panel_row, col=1)

# Each call draws one dataset into its panel and returns the annotated model
fig, Model_combined = plot_all_scores(fig, 'combined', 3, 1)
fig, Model_sleep = plot_all_scores(fig, 'sleep', 0.8, 2)
fig.show()

# Export the finished figure
fig.write_image("Results/combined_sleep_2.png", width=1000, height=600, scale=4)

2632
2632
2632
59
59
59


In [216]:
# Compare the durations of two activities over time, each on its own y axis.
df = Model_combined.df
is_guests = df['Activity'] == 'Entertain_Guests'
is_other = df['Activity'] == 'Other_Activity'
dfx = df[is_guests]
dfy = df[is_other]

# One shared x axis, with a secondary y axis for 'Other Activity'
fig = make_subplots(specs=[[{"secondary_y": True}]])

other_trace = go.Scatter(x=dfy.index, y=dfy['Duration'], mode='markers', name='Other Activity')
guests_trace = go.Scatter(x=dfx.index, y=dfx['Duration'], mode='markers', name='Entertain Guests')
fig.add_trace(other_trace, secondary_y=True)
fig.add_trace(guests_trace, secondary_y=False)

fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Duration (min) for Other Activity', secondary_y=True)
fig.update_yaxes(title_text='Duration (min) for Entertain Guests', secondary_y=False)

# Dotted vertical marker where the test data starts (July 14 2011), with a
# label placed just above the top of the line
test_start = '2011-07-14'
marker_line = dict(color='grey', width=1, dash='dot')
fig.add_shape(type='line', x0=test_start, y0=0, x1=test_start, y1=950,
              line=marker_line, opacity=0.8)
fig.add_annotation(x=test_start, y=1000, text='Start of test data', showarrow=False, font=dict(size=12))

# Horizontal legend above the plot, right-aligned
legend_style = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
fig.update_layout(legend=legend_style)

fig.write_image("Results/anomaly_exp.png", width=1200, height=400, scale=4)


In [215]:
# Add an 'Abnormal' column to the model's frame: True for the rows stored in
# `abnormal_observations`, False elsewhere. Then persist the annotated frame.
Model_combined.df['Abnormal'] = False
Model_combined.df.loc[Model_combined.abnormal_observations.index, 'Abnormal'] = True
Model_combined.df_abnormal = Model_combined.df[Model_combined.df['Abnormal']]
Model_combined.df.to_csv('Results/combined_scores.csv')

# NOTE(review): no cell shown here writes combined_abnormal.csv - presumably a
# hand-labelled copy of the abnormal rows with a 'False Positive' column; confirm.
df_abnormal = pd.read_csv('Results/combined_abnormal.csv', index_col=0)

# Share of the rows in the labelled file that are marked as false positives,
# i.e. false positives / number of flagged observations (not / all data points).
false_positives = df_abnormal[df_abnormal['False Positive'] == True]
# An empty label file would otherwise raise ZeroDivisionError.
false_positive_rate = len(false_positives) / len(df_abnormal) if len(df_abnormal) else float('nan')
print('False positive rate: ', false_positive_rate)

False positive rate:  0.19047619047619047


In [175]:
import plotly.express as px


def plot_all_scores(Model1, scale):
    """Scatter the normality scores of the 'Entertain_Guests' activity.

    NOTE(review): this definition reuses the name of the four-argument
    ``plot_all_scores(fig, type, scale, row)`` from an earlier cell and
    replaces it once this cell has run; consider renaming one of the two.

    The rows of ``Model1.df`` whose ``Activity`` is ``'Entertain_Guests'`` are
    plotted against their index, coloured by the ``Data`` column (train blue,
    test red). Reference lines mark the median of ``Model1.train_scores`` and
    one to three scaled MADs below it.

    The earlier version also re-applied the test-score penalty and the
    ``median - 5*mad`` floor to ``Model1.df['normality_score']`` in place.
    That happened after the plotted subset had been copied out, so it never
    changed the figure, but it shifted the stored scores further down on
    every call. The model's frame is now left untouched.

    Parameters
    ----------
    Model1 : object
        Must expose ``df`` (with ``Activity``, ``normality_score`` and
        ``Data`` columns) and ``train_scores``.
        Assumes ``df`` has a datetime-like index, since one-day Timedeltas
        are added to its first and last entries - TODO confirm.
    scale : float
        Multiplier applied to ``1.48 * MAD`` of the training scores.

    Returns
    -------
    tuple
        ``(fig, Model1)``: a new ``go.Figure`` and the unmodified model.
    """
    df = Model1.df
    dff = df[df['Activity'] == 'Entertain_Guests']
    train_scores = Model1.train_scores

    plotly_colors = px.colors.qualitative.Plotly
    blue = plotly_colors[0]
    red = plotly_colors[1]

    median = np.median(train_scores)
    mad = scale * 1.48 * np.median(np.abs(train_scores - median))

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=dff.index, y=dff['normality_score'], mode='markers', showlegend=False,
                             marker=dict(color=dff['Data'].map({'Train': blue, 'Test': red}))))

    # Reference lines at the median and 1, 2, 3 MADs below it
    line_types = ['solid', 'dash', 'dashdot', 'dot']
    y_vals = [median, median-mad, median-2*mad, median-3*mad]
    for line_type, y_val in zip(line_types, y_vals):
        fig.add_shape(type='line', x0=dff.index[0], y0=y_val, x1=dff.index[-1], y1=y_val,
                      line=dict(color='grey', width=1, dash=line_type), opacity=0.8)

    # Pad the x range by one day on each side
    lower_x = dff.index[0] - pd.Timedelta(days=1)
    upper_x = dff.index[-1] + pd.Timedelta(days=1)
    fig.update_xaxes(range=[lower_x, upper_x])

    fig.update_yaxes(tickvals=[median-3*mad, median-2*mad, median-mad, median],
                     ticktext=['$-3 MAD$', '$-2 MAD$', '$- MAD$', '$Median$'])

    # Marker size
    fig.update_traces(marker=dict(size=5))

    return fig, Model1

from plotly.subplots import make_subplots
# plot_all_scores(Model1, scale) builds and returns its own figure. The
# two-panel subplot grid that used to be created here was discarded as soon as
# `fig` was rebound, so its axis titles never reached the displayed plot.
# The titles are now applied to the figure that is actually shown.
fig, Model_combined = plot_all_scores(Model_combined, 3)
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Normality score')
fig.show()
