In [34]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from CNDE import Models
from CNDE import perform_CNDE, test_ensemble
import pandas as pd
import numpy as np
import pickle

# Turn off user warnings
import warnings
warnings.filterwarnings('ignore')

train_path = 'df_train_frequency.csv'
test_path = 'df_test_frequency.csv'
contamination = 0.05
for i in range(3):
    print(f'Model_other begins: (k=3, contamination={contamination})')
    Model_freq = Models(train_path, k=3, contamination=contamination)
    Model_freq.instantiate_models()
    Model_freq = perform_CNDE(Model_freq)
    Model_freq = test_ensemble(Model_freq, test_path)

    with open(f'Models/Model_frequency_{i}.pkl', 'wb') as f:
        pickle.dump(Model_freq, f)

Model_other begins: (k=3, contamination=0.05)
Training ensemble...
Training data point: 30/30
 --------------------- 
Updating weights
 ---------------------
Model IsolationForest performance: 24.0/30. Weight: 1 -> 0.8
Model LocalOutlierFactor performance: 30.0/30. Weight: 1 -> 1.0
Model OneClassSVM performance: 7.0/30. Weight: 1 -> 0.23333333333333328
Model EllipticEnvelope performance: 28.0/30. Weight: 1 -> 0.9333333333333333
Training complete. 
------------------
------------------ 

Model IsolationForest ECS: [ 1.3403467   1.38934126  0.96629218  0.56692438  1.59504727  0.12484658
  1.60309679  2.5604673   1.68478229  0.71603861 -1.30622463  0.42103124
  0.12299412  0.46735112  0.83702481 -0.51085995  0.30785281 -1.32343691
 -0.83947562 -1.16190915  0.12276963  0.63505562 -0.38507629 -1.60381219
  0.63155094  0.15598677 -0.65984557 -0.80241635  1.63250274 -1.44025606
  1.53000769  1.18819279  0.48910094 -0.40506823 -0.14867324  0.59015657
  0.23707351 -0.30051611 -0.53930675 -2.034

In [14]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import pickle

# Turn off user warnings
import warnings
warnings.filterwarnings('ignore')

Model_freq_1 = pickle.load(open('Models/Model_frequency_1.pkl', 'rb'))

def compile_and_save(model, type):
    scores = model.normality_scores

    path_train = f'Unscaled/df_train_{type}.csv'
    path_test = f'Unscaled/df_test_{type}.csv'

    df_unscaled_train = pd.read_csv(path_train)
    train_scores = scores[:len(df_unscaled_train)]
    df_unscaled_test = pd.read_csv(path_test)

    # Concatenate train and test
    df_unscaled_both = pd.concat([df_unscaled_train, df_unscaled_test], axis=0)
    df_unscaled_both['normality_score'] = scores

    # Get new indices
    df_unscaled_both = df_unscaled_both.reset_index(drop=True)
    df_unscaled_both = df_unscaled_both.set_index('Date')

    # Convert to datetime
    df_unscaled_both.index = pd.to_datetime(df_unscaled_both.index)

    # Save to csv
    df_unscaled_both.to_csv(f'Results/df_{type}.csv', index=False)

    return df_unscaled_both, train_scores

df_unscaled_both, train_scores = compile_and_save(Model_freq_1, 'frequency')
type(df_unscaled_both.index[0])

pandas._libs.tslibs.timestamps.Timestamp

In [26]:
type = 'frequency'

Model0 = pickle.load(open(f'Models/Model_{type}_0.pkl', 'rb'))
Model1 = pickle.load(open(f'Models/Model_{type}_1.pkl', 'rb'))
Model2 = pickle.load(open(f'Models/Model_{type}_2.pkl', 'rb'))

print('Score lengths:', len(Model0.normality_scores), len(Model1.normality_scores), len(Model2.normality_scores))

Score lengths: 61 61 61


In [94]:
import plotly.express as px
import plotly.graph_objects as go

def plot_all_scores(type, scale):
    Model0 = pickle.load(open(f'Models/Model_{type}_0.pkl', 'rb'))
    df_unscaled_both, train_scores0 = compile_and_save(Model0, type)
    Model1 = pickle.load(open(f'Models/Model_{type}_1.pkl', 'rb'))
    df_unscaled_both, train_scores1 = compile_and_save(Model1, type)
    Model2 = pickle.load(open(f'Models/Model_{type}_2.pkl', 'rb'))
    df_unscaled_both2, train_scores2 = compile_and_save(Model2, type)

    # Average scores
    df_unscaled_both['normality_score'] = (df_unscaled_both['normality_score'] + df_unscaled_both2['normality_score'] + df_unscaled_both['normality_score'])/3
    train_scores = (train_scores1 + train_scores2 + train_scores0)/3

    df_unscaled_train = pd.read_csv(f'Unscaled/df_train_{type}.csv')

    Model1.df = df_unscaled_both
    Model1.train_scores = train_scores

    plotly_colors = px.colors.qualitative.Plotly
    blue = plotly_colors[0]
    red = plotly_colors[1]
    # Add colors to df_unscaled_both for test and train
    df_unscaled_both['Data'] = 'Train'
    df_unscaled_both.loc[len(df_unscaled_train):, 'Data'] = 'Test'

    scores = df_unscaled_both['normality_score']

    mean = np.mean(train_scores)
    std = np.std(train_scores)

    for i in range(len(scores)):
        if i < len(train_scores):
            continue
        if scores[i] < mean - 1*std:
            scores[i] = scores[i] - 1*std

    # Scores below -4 mad are set to -4 mad
    # scores[scores < median - 5*mad] = median - 5*mad

    df_unscaled_both['normality_score'] = scores

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=df_unscaled_both.index, y=df_unscaled_both['normality_score'], mode='markers', showlegend=False,
                                    marker=dict(color=df_unscaled_both['Data'].map({'Train': blue, 'Test': red}))))

 
    line_types = ['solid', 'dash', 'dashdot', 'dot']
    y_vals = [mean, mean-std, mean-2*std, mean-3*std]
    for i, y_val in enumerate(y_vals):
        fig.add_shape(type='line', x0=df_unscaled_both.index[0], y0=y_val, x1=df_unscaled_both.index[-1], y1=y_val,
                        line=dict(color='grey', width=1, dash=line_types[i]), opacity=0.8)


    # All values below -3 mad, add circle annotation around them
    # Get indices of values below -3 mad
    below_3_mad = df_unscaled_both[df_unscaled_both['normality_score'] < mean-3*std].index
    Model1.abnormal_observations = df_unscaled_both.loc[below_3_mad]
    below_3_scores = df_unscaled_both['normality_score'][below_3_mad]
    width = 0.75
    for i, score in zip(below_3_mad, below_3_scores):
        x0 = i - pd.Timedelta(days=width)
        x1 = i + pd.Timedelta(days=width)
        height = std/1.5

        fig.add_shape(type="circle",
            xref="x", yref="y",
            x0=x0, y0=score + height, x1=x1, y1=score - height,
            line=dict(
                color="LightSeaGreen", width=1.5
            ),
        )

    lower_x = df_unscaled_both.index[0] - pd.Timedelta(days=1)
    upper_x = df_unscaled_both.index[-1] + pd.Timedelta(days=1)
    fig.update_xaxes(range=[lower_x, upper_x])
    fig.update_traces(marker=dict(size=3))

    fig.update_yaxes(tickvals=[mean-3*std, mean-2*std, mean-std, mean],
                        ticktext=['$-3\sigma$', '$-2\sigma$', '$-\sigma$', '$\mu$'])

    # Dummy trace to add legend
    fig.add_trace(go.Scatter(x=[0], y=[0], mode='markers', marker=dict(color=blue), name='Training data'))
    fig.add_trace(go.Scatter(x=[0], y=[0], mode='markers', marker=dict(color=red), name='Testing data'))
    fig.update_layout(margin=dict(r=20, t=20))

    # Marker size
    fig.update_traces(marker=dict(size=8))

    fig.update_layout(coloraxis_showscale=False)
    # Legend horizontal orientation and top right position
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ))

    return fig, Model1

from plotly.subplots import make_subplots
fig, Model_freq = plot_all_scores('frequency', 0.8)


# Add axis titles
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Normality score')
fig.show()

# Save figure
fig.write_image("Results/anomalous_frequency.png", width=1200, height=500, scale=4)

In [92]:
for day in ['Weekday', 'Weekend']:
    average_day = pd.read_csv(f'Unscaled/average_{day}_day.csv')
    # average_day = pd.read_csv('Unscaled/average_weekday_day.csv')

    columns = average_day.columns[3:]
    activites = set([column.split(' ')[1] for column in columns])

    # Plot frequency on one axis and duration on another
    fig = make_subplots(cols=2, rows=1)

    # Iterate through each activity
    for activity in activites:
        # Get average frequency and duration
        average_frequency = average_day[f'Total {activity} frequency'].iloc[0]
        average_duration = average_day[f'Total {activity} duration'].iloc[0]

        # Plot
        fig.add_trace(
            go.Bar(
                x=[activity],
                y=[average_frequency],
                name=f'Total {activity} frequency',
                offsetgroup=0,
                marker_color='rgb(55, 83, 109)',
                showlegend=False,
            ),
            col=1, row=1,
        )

        fig.add_trace(
            go.Bar(
                x=[activity],
                y=[average_duration],
                name=f'Total {activity} duration',
                offsetgroup=1,
                marker_color='rgb(26, 118, 255)',
                showlegend=False,
            ),
            col=2, row=1,
        )

    # Set layout
    fig.update_layout( barmode='group')
    fig.update_xaxes(tickangle=-45, col=1, row=1)
    fig.update_xaxes(tickangle=-45, col=2, row=1)

    fig.update_xaxes(title_text='Activity', col=1, row=1)
    fig.update_xaxes(title_text='Activity', col=2, row=1)
    fig.update_yaxes(title_text='Change in frequency', col=1, row=1)
    fig.update_yaxes(title_text='Percentage increase in duration', col=2, row=1)
    fig.update_layout(margin=dict(t=20, r=20))

    fig.show()
    fig.write_image(f"Results/average_{day}.png", width=1200, height=350, scale=4)


In [99]:
def plot_day(example_day, title, date):
    print(title)
    average_day = pd.read_csv('Unscaled/average_day.csv')

    columns = example_day.columns[3:]
    activites = set([column.split(' ')[1] for column in columns])

    # Plot frequency on one axis and duration on another
    fig = make_subplots(cols=2, rows=1)

    # Iterate through each activity
    for activity in activites:
        # Get total frequency and duration
        total_frequency = example_day[f'Total {activity} frequency'].iloc[0]
        total_duration = example_day[f'Total {activity} duration'].iloc[0]

        # Get average frequency and duration
        average_frequency = average_day[f'Total {activity} frequency'].iloc[0]
        average_duration = average_day[f'Total {activity} duration'].iloc[0]

        # Get percentage difference
        diff_frequency = total_frequency - average_frequency
        diff_duration = (total_duration - average_duration) / average_duration * 100

        # Plot
        fig.add_trace(
            go.Bar(
                x=[activity],
                y=[diff_frequency],
                name=f'Total {activity} frequency',
                offsetgroup=0,
                marker_color='rgb(55, 83, 109)',
                showlegend=False,
            ),
            col=1, row=1,
        )

        fig.add_trace(
            go.Bar(
                x=[activity],
                y=[diff_duration],
                name=f'Total {activity} duration',
                offsetgroup=1,
                marker_color='rgb(26, 118, 255)',
                showlegend=False,
            ),
            col=2, row=1,
        )

    # Set layout
    fig.update_layout( barmode='group', margin=dict(t=30, r=20))
    fig.update_xaxes(tickangle=-45, col=1, row=1)
    fig.update_xaxes(tickangle=-45, col=2, row=1)

    fig.update_xaxes(title_text='Activity', col=1, row=1)
    fig.update_xaxes(title_text='Activity', col=2, row=1)
    fig.update_yaxes(title_text='Change in frequency', col=1, row=1)
    fig.update_yaxes(title_text='Percentage increase in duration', col=2, row=1)

    fig.show()
    fig.write_image(f"Results/{date}.png", width=1200, height=350, scale=4)


In [100]:
df_freq = pd.read_csv('Activity_frequency.csv')

abnormal_scores = Model_freq.abnormal_observations['normality_score']
abnormal_scores = abnormal_scores.sort_values(ascending=True)
abnormal_dates = abnormal_scores.index

# Plot all abnormal days
for dates in abnormal_dates:
    # Convert to string, only keep date
    date = str(dates).split(' ')[0]
    df_freq_day = df_freq[df_freq['Date'] == date]

    plot_title = f'Day: {date}, Normality score: {abnormal_scores[dates]}'

    plot_day(df_freq_day, plot_title, date)



Day: 2011-08-02, Normality score: -2.74434550284658


Day: 2011-08-04, Normality score: -2.649091580645578


Day: 2011-08-01, Normality score: -1.84563465393158


Day: 2011-07-31, Normality score: -1.4748297573237081


Day: 2011-08-06, Normality score: -1.2816234642129707


Day: 2011-07-24, Normality score: -1.0709779910700485


Day: 2011-08-03, Normality score: -0.8103795216982127


Day: 2011-07-19, Normality score: -0.6664619861614824
