In [2]:
import numpy as np
from sklearn.svm import OneClassSVM
from OCluDAL import OCluDAL

from sklearn.svm import OneClassSVM
# Location of the compiled dataset
path = 'PreProcessingClassification\\USC\\CompiledData.csv'
OC = OCluDAL(path, annotations=200)

OC.initialise_data()
OC.preprocessing()

# Novelty detection: fit one OCSVM per known label and record which
# unlabelled samples that model rejects (prediction == -1).
masks = [
    OneClassSVM()
    .fit(OC.labelled_X_new[OC.labelled_y_new == label])
    .predict(OC.unlabelled_X_new) == -1
    for label in OC.unique_labels
]

# A sample counts as novel only when every per-label model rejects it.
novel_mask = np.all(masks, axis=0)
novel_X = OC.unlabelled_X_new[novel_mask]
print(f"Novelty detected: {len(novel_X)}")

FileNotFoundError: [Errno 2] No such file or directory: 'PreProcessingClassification\\USC\\CompiledData.csv'

In [None]:
import pandas as pd

from sklearn.metrics import silhouette_score
from sklearn.cluster import AffinityPropagation

def fine_tune_affinity_propagation(OC_original, novel_X, damping_range, preference_range, max_annotations=600):
    """Grid-search the Affinity Propagation damping/preference parameters.

    For every (damping, preference) pair a fresh ``OC_original.copy()`` is
    taken and the following round is repeated until the labelled set holds
    ``max_annotations`` samples or no novel samples remain:

    1. One OneClassSVM is fitted per known label; unlabelled samples rejected
       (prediction == -1) by every model are treated as novel.
    2. Affinity Propagation clusters the novel samples; its cluster centres
       are passed to ``OC.oracle_annotations`` for labelling.
    3. The silhouette score of that clustering is accumulated.

    Parameters
    ----------
    OC_original : OCluDAL object
        Initialised and preprocessed object. It is copied for every
        combination; this function uses its ``copy``, ``labelled_X_new``,
        ``labelled_y_new``, ``unlabelled_X_new``, ``unique_labels`` and
        ``oracle_annotations`` members.
    novel_X : numpy array
        Unused. Kept for backward compatibility; the novel set is recomputed
        from the OCluDAL copy on every round.
    damping_range : list
        Damping values to try.
    preference_range : list
        Preference values to try.
    max_annotations : int, default 600
        Labelled-set size at which the annotation loop stops.

    Returns
    -------
    scores_array : numpy array
        Shape ``(len(damping_range), len(preference_range))``. Mean silhouette
        score per clustering round, multiplied by
        ``max_annotations / len(labelled set)``. NaN where the combination
        failed or produced no clustering round.
    """

    n_combinations = len(damping_range) * len(preference_range)
    scores_array = np.zeros((len(damping_range), len(preference_range)))
    annotations_array = np.zeros((len(damping_range), len(preference_range)))
    records = []

    for di, damping in enumerate(damping_range):
        for pi, preference in enumerate(preference_range):
            OC = OC_original.copy()

            print(f"Progress: {di * len(preference_range) + pi + 1}/{n_combinations}, Damping: {damping}  |Preference: {preference}")

            sum_scores = 0
            annotation_lengths = []  # one entry per clustering round
            try:
                while len(OC.labelled_X_new) < max_annotations:
                    # Novelty detection using one OCSVM per known label
                    masks = [
                        OneClassSVM()
                        .fit(OC.labelled_X_new[OC.labelled_y_new == label])
                        .predict(OC.unlabelled_X_new) == -1
                        for label in OC.unique_labels
                    ]
                    novel_mask = np.all(masks, axis=0)
                    novel_points = OC.unlabelled_X_new[novel_mask]

                    if len(novel_points) == 0:
                        break

                    # Clustering to select representative samples for annotation
                    ap = AffinityPropagation(damping=damping, preference=preference)
                    ap.fit(novel_points)
                    representative_X = ap.cluster_centers_
                    annotation_lengths.append(len(representative_X))
                    sum_scores += silhouette_score(novel_points, ap.labels_)

                    # Map every cluster centre back to its row in the unlabelled set
                    representative_indices = [
                        np.where((OC.unlabelled_X_new == sample).all(axis=1))[0][0]
                        for sample in representative_X
                    ]

                    # Update labelled and unlabelled sets
                    OC.oracle_annotations(representative_indices)

                n_rounds = len(annotation_lengths)
                if n_rounds == 0:
                    # Nothing was clustered, so there is no score to report.
                    orig_score = score = avg = weighted_score = np.nan
                else:
                    # Average over the rounds that actually produced a score
                    # (a final round without novel samples is not counted),
                    # then standardise by the labelled-set size reached.
                    orig_score = sum_scores / n_rounds
                    score = orig_score * max_annotations / len(OC.labelled_X_new)

                    # Average number of samples sent for annotation per round
                    avg = np.mean(annotation_lengths)

                    # Reward for lower average annotations
                    weighted_score = score * (1 - avg / max_annotations)

            except Exception as err:
                # Best effort: a failing combination is recorded as NaN and the
                # sweep continues. KeyboardInterrupt is deliberately not caught.
                print(f"Combination failed ({type(err).__name__}: {err})")
                orig_score = score = avg = weighted_score = np.nan

            records.append({'Damping': damping,
                            'Preference': preference,
                            'Original Score': orig_score,
                            'Score': score,
                            'Weighted Score': weighted_score,
                            'Annotations': len(OC.labelled_X_new),
                            'Avg Annotations': avg})

            scores_array[di, pi] = score
            annotations_array[di, pi] = avg

            print(f"Annotations: {len(OC.labelled_X_new)}  |Score: {score}  |Avg annotations: {avg}")
            print('____________________________________________________')

    # Built once at the end instead of concatenating inside the loop.
    df = pd.DataFrame(records, columns=['Damping', 'Preference', 'Original Score', 'Score',
                                        'Weighted Score', 'Annotations', 'Avg Annotations'])
    # df.to_csv('fine_tune_affinity_propagation.csv', index=False)

    return scores_array

# Damping 0.50 ... 0.95 in steps of 0.05; preference -200 ... -10 in steps of 10
damping_range = [hundredths / 100 for hundredths in range(50, 100, 5)]
preference_range = list(range(-200, 0, 10))

scores_array = fine_tune_affinity_propagation(
    OC,
    novel_X,
    damping_range,
    preference_range,
)

In [4]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np

from plotly.subplots import make_subplots

RESULTS_CSV = 'fine_tune_affinity_propagation.csv'
MAX_DAMPING = 0.9            # damping values above this are excluded
MAX_AVG_ANNOTATIONS = 200    # combinations averaging more annotations are excluded
N_HIGHLIGHT = 2              # number of best cells outlined in red per panel
HALF_CELL_PREFERENCE = 5     # half the spacing between preference values
HALF_CELL_DAMPING = 0.025    # half the spacing between damping values


def _add_score_heatmap(fig, results, value_column, col):
    """Draw `value_column` as a Damping x Preference heatmap in subplot column `col`.

    The grid is built with a pivot, so combinations removed by filtering show
    up as empty cells instead of breaking a fixed-shape reshape. The
    N_HIGHLIGHT highest-valued cells are outlined in red.
    """
    grid = results.pivot(index='Damping', columns='Preference', values=value_column)
    fig.add_trace(
        go.Heatmap(
            z=grid.to_numpy(),
            x=grid.columns.to_numpy(),
            y=grid.index.to_numpy(),
            colorscale='Viridis'),
        row=1, col=col)

    for _, best in results.nlargest(N_HIGHLIGHT, value_column).iterrows():
        fig.add_shape(
            type="rect",
            xref="x",
            yref="y",
            x0=best['Preference'] - HALF_CELL_PREFERENCE,
            y0=best['Damping'] - HALF_CELL_DAMPING,
            x1=best['Preference'] + HALF_CELL_PREFERENCE,
            y1=best['Damping'] + HALF_CELL_DAMPING,
            line=dict(
                color="Red",
                width=2
            ),
            row=1, col=col
        )


df = pd.read_csv(RESULTS_CSV)
# Keep only damping <= MAX_DAMPING and average annotations <= MAX_AVG_ANNOTATIONS
df = df[(df['Damping'] <= MAX_DAMPING) & (df['Avg Annotations'] <= MAX_AVG_ANNOTATIONS)].copy()

# Weighted score: squared score per average annotation
df['Weighted Score'] = df['Score'] ** 2 / df['Avg Annotations']

# Latex formatting (the second title matches the formula computed above)
subtitle1 = r'$\text{Average silhouette score: } S_i$'
subtitle2 = r'$\text{Weighted average silhouette score: } \frac{S_i^2}{\text{Avg Annotations}}$'

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'heatmap'}, {'type': 'heatmap'}]], subplot_titles=(subtitle1, subtitle2))

_add_score_heatmap(fig, df, 'Score', col=1)
_add_score_heatmap(fig, df, 'Weighted Score', col=2)

fig.update_yaxes(title_text='Damping value', row=1, col=1)
fig.update_xaxes(title_text='Preference value', row=1, col=1)
fig.update_yaxes(title_text='Damping value', row=1, col=2)
fig.update_xaxes(title_text='Preference value', row=1, col=2)

# Don't show colorbar
fig.update(layout_coloraxis_showscale=False)
fig.update_traces(showscale=False)

fig.update_layout(font_size=15, margin=dict(l=20, r=20, t=30, b=20))
fig.show()

# Save png
fig.write_image('AP_heatmap.png', width=1500, height=400, scale=4)

In [None]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np

N_HIGHLIGHT = 2  # number of best-scoring cells outlined in red

df = pd.read_csv('fine_tune_affinity_propagation.csv')
# Keep damping values up to 0.9 and average annotations up to 200.
# .copy() so the column assignment below works on an independent frame.
df = df[(df['Damping'] <= 0.9) & (df['Avg Annotations'] <= 200)].copy()

# Calculate weighted score
df['Weighted Score'] = df['Score'] ** 2 / df['Avg Annotations']

# Plot heatmap of scores. The pivot builds the Damping x Preference grid by
# value, so combinations removed by the filters become empty cells instead of
# breaking a fixed-shape reshape.
score_grid = df.pivot(index='Damping', columns='Preference', values='Score')

fig = go.Figure(data=go.Heatmap(
                        z=score_grid.to_numpy(),
                        x=score_grid.columns.to_numpy(),
                        y=score_grid.index.to_numpy(),
                        colorscale='Viridis'))

fig.update_layout(
    xaxis_nticks=36,
    yaxis={'title': 'Damping value'},
    xaxis={'title': 'Preference value'}
)

# Highlight the best-scoring cells with a red border (one grid cell wide/high)
for _, best in df.nlargest(N_HIGHLIGHT, 'Score').iterrows():
    fig.add_shape(
        type="rect",
        xref="x",
        yref="y",
        x0=best['Preference'] - 5,
        y0=best['Damping'] - 0.025,
        x1=best['Preference'] + 5,
        y1=best['Damping'] + 0.025,
        line=dict(
            color="Red",
            width=2
        )
    )

fig.show()

In [21]:
from OCluDAL import OCluDAL
import numpy as np
import pandas as pd

path = 'PreProcessing\\USC\\CompiledData_7.csv'
df = pd.read_csv(path)

# Candidate rows for the initial annotations: 'Standing' or 'Walking Forward'
is_seed_activity = df['Label'].isin(['Standing', 'Walking Forward'])
Indices = df.loc[is_seed_activity].index

annotations = 10
indices = np.random.choice(Indices, annotations, replace=False)

# (damping, preference) pairs to verify, keyed by the output-file prefix
damping_pref_tuples = {
    'combination1': (0.75, -190),
    'combination2': (0.75, -180),
    'combination3': (0.8, -40),
    'combination4': (0.6, -40)
}

# Every combination starts from the same initial annotation indices
for key in damping_pref_tuples:
    damping, pref = damping_pref_tuples[key]

    OC = OCluDAL(path, annotations, damping=damping, preference=pref)
    OC.initialise_data(indices=indices, output_path=f'{key}_heatmap_verification.csv')
    OC.preprocessing()
    OC.step1(max_iter=1)
    clf = OC.step2(max_iter=500, n=10, max_samples=1000)


Total data: 3667
Annotations: 10
Preprocessing data: Applying StandardScaler
Unique labels:  ['Standing' 'Walking Forward']
Iteration 1
Novelty detected: 3606
Representative samples chosen for annotation: 129
Starting uncertainty sampling and model training
Total data: 366700     |Labelled data size: 999  |Unlabelled data size: 2668
Annotations: 10
Preprocessing data: Applying StandardScaler
Unique labels:  ['Standing' 'Walking Forward']
Iteration 1
Novelty detected: 3606
Representative samples chosen for annotation: 134
Starting uncertainty sampling and model training
Total data: 366700     |Labelled data size: 994  |Unlabelled data size: 2673
Annotations: 10
Preprocessing data: Applying StandardScaler
Unique labels:  ['Standing' 'Walking Forward']
Iteration 1
Novelty detected: 3606
Representative samples chosen for annotation: 490
Starting uncertainty sampling and model training
Total data: 366700     |Labelled data size: 990  |Unlabelled data size: 2677
Annotations: 10
Preprocessing


Affinity propagation did not converge, this model may return degenerate cluster centers and labels.



Representative samples chosen for annotation: 492
Starting uncertainty sampling and model training
Iteration 49  /500     |Labelled data size: 992  |Unlabelled data size: 2675

In [16]:
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

import plotly.express as px

folder = 'Results/heatmap_validation'

pastel = px.colors.qualitative.Pastel

# Trace colour, keyed by the file-name prefix
color_key = {
    'combination1': pastel[0],
    'combination2': pastel[1],
    'combination3': pastel[2],
    'combination4': pastel[3],
}

# Legend entry, keyed by the full file name
name_key = {
    'combination1_heatmap_verification.csv': 'Damping = 0.75, Preference = -190',
    'combination2_heatmap_verification.csv': 'Damping = 0.75, Preference = -180',
    'combination3_heatmap_verification.csv': 'Damping = 0.8, Preference = -40',
    'combination4_heatmap_verification.csv': 'Damping = 0.6, Preference = -40',
}

# Only plot the known result files (anything else in the folder is ignored);
# sorted so the legend order does not depend on the directory listing order.
files = sorted(f for f in os.listdir(folder) if f in name_key)

fig = go.Figure()

for file in files:
    df = pd.read_csv(os.path.join(folder, file))
    color = color_key[file.split('_')[0]]

    fig.add_trace(go.Scatter(
        x=df['Number of Annotations'],
        y=df['Accuracy'],
        name=name_key[file],
        line=dict(color=color, width=3)),
    )

# The zoomed-in second panel is not drawn: the figure is a single plot, so
# traces addressed with row/col cannot be added to it.

# Update axis limits and set axis labels
fig.update_yaxes(range=[0, 1])
fig.update_xaxes(range=[0, 600])
fig.update_xaxes(title_text='Number of Annotations')
fig.update_yaxes(title_text='Accuracy')

# Horizontal legend
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

# Set font size
fig.update_layout(font_size=20,
                  margin=dict(l=20, r=20, t=30, b=20))

fig.show()
fig.write_image('heatmap_validation.png', width=1000, height=800, scale=3)


In [22]:
from OCluDAL import OCluDAL
import numpy as np
import pandas as pd

path = 'PreProcessing\\USC\\CompiledData_7.csv'
df = pd.read_csv(path)
# Get the indices of the rows with the label of 'Standing' or 'Walking Forward'
Indices = df[(df['Label'] == 'Standing') | (df['Label'] == 'Walking Forward')].index

annotations = 10
indices = np.random.choice(Indices, annotations, replace=False)
n = 5

damping = 0.75
pref = -180

# i is the repetition number; n_step1_iter is the max_iter given to step1.
# The loop variable is deliberately not called `iter`: at notebook level that
# would rebind the builtin for every cell executed afterwards.
for i in range(4):

    for n_step1_iter in range(4):

        OC = OCluDAL(path, annotations, damping=damping, preference=pref)
        OC.initialise_data(indices=indices, model_type='SVM-linear',
                           output_path=f'ap_iter_validation/{n_step1_iter}_{i}_iter_verification_linear.csv')
        OC.preprocessing()
        OC.step1(max_iter=n_step1_iter)
        clf = OC.step2(max_iter=500, n=n, max_samples=1000)




Total data: 3667
Annotations: 10
Preprocessing data: Applying StandardScaler
Unique labels:  ['Standing' 'Walking Forward']
Starting uncertainty sampling and model training
Total data: 3667500     |Labelled data size: 995  |Unlabelled data size: 2672
Annotations: 10
Preprocessing data: Applying StandardScaler
Unique labels:  ['Standing' 'Walking Forward']
Iteration 1
Novelty detected: 3601
Representative samples chosen for annotation: 132
Starting uncertainty sampling and model training
Total data: 3667500     |Labelled data size: 997  |Unlabelled data size: 2670
Annotations: 10
Preprocessing data: Applying StandardScaler
Unique labels:  ['Standing' 'Walking Forward']
Iteration 1
Novelty detected: 3601
Representative samples chosen for annotation: 132
Iteration 2
Novelty detected: 3283
Representative samples chosen for annotation: 114
Starting uncertainty sampling and model training
Total data: 3667500     |Labelled data size: 996  |Unlabelled data size: 2671
Annotations: 10
Preprocess

In [85]:
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

import plotly.express as px

folder = 'Results/ap_iter_validation'

# Sorted so the run slices taken below select the same files on every
# platform (os.listdir returns entries in arbitrary order).
files = sorted(os.listdir(folder))

pastel = px.colors.qualitative.Pastel

# Trace colour, keyed by the file-name prefix
color_key = {
    '0' : pastel[0],
    '1' : pastel[1],
    '2' : pastel[2],
    '3' : pastel[3],
}


def _average_runs(run_frames):
    """Average the numeric columns of repeated runs at each annotation count."""
    return (
        pd.concat(run_frames)
        .groupby('Number of Annotations')
        .mean(numeric_only=True)
        .reset_index()
    )


# Group the 'linear' result files by the prefix of the file name
runs_by_prefix = {prefix: [] for prefix in color_key}
for file in files:
    if 'linear' not in file:
        continue
    prefix = file.split('_')[0]
    if prefix in runs_by_prefix:
        runs_by_prefix[prefix].append(pd.read_csv(os.path.join(folder, file)))

iter_0 = _average_runs(runs_by_prefix['0'])
iter_1 = _average_runs(runs_by_prefix['1'])
# NOTE(review): only two of the runs are averaged for prefixes 2 and 3; the
# reason for leaving the others out is not recorded here - confirm.
iter_2 = _average_runs(runs_by_prefix['2'][2:4])
iter_3 = _average_runs(runs_by_prefix['3'][0:2])

# NOTE(review): manual adjustment of results - the iter_0 accuracy is scaled by
# 0.985 for every point above 200 annotations. Justify or remove before
# publishing the figure.
iter_0.loc[iter_0['Number of Annotations'] > 200, 'Accuracy'] *= 0.985

fig = make_subplots(cols=2, rows=1,
                    column_widths=[0.7, 0.3])

for i, df in zip([3, 2, 1, 0], [iter_3, iter_2, iter_1, iter_0]):
    color = color_key[str(i)]

    # Accuracy curve for this number of iterations
    fig.add_trace(
        go.Scatter(
            x=df['Number of Annotations'],
            y=df['Accuracy'],
            name=f'Iteration {i}',
            line=dict(color=color, width=3)
        ),
        row=1, col=1
    )

    if i != 0:
        # Grey dotted vertical line at row i of the averaged curve, used to
        # indicate when BvSB is used
        marker_x = df.loc[i, 'Number of Annotations']
        fig.add_shape(
            type="line",
            xref="x",
            yref="y",
            x0=marker_x,
            y0=0,
            x1=marker_x,
            y1=df.loc[i, 'Accuracy'],
            line=dict(
                color="Grey",
                width=1,
                dash="dot",
            ),
            row=1, col=1
        )
        # Vertical text label next to the line
        fig.add_annotation(
            x=marker_x + 15,
            y=0.4,
            text=f'{i} Iterations',
            textangle=-90,
            showarrow=False,
            font=dict(
                size=14,
                color="Grey"
            ),
            row=1, col=1
        )

    # Classes panel: work on a copy so the averaged frames are not modified
    classes_df = df.copy()
    if i == 0:
        # NOTE(review): manual adjustment - the x-axis of the iteration-0 curve
        # is stretched by 230/160 in this panel. Justify or remove.
        scale = 230/160
        classes_df['Number of Annotations'] = classes_df['Number of Annotations'] * scale

    classes_df = classes_df[classes_df['Number of Annotations'] <= 500]
    fig.add_trace(go.Scatter(
        x=classes_df['Number of Annotations'],
        y=classes_df['Classes'],
        showlegend=False,
        line=dict(color=color, width=3)),
        row=1,
        col=2)

# Axis limits and labels; each panel gets the title of the quantity it shows
fig.update_yaxes(range=[0, 1], row=1, col=1)
fig.update_xaxes(range=[0, 1000], row=1, col=1)
fig.update_xaxes(title_text='Number of Annotations')
fig.update_yaxes(title_text='Accuracy', row=1, col=1)
fig.update_yaxes(title_text='Classes/ Activities discovered', row=1, col=2)

# Horizontal legend
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

fig.update_layout(font_size=15)
fig.show()
fig.write_image('iter_validation.png', width=1500, height=600, scale=3)

In [23]:
import numpy as np
import pandas as pd

from OCluDAL import OCluDAL

path = 'PreProcessing\\USC\\CompiledData_7.csv'
df = pd.read_csv(path)
# Candidate rows for the initial annotations: 'Standing' or 'Walking Forward'
candidate_indices = df[(df['Label'] == 'Standing') | (df['Label'] == 'Walking Forward')].index

annotations = 10
n = 5

damping = 0.75
pref = -180

for i in range(5):
    # Draw a fresh set of initial indices from the full candidate pool. The
    # pool is kept in its own variable: overwriting it with the sample would
    # make every later repetition re-draw from the same ten rows.
    indices = np.random.choice(candidate_indices, annotations, replace=False)

    # All sampling strategies of one repetition start from the same indices
    for sampling_type in ('Entropy', 'Random', 'BvSB'):
        OC = OCluDAL(path, annotations, damping=damping, preference=pref)
        OC.initialise_data(indices=indices,
                           output_path=f'sampling_validation/{sampling_type}_{i}_sampling_verification.csv')
        OC.preprocessing()
        OC.step1(max_iter=0)
        clf = OC.step2(max_iter=1000, n=n, max_samples=1000, sampling_type=sampling_type)

        del OC

Total data: 3667
Annotations: 10
Preprocessing data: Applying StandardScaler
Unique labels:  ['Standing' 'Walking Forward']
Starting uncertainty sampling and model training
Total data: 36671000     |Labelled data size: 995  |Unlabelled data size: 2672
Annotations: 10
Preprocessing data: Applying StandardScaler
Unique labels:  ['Standing' 'Walking Forward']
Starting uncertainty sampling and model training
Total data: 36671000     |Labelled data size: 995  |Unlabelled data size: 2672
Annotations: 10
Preprocessing data: Applying StandardScaler
Unique labels:  ['Standing' 'Walking Forward']
Starting uncertainty sampling and model training
Total data: 36671000     |Labelled data size: 995  |Unlabelled data size: 2672
Annotations: 10
Preprocessing data: Applying StandardScaler
Unique labels:  ['Standing' 'Walking Forward']
Starting uncertainty sampling and model training
Total data: 36671000     |Labelled data size: 995  |Unlabelled data size: 2672
Annotations: 10
Preprocessing data: Applyin

In [76]:
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

import plotly.express as px

folder = 'Results/sampling_validation'

# Get all files in the results folder
files = os.listdir(folder)

pastel = px.colors.qualitative.Pastel

# Trace colour, keyed by the sampling method (the file-name prefix)
color_key = {
    'BvSB' : pastel[0],
    'Entropy' : pastel[1],
    'Random' : pastel[2],
}


def _mean_over_runs(run_frames):
    """Average the numeric columns of repeated runs at each annotation count."""
    return (
        pd.concat(run_frames)
        .groupby('Number of Annotations')
        .mean(numeric_only=True)
        .reset_index()
    )


# Collect the repeated runs of every method; files with another prefix are
# skipped without being read.
runs_by_method = {method: [] for method in color_key}
for file in files:
    method = file.split('_')[0]
    if method in runs_by_method:
        runs_by_method[method].append(pd.read_csv(os.path.join(folder, file)))

# Average across the repeated runs of each method
BvSB = _mean_over_runs(runs_by_method['BvSB'])
Entropy = _mean_over_runs(runs_by_method['Entropy'])
Random = _mean_over_runs(runs_by_method['Random'])

fig = make_subplots(cols=2, rows=1, column_widths=[0.6, 0.4])

for method, df in zip(['BvSB', 'Entropy', 'Random'], [BvSB, Entropy, Random]):
    color = color_key[method]

    # Left panel: accuracy against number of annotations
    fig.add_trace(go.Scatter(
        x=df['Number of Annotations'],
        y=df['Accuracy'],
        name=method,
        line=dict(color=color, width=3)),
        row=1,
        col=1)

    # Right panel: 'Classes' column, first 500 annotations only
    classes_df = df[df['Number of Annotations'] <= 500]
    fig.add_trace(go.Scatter(
        x=classes_df['Number of Annotations'],
        y=classes_df['Classes'],
        showlegend=False,
        line=dict(color=color, width=3)),
        row=1,
        col=2)


# Update ylim and set axis labels
fig.update_yaxes(range=[0, 1], row=1, col=1)
fig.update_xaxes(title_text='Number of Annotations')
fig.update_yaxes(title_text='Accuracy', row=1, col=1)
fig.update_yaxes(title_text='Classes/ Activities discovered', row=1, col=2)

# Horizontal legend
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

fig.update_layout(font_size=15)
fig.show()
fig.write_image('sampling_validation.png', width=1500, height=600, scale=3)