In [None]:
import plotly.graph_objs as go
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Every result file is named "<prefix><model stem>_results.parquet.gzip"; the three
# path lists below differ only in their folder and file-name prefix.
_RESULT_SUFFIX = "_results.parquet.gzip"
_MODEL_STEMS = (
    "all-MiniLM-L12-v2",
    "all-mpnet-base-v2",
    "allenai-specter",
    "multi-qa-MiniLM-L6-cos-v1",
)

# Results for the original data.
EVALPATHS = [
    f"../data/{stem}{_RESULT_SUFFIX}" for stem in _MODEL_STEMS
]

# Results stored under Evaluation_Shuffled_Removed_50 ("rs50_" file prefix).
EVALPATHS_RS50 = [
    f"../data/Evaluation_Shuffled_Removed_50/rs50_{stem}{_RESULT_SUFFIX}"
    for stem in _MODEL_STEMS
]

# Results stored under Evaluation_Shuffled_Removed_75 ("rs75_" file prefix).
EVALPATHS_RS75 = [
    f"../data/Evaluation_Shuffled_Removed_75/rs75_{stem}{_RESULT_SUFFIX}"
    for stem in _MODEL_STEMS
]

# Display names used in printouts and plots; same order as the path lists above.
names = ["MiniLM-L12-v2", "mpnet-base-v2", "allenai-specter", "multi-qa-MiniLM-L6-cos-v1"]


In [None]:
def _load_frames(paths):
    """Read each parquet file in ``paths`` and return the frames in the same order."""
    return list(map(pd.read_parquet, paths))


# One list of result frames per data variant; positions line up with `names`.
dfs = _load_frames(EVALPATHS)
dfs_rs50 = _load_frames(EVALPATHS_RS50)
dfs_rs75 = _load_frames(EVALPATHS_RS75)

In [None]:
# Sanity check: number of frames loaded for the original data and the shape of the first one.
(len(dfs), dfs[0].shape)

## Original Data evaluation

In [None]:
# Inspect the first result frame: column index as plain text, then the leading rows.
first_frame = dfs[0]
print(first_frame.columns)
first_frame.head()

In [None]:
# Mean and median of `sim_score` for every frame in `dfs`, kept in the order of `names`.
avg_sim_scores = []
med_sim_scores = []
for frame in dfs:
    scores = frame["sim_score"]
    avg_sim_scores.append(scores.mean())
    med_sim_scores.append(scores.median())

# One small text report per model, each preceded by two separator lines.
for label, mean_score, median_score in zip(names, avg_sim_scores, med_sim_scores):
    print(
        "_" * 30,
        "-" * 45,
        label,
        f"Average: {mean_score}",
        f"Median: {median_score}",
        sep="\n",
    )


In [None]:
# For each model: how often every `found_n` value occurs, ordered by the value itself.
founds = []
for frame in dfs:
    value_frequencies = frame['found_n'].value_counts()
    founds.append(value_frequencies.sort_index())

# One line per model: x = `found_n` value, y = number of rows with that value.
fig = go.Figure(
    data=[
        go.Scatter(x=counts.index, y=counts.values, mode='lines+markers', name=label)
        for label, counts in zip(names, founds)
    ]
)

# Log-scaled y axis; fixed canvas size.
layout_options = dict(
    title='',
    xaxis_title='Rank',
    yaxis_title='Amount ',
    yaxis=dict(type='log'),
    width=1000,
    height=600,
    template='plotly_dark',
)
fig.update_layout(**layout_options)
fig.show()

In [None]:
# Tag every row with the display name of the model that produced it, then stack the
# per-model frames into one long table.
#
# `DataFrame.assign` returns a new frame, so the frames in `dfs` stay exactly as they
# were loaded. Writing the column into `dfs` in place would carry this lower-case
# 'model' column into every later frame built from `dfs`.
combined_df = pd.concat(
    [df.assign(model=names[i]) for i, df in enumerate(dfs)],
    ignore_index=True,
)

In [None]:
# One box per model showing the distribution of its `sim_score` values.
box_fig = go.Figure()

for model_name in combined_df['model'].unique():
    model_scores = combined_df.loc[combined_df['model'] == model_name, 'sim_score']
    box_fig.add_trace(go.Box(y=model_scores, name=model_name))

# No figure title is set; x tick labels are hidden, the trace names appear in the legend.
box_layout = dict(
    xaxis=dict(showticklabels=False),
    yaxis_title='Similarity Score',
    width=800,
    height=500,
    template='plotly_dark',
)
box_fig.update_layout(**box_layout)

box_fig.show()

In [None]:
# One violin per model (with inner box and mean line) over its `sim_score` values.
violin_traces = [
    go.Violin(
        y=combined_df.loc[combined_df['model'] == model_name, 'sim_score'],
        name=model_name,
        box_visible=True,
        meanline_visible=True,
    )
    for model_name in combined_df['model'].unique()
]
violin_fig = go.Figure(data=violin_traces)

# x tick labels are hidden; the trace names appear in the legend.
violin_layout = dict(
    title='Violin Plot of Similarity Scores',
    xaxis=dict(showticklabels=False),
    yaxis_title='Similarity Score',
    width=800,
    height=500,
    template='plotly_dark',
)
violin_fig.update_layout(**violin_layout)

violin_fig.show()

In [None]:
# Mean `sim_score` for every (update_year, model) pair, flattened back into columns.
avg_sim_scores = (
    combined_df
    .groupby(['update_year', 'model'])['sim_score']
    .mean()
    .reset_index()
)

# One line per model, in order of first appearance in the aggregated table.
fig = go.Figure()
for model_name in avg_sim_scores['model'].unique():
    yearly = avg_sim_scores.loc[avg_sim_scores['model'] == model_name]
    trace = go.Scatter(
        x=yearly['update_year'],
        y=yearly['sim_score'],
        mode='lines+markers',
        name=model_name,
    )
    fig.add_trace(trace)

# Titles and theme.
year_layout = dict(
    title='Average Similarity Scores Over Time by Model',
    xaxis_title='Update Year',
    yaxis_title='Average Similarity Score',
    template='plotly_dark',
)
fig.update_layout(**year_layout)

fig.show()

In [None]:
# Mean `sim_score` for every (super_category, model) pair, flattened back into columns.
avg_sim_scores = combined_df.groupby(['super_category', 'model'])['sim_score'].mean().reset_index()

# One line per model across the super categories.
fig = go.Figure()

for model in avg_sim_scores['model'].unique():
    model_data = avg_sim_scores[avg_sim_scores['model'] == model]
    fig.add_trace(go.Scatter(
        x=model_data['super_category'], y=model_data['sim_score'],
        mode='lines+markers', name=model
    ))

# The x axis is `super_category`, so the title names categories rather than time
# (the previous "Over Time" title was a copy of the yearly plot's title).
fig.update_layout(
    title='Average Similarity Scores by Super Category and Model',
    xaxis_title='super_category',
    yaxis_title='Average Similarity Score',
    yaxis=dict(type='log'),  # log-scaled y axis
    template='plotly_dark'
)

# Show the plot
fig.show()

In [None]:
import plotly.graph_objs as go
import pandas as pd
from plotly.subplots import make_subplots

# Label every result frame (in place) with its model name and the masking condition
# of the list it belongs to.
labelled_groups = (
    (dfs, 'Original'),
    (dfs_rs50, '50% Masking + Shuffling'),
    (dfs_rs75, '75% Masking + Shuffling'),
)
for frames, masking_label in labelled_groups:
    for model_name, frame in zip(names, frames):
        frame['Model'] = model_name
        frame['Masking'] = masking_label

# Stack all labelled frames (original first, then 50 %, then 75 %) into one table.
combined_df = pd.concat([*dfs, *dfs_rs50, *dfs_rs75], ignore_index=True)

# Subplot bookkeeping: one title per model.
num_models = len(names)
subplot_titles = list(names)


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One stacked panel per masking condition. Each panel draws, for every model, how often
# each `found_n` value occurs.
masking_types = ['Original', '50% Masking + Shuffling', '75% Masking + Shuffling']
fig = make_subplots(rows=len(masking_types), cols=1, subplot_titles=masking_types)

# Fixed colour per model so a model looks the same in every panel.
palette = ['#636EFA', '#EF553B', '#00CC96', '#FFA15A']
colors = dict(zip(names, palette))

for row, masktype in enumerate(masking_types, start=1):
    # Filter by masking condition once per panel instead of once per trace.
    condition_rows = combined_df[combined_df['Masking'] == masktype]
    for model in names:
        # Count once and reuse the result for both axes.
        rank_counts = (
            condition_rows.loc[condition_rows['Model'] == model, 'found_n']
            .value_counts()
            .sort_index()
        )
        fig.add_trace(
            go.Scatter(
                x=rank_counts.index,
                y=rank_counts.values,
                mode='lines+markers',
                name=model,
                marker_color=colors[model],
                # Same group in every panel: clicking the single legend entry
                # toggles the model's line in all three panels, not only the first.
                legendgroup=model,
                showlegend=(row == 1),  # one legend entry per model
            ),
            row=row, col=1
        )

# Apply identical axis settings to every panel. On a log axis plotly reads `range`
# as log10 exponents, so [0, 5] spans 1 .. 100000.
fig.update_xaxes(title_text='Rank')
fig.update_yaxes(type='log', title_text='Amount', range=[0, 5])

fig.update_layout(
    template='plotly_dark',
    height=1500,
    width=1000,
    margin=dict(l=20, r=20, t=100, b=20),
)

fig.show()


In [None]:
# 1. Distribution of Similarity Scores (Box Plot and Violin Plot)
# Box Plot: a 2x2 grid with one panel per model and one box per masking condition.
box_fig = make_subplots(rows=2, cols=2, subplot_titles=subplot_titles)

# (value in the 'Masking' column, label shown for the box)
box_conditions = [
    ('Original', 'No Masking'),
    ('50% Masking + Shuffling', '50% Masking + Shuffling'),
    ('75% Masking + Shuffling', '75% Masking + Shuffling'),
]

for position, model in enumerate(names):
    # Fill the grid row by row: positions 0, 1 -> first row; 2, 3 -> second row.
    grid_row, grid_col = divmod(position, 2)
    model_rows = combined_df[combined_df['Model'] == model]
    for masking_value, trace_label in box_conditions:
        scores = model_rows.loc[model_rows['Masking'] == masking_value, 'sim_score']
        box_fig.add_trace(
            go.Box(y=scores, name=trace_label, showlegend=False),
            row=grid_row + 1,
            col=grid_col + 1,
        )

box_layout = dict(
    title='Box Plot of Similarity Scores',
    yaxis_title='Similarity Score',
    template='plotly_dark',
    height=800,
    showlegend=False,
)
box_fig.update_layout(**box_layout)

box_fig.show()


In [None]:

# Violin Plot: a 2x2 grid with one panel per model and one violin per masking condition.
violin_fig = make_subplots(rows=2, cols=2, subplot_titles=subplot_titles)

# Value of the 'Masking' column -> label shown for the violin.
violin_labels = {
    'Original': 'No Masking',
    '50% Masking + Shuffling': '50% Masking + Shuffling',
    '75% Masking + Shuffling': '75% Masking + Shuffling',
}

for slot, model in enumerate(names):
    # Row-major placement of the model's panel inside the grid.
    cell = dict(row=slot // 2 + 1, col=slot % 2 + 1)
    for masking_value, label in violin_labels.items():
        selected = combined_df[
            (combined_df['Model'] == model) & (combined_df['Masking'] == masking_value)
        ]
        violin_fig.add_trace(
            go.Violin(
                y=selected['sim_score'],
                name=label,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
            ),
            **cell,
        )

violin_layout = dict(
    title='Violin Plot of Similarity Scores',
    xaxis_title='Masking',
    yaxis_title='Similarity Score',
    template='plotly_dark',
    height=800,
)
violin_fig.update_layout(**violin_layout)

violin_fig.show()


In [None]:

# 2. Trend of Average Similarity Scores Over Time (Line Plot)
# Mean `sim_score` per (update_year, Model, Masking) combination, as a flat table.
yearly_groups = combined_df.groupby(['update_year', 'Model', 'Masking'])
avg_sim_scores = yearly_groups['sim_score'].mean().reset_index()
display(avg_sim_scores)


In [None]:
# 2x2 grid, one panel per model; each panel shows the yearly mean similarity score
# for the three masking conditions.
line_fig = make_subplots(rows=2, cols=2, subplot_titles=subplot_titles)

# Line colour per masking condition (keys are values of the 'Masking' column).
colors = {
    'Original': '#636EFA',
    '50% Masking + Shuffling': '#EF553B',
    '75% Masking + Shuffling': '#00CC96'
}

# Legend text per masking condition.
legend_labels = {
    'Original': 'No Masking',
    '50% Masking + Shuffling': '50% Masking + Shuffling',
    '75% Masking + Shuffling': '75% Masking + Shuffling'
}

for i, model in enumerate(names):
    model_scores = avg_sim_scores[avg_sim_scores['Model'] == model]
    for masking, color in colors.items():
        condition_scores = model_scores[model_scores['Masking'] == masking]
        line_fig.add_trace(go.Scatter(
            x=condition_scores['update_year'], y=condition_scores['sim_score'],
            mode='lines+markers', name=legend_labels[masking], marker_color=color,
            # Shared group: the single legend entry toggles this condition in all panels.
            legendgroup=masking,
            showlegend=i == 0  # one legend entry per condition
        ), row=i // 2 + 1, col=i % 2 + 1)

# Label the axes of every panel; `xaxis_title` / `yaxis_title` in the layout would only
# reach the first subplot.
line_fig.update_xaxes(title_text='Update Year')
line_fig.update_yaxes(title_text='Average Similarity Score')

# The figure covers all models, so the title must not mention a single one. The old
# title interpolated the loop variable left over after the loop, i.e. always the last model.
line_fig.update_layout(
    title='Average Similarity Scores per Year',
    template='plotly_dark',
    height=800,
)
line_fig.show()






In [None]:
# Bar colour per masking condition; the keys double as the legend names below.
colors = {
    'Original': '#636EFA',
    '50% Masking + Shuffling': '#EF553B',
    '75% Masking + Shuffling': '#00CC96'
}

# 3. Category-wise Comparison (Bar Plot)
# Mean `sim_score` per (super_category, Model, Masking) combination, as a flat table.
category_avg_scores = (
    combined_df
    .groupby(['super_category', 'Model', 'Masking'])['sim_score']
    .mean()
    .reset_index()
)

# 2x2 grid, one panel per model, bars grouped by masking condition.
bar_fig = make_subplots(rows=2, cols=2, subplot_titles=names)

for position, model in enumerate(names):
    grid_row = position // 2 + 1
    grid_col = position % 2 + 1
    is_model = category_avg_scores['Model'] == model
    for masking_label, bar_color in colors.items():
        subset = category_avg_scores[is_model & (category_avg_scores['Masking'] == masking_label)]
        bar_fig.add_trace(
            go.Bar(
                x=subset['super_category'],
                y=subset['sim_score'],
                name=masking_label,
                marker_color=bar_color,
                showlegend=position == 0,  # legend entries come from the first panel only
            ),
            row=grid_row,
            col=grid_col,
        )

bar_layout = dict(
    title='Average Similarity Scores per Category',
    xaxis_title='Category',
    yaxis_title='Average Similarity Score',
    template='plotly_dark',
    height=800,
    barmode='group',
)
bar_fig.update_layout(**bar_layout)

bar_fig.show()