In [69]:
import os
import pandas as pd
import pathlib
# Human-readable headers for the metric columns rendered in the LaTeX tables
metrics_pretty_names = {
    'accuracy': 'Accuracy',
    'average_total_throughput': 'Average Total Throughput',
    'standard_deviation_total_throughput': 'Standard Deviation Total Throughput',
    'tpr': 'TPR',
    'fpr': 'FPR',
    'average_throughput': 'Average Throughput',
    'standard_deviation_throughput': 'Standard Deviation Throughput',
    'model': 'Model',
}

# Models to keep from the results files
MODELS = [
    'Random Forest',
    'Decision Tree',
    'MLP',
    'Adaboost',
    'Naive Bayes',
    'Quadratic Discriminant Analysis',
    'Linear Discriminant Analysis',
    'Logistic Regression',
    'RNN',
    'GRU',
    'Transformer',
    "Transformer*",
]

# Make sure the output directory for generated files exists
pathlib.Path('views/').mkdir(parents=True, exist_ok=True)

# Load the data (one JSON record per line) and stack both result files
df = pd.read_json('data/results.json', lines=True)
df_fids = pd.read_json('data/results_fids.json', lines=True)
df = pd.concat([df, df_fids], axis=0)

# Take an explicit copy of the filtered rows: columns of `df` are assigned to
# afterwards, and assigning into a boolean-filtered frame raises
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
df = df[df['model'].isin(MODELS)].copy()
def percentage_transform(df,column):
    """Replace a numeric ratio column with LaTeX-ready percentage strings, in place.

    Each value is multiplied by 100 and formatted with two decimals followed by
    an escaped percent sign (``\\%``). The frame is modified in place and
    nothing is returned.

    Args:
        df: DataFrame holding the column to convert.
        column: Name of the column to overwrite.
    """
    def as_latex_percent(value):
        return f'{value:.2f}\\%'

    scaled = df[column] * 100
    df[column] = scaled.map(as_latex_percent)
# Render the ratio metrics as percentage strings
for _ratio_column in ('accuracy', 'tpr', 'fpr'):
    percentage_transform(df, _ratio_column)

# Parse output_date so rows can be ordered chronologically
df['output_date'] = pd.to_datetime(df['output_date'])

# Newest rows first, so the first occurrence of a key is its latest result
df_sorted = df.sort_values(by='output_date', ascending=False)

# Keep only the latest row per (model, number_packets) pair
df_latest = df_sorted.drop_duplicates(subset=['model', 'number_packets'], keep='first')

# Split the rows by whether their model appears in the results_fids file
_is_fids_model = df_latest['model'].isin(df_fids['model'])
df_fids_models = df_latest[_is_fids_model]
df_other_models = df_latest[~_is_fids_model]

# Put the results_fids models first, followed by all remaining models
df_latest = pd.concat([df_fids_models, df_other_models], axis=0)


In [70]:
def is_numeric(obj):
    """Duck-typing check: does ``obj`` expose the basic arithmetic operators?

    Returns True only when the object has all of ``__add__``, ``__sub__``,
    ``__mul__``, ``__truediv__`` and ``__pow__``; otherwise returns False.
    """
    for dunder in ('__add__', '__sub__', '__mul__', '__truediv__', '__pow__'):
        if not hasattr(obj, dunder):
            return False
    return True


In [71]:
import inflect
# Shared inflect engine; its number_to_words() spells out numeric packet counts
inflect_engine = inflect.engine()

In [72]:
# Emit one LaTeX macro per (model, metric, number_packets) combination so the
# values can be referenced by name from the paper sources.
with open('views/variables.tex','w') as f:
    df_rounded = df_latest.round(1)
    for _, row in df_rounded.iterrows():
        # Spell numeric packet counts out in words, since a LaTeX control
        # word may not contain digits. Non-numeric labels are used as they are.
        packets_label = row['number_packets']
        if is_numeric(packets_label):
            packets_label = inflect_engine.number_to_words(packets_label)

        for metric_name in ['accuracy','tpr','fpr','average_total_throughput']:
            name_parts = (row['model'], metric_name, packets_label)
            var_name = ''.join(str(part).capitalize() for part in name_parts)

            # '*' becomes the word 'star'; every other non-letter is dropped.
            # This covers spaces and underscores as before, and also the
            # hyphens/commas that number_to_words() produces for values such
            # as 21 ("twenty-one"), which would otherwise yield an invalid
            # macro name.
            var_name = var_name.replace('*','star')
            var_name = ''.join(ch for ch in var_name if ch.isalpha())

            f.write(r'\newcommand{\%s}{%s\xspace}'%(var_name,row[metric_name]))
            f.write('\n')

In [73]:

import constants
# Define the function to generate LaTeX table and save to a file
def generate_latex_table(df, metric, file_path):
    """Write a LaTeX table of one metric with models as rows and packet limits as columns.

    The frame is pivoted on 'model' x 'number_packets'. The value columns are
    constants.NUMBER_PACKETS_OPTIONS followed by 'full', and the last one is
    headed 'Complete Flows'. Rows are emitted in MODELS order. Float cells are
    written with three decimals, all other cells via str(). The pivoted frame
    is also printed.

    Args:
        df: Frame with 'model', 'number_packets' and `metric` columns.
        metric: Column to tabulate; must be a key of metrics_pretty_names.
        file_path: Destination of the generated .tex file.
    """
    packet_options = constants.NUMBER_PACKETS_OPTIONS + ['full']

    # One row per model, one column per packet option
    pivoted = df.pivot(index='model', columns='number_packets', values=metric)
    pivoted = pivoted.reindex(index=df['model'].unique(), columns=packet_options)
    pivoted = pivoted.reset_index()

    # Table columns: the model name first, then every remaining pivot column
    columns = ['model']
    for label in pivoted.columns:
        if label not in ('model', 'confusion_matrix'):
            columns.append(label)

    # Packet limits quoted in the caption (everything except 'full')
    limited_packet_labels = list(pivoted.columns[1:])
    limited_packet_labels.remove('full')
    packet_numbers = ", ".join(str(label) for label in limited_packet_labels)

    # Re-order the rows to follow MODELS
    per_model_rows = []
    for model in MODELS:
        per_model_rows.append(pivoted[pivoted['model'] == model])
    pivoted = pd.concat(per_model_rows, axis=0)
    print(pivoted)

    def format_cell(value):
        # Floats get a fixed three-decimal rendering; the rest is stringified
        if isinstance(value, float):
            return f"{value:.3f}"
        return str(value)

    # Header labels for the value columns; the final one stands for 'full'
    header_labels = [str(label) for label in columns][1:]
    header_labels[-1] = 'Complete Flows'
    value_column_count = len(header_labels)

    pieces = [
        "\\begin{table*}\n\\centering\n",
        r'\begin{adjustbox}{width=\textwidth}',
        "\n\\begin{tabular}{|c|" + "c|" * value_column_count + "}\n\\hline\n",
        # Two-row header: a spanning title above the individual column labels
        "\\multirow{2}{*}{Model} & \\multicolumn{" + str(value_column_count) + "}{c|}{Number of Packets} \\\\\n",
        "\\cline{2-" + str(len(columns)) + "}\n",
        " & " + " & ".join(header_labels) + " \\\\\n\\hline\n",
    ]

    for _, row in pivoted.iterrows():
        pieces.append(" & ".join(format_cell(row[col]) for col in columns) + " \\\\\n")

    pieces.append("\\hline\n\\end{tabular}\n")
    pieces.append(r'\end{adjustbox}')
    pieces.append("\n\\caption{" + metrics_pretty_names[metric] + " by Model for Flows with at Most " + packet_numbers + " Packets, and Complete Flows}\n")
    pieces.append("\\label{tab:" + metric.lower().replace(" ", "_") + "_results}\n\\end{table*}")

    latex_code = "".join(pieces)

    # Save the LaTeX table to a file
    with open(file_path, 'w') as handle:
        handle.write(latex_code)

# Columns that describe a run rather than measure it
non_metric_columns = [
    'model',
    'number_packets',
    'output_date',
    'best_parameters',
    'timeout',
]

# Metrics that each get their own per-packet-count table
metric_columns = [
    'accuracy',
    'average_total_throughput',
    'standard_deviation_total_throughput',
]

# Write one views/<metric>_table.tex file per metric
for metric in metric_columns:
    file_path = 'views/{}_table.tex'.format(metric.lower().replace(" ", "_").replace("/", "_"))
    generate_latex_table(df_latest, metric, file_path)

# Load the data
# df = pd.read_json('data/results.json', lines=True)
# df_fids = pd.read_json('data/results_fids.json', lines=True)
# df = pd.concat([df, df_fids], axis=0)

# # Ensure the output_date is in datetime format
# df['output_date'] = pd.to_datetime(df['output_date'])

# Keep only the rows whose number_packets value is 'full'
df_full_packets = df_latest.loc[df_latest['number_packets'].eq('full')]

# # Sort the DataFrame by output_date to get the latest entries for each model
# df_sorted = df_full_packets.sort_values(by='output_date', ascending=False)

# # Filter to get the latest result for each model and number of packets combination
# df_latest_full = df_sorted.drop_duplicates(subset=['model'], keep='first')

# # Separate the results_fids models from the others
# df_fids_full_models = df_latest_full[df_latest_full['model'].isin(df_fids['model'])]
# df_other_full_models = df_latest_full[~df_latest_full['model'].isin(df_fids['model'])]

# # Concatenate the fids models at the top
# df_latest_full = pd.concat([df_fids_full_models, df_other_full_models], axis=0)

# Define the function to generate LaTeX table and save to a file
def generate_latex_table_full(df, file_path):
    """Write a LaTeX table with a fixed set of metrics, one row per model.

    The columns are model, accuracy, tpr, fpr, average_throughput and
    average_total_throughput, headed by their metrics_pretty_names entries.
    Rows are emitted in MODELS order. Float cells are written with four
    decimals, all other cells via str().

    Args:
        df: Frame with a 'model' column and the metric columns listed above.
        file_path: Destination of the generated .tex file.
    """
    columns = [
        'model',
        'accuracy',
        'tpr',
        'fpr',
        "average_throughput",
        "average_total_throughput",
    ]

    def format_cell(value):
        # Floats get a fixed four-decimal rendering; the rest is stringified
        if isinstance(value, float):
            return f"{value:.4f}"
        return str(value)

    # Re-order the rows to follow MODELS
    per_model_rows = []
    for model in MODELS:
        per_model_rows.append(df[df['model'] == model])
    ordered = pd.concat(per_model_rows, axis=0)

    column_spec = " | ".join(["c"] * len(columns))
    header_cells = [metrics_pretty_names[str(name)] for name in columns]

    pieces = [
        "\\begin{table*}\n\\centering\n",
        r'\begin{adjustbox}{width=\textwidth}',
        "\n\\begin{tabular}{|" + column_spec + "|}\n\\hline\n",
        " & ".join(header_cells) + " \\\\\n\\hline\n",
    ]

    for _, row in ordered.iterrows():
        pieces.append(" & ".join(format_cell(row[col]) for col in columns) + " \\\\\n")

    pieces.append("\\hline\n\\end{tabular}")
    pieces.append(r"\end{adjustbox}")
    pieces.append("\n\\caption{Results across multiple metrics with all complete flows.}\n\\label{tab:full_packets_results}\n\\end{table*}")

    latex_code = "".join(pieces)

    # Save the LaTeX table to a file
    with open(file_path, 'w') as handle:
        handle.write(latex_code)

# Destination of the table built from the 'full' rows
file_path = 'views/full_packets_results.tex'

# Render that table and write it to disk
generate_latex_table_full(df=df_full_packets, file_path=file_path)

number_packets                            model        2        6       20  \
4                                 Random Forest   9.51\%  38.71\%  70.27\%   
5                                 Decision Tree   9.40\%  30.23\%  48.40\%   
6                                           MLP   8.33\%  12.04\%  12.58\%   
7                                      Adaboost   8.33\%  15.94\%  23.05\%   
8                                   Naive Bayes   8.33\%  12.06\%  12.05\%   
9               Quadratic Discriminant Analysis  16.72\%  33.66\%  45.59\%   
10                 Linear Discriminant Analysis  14.62\%  30.71\%  49.60\%   
11                          Logistic Regression   8.34\%   8.32\%   9.45\%   
1                                           RNN  33.20\%  71.20\%  72.47\%   
2                                           GRU  30.07\%  70.51\%  72.54\%   
3                                   Transformer  43.11\%  80.73\%  97.50\%   
0                                  Transformer*  34.14\%  84.22\