In [1]:
# GENERIC FIRST CELL FOR DEVELOPING A NEW METHODOLOGY

# Expressive name for the methodology; it becomes the name of the output directory.
# The complete (relative) path to the output directory is provided in OUT_DIR.
METHODOLOGY_NAME='poslog/'

# Number of directories between this notebook and the project root.
# When > 0 the project root is appended to sys.path so project-relative imports work.
SUBDIR_LEVEL = 1
if SUBDIR_LEVEL>0:
    import sys 
    new_path = '../'*SUBDIR_LEVEL
    if new_path not in sys.path:
        sys.path.append(new_path)

import os
OUT_DIR = os.path.relpath(os.path.join(os.getcwd(), '../'*SUBDIR_LEVEL, 'out', METHODOLOGY_NAME))
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(OUT_DIR, exist_ok=True)
print(f"Current output directory: '{OUT_DIR}'")

%load_ext autoreload
%autoreload 2

Current output directory: '../out/poslog'


In [4]:
# Number of lines sampled per dataset and the seed used for the sampling
N=10000
SEED=42

# CSV that receives the de-duplicated example lines of all datasets
OUTPUT_FILE=os.path.join(OUT_DIR, f'1_examples_{N}_each_seed-{SEED}_numb_var.csv')

# File name pattern (one file per dataset) for all templates mined by NumbVar
OUTPUT_FILE_ALL_TEMPLATES='numb_var_templates_{dataset}.txt'
# File name pattern (one file per dataset) for the sampled N-line sub-dataset
OUTPUT_FILE_N_LINES='selection_{dataset}_{N}_each_seed-{SEED}.pkl'

# Directory with the raw datasets, relative to the notebook
INPUT_DIR = f"{'../'*SUBDIR_LEVEL}data/"

print(f'Input directory: {INPUT_DIR}')
print(f'Output file: {OUTPUT_FILE}')

Input directory: ../data/
Output file: ../out/poslog/1_examples_10000_each_seed-42_numb_var.csv


# Collect Each Dataset into a Single File

In [None]:
def collect_all_log_files_in_folder(input_dir:str, collection_file:str, suffix:str='.log', exclude:"list[str]|None"=None)->None:
    """Append every matching file of ``input_dir`` to ``collection_file``.

    Files are processed in alphabetical order. The collection file is opened in
    append mode, so repeated calls with the same ``collection_file`` keep
    extending it.

    Args:
        input_dir: Directory whose direct children are scanned.
        collection_file: Path of the file the contents are appended to.
        suffix: Only file names ending with this suffix are collected.
        exclude: File names (not paths) to leave out. The base name of
            ``collection_file`` is always excluded so the collection never
            swallows itself. The passed list is not modified.
    """
    import shutil  # local import: only needed for the streaming copy below

    # Work on a private copy: the previous mutable default argument was appended
    # to, which leaked excluded names into later calls and into the caller's list.
    skipped = set(exclude) if exclude is not None else set()
    skipped.add(os.path.basename(collection_file))

    # list all files alphabetically
    for file in sorted(os.listdir(input_dir)):
        if not file.endswith(suffix) or file in skipped:
            continue
        with open(collection_file, "a") as output_file:
            with open(os.path.join(input_dir, file), "r") as input_file:
                # stream in chunks instead of loading a whole log file into memory
                shutil.copyfileobj(input_file, output_file)
                    
def collect_all_log_files_in_subfolders(input_dir:str, collection_file:str, suffix:str='.log')->None:
    """Collect the matching files of every direct sub-directory of ``input_dir``.

    Sub-directories are visited in alphabetical order; each one is handed to
    ``collect_all_log_files_in_folder`` with the same ``collection_file`` and
    ``suffix``. Entries of ``input_dir`` that are not directories are ignored.
    """
    candidate_paths = (os.path.join(input_dir, entry) for entry in sorted(os.listdir(input_dir)))
    for subfolder in candidate_paths:
        if not os.path.isdir(subfolder):
            continue
        collect_all_log_files_in_folder(subfolder, collection_file, suffix)


In [None]:
# Collect Hadoop dataset: the logs live in sub-directories of Hadoop/
dir_with_logs=os.path.join(INPUT_DIR,'Hadoop/')
collected_file=os.path.join(INPUT_DIR,'Hadoop/Hadoop_collected.log')

# build the collection only once; an existing file is reused
if not os.path.exists(collected_file):
    collect_all_log_files_in_subfolders(dir_with_logs, collected_file)
    print(f"Collected all log files in {dir_with_logs} to {collected_file}")
else:
    print(f"File {collected_file} already exists, skipping")

In [None]:
# Collect Spark dataset: the logs live in sub-directories of Spark/
dir_with_logs=os.path.join(INPUT_DIR,'Spark/')
collected_file=os.path.join(INPUT_DIR,'Spark/Spark_collected.log')

# build the collection only once; an existing file is reused
if not os.path.exists(collected_file):
    collect_all_log_files_in_subfolders(dir_with_logs, collected_file)
    print(f"Collected all log files in {dir_with_logs} to {collected_file}")
else:
    print(f"File {collected_file} already exists, skipping")

In [None]:
# Collect OpenStack dataset: the logs lie directly in OpenStack/ (no sub-directories)
dir_with_logs=os.path.join(INPUT_DIR,'OpenStack/')
collected_file=os.path.join(INPUT_DIR,'OpenStack/OpenStack_collected.log')

# build the collection only once; an existing file is reused
if not os.path.exists(collected_file):
    collect_all_log_files_in_folder(dir_with_logs, collected_file)
    print(f"Collected all log files in {dir_with_logs} to {collected_file}")
else:
    print(f"File {collected_file} already exists, skipping")

# Collect all Templates with NumbVar

In [None]:
# Per-dataset configuration, keyed by dataset name:
#   log_file   - path of the full log file, relative to INPUT_DIR
#                (the commented-out entry is the 2k-line sample of the same dataset)
#   log_format - header layout of one log line; <Name> marks a field, everything else
#                is regex text. Raw strings are used because the layouts contain
#                regex escapes such as \[ that are invalid escapes in normal strings.
#   regex      - dataset specific patterns (not used by the cells of this notebook
#                shown here -- presumably preprocessing patterns; verify before use)
#   lines      - number of lines of log_file
config = {
    'HDFS': {
#        'log_file': 'HDFS/HDFS_2k.log',
        "log_file": "HDFS_v1/HDFS.log",
        'log_format': r'<Date> <Time> <Pid> <Level> <Component>: <Content>',
        'regex': [r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?'],
        "lines": 11175629,
        },

    'Hadoop': {
#        'log_file': 'Hadoop/Hadoop_2k.log',
        "log_file": "Hadoop/Hadoop_collected.log",
        'log_format': r'<Date> <Time> <Level> \[<Process>\] <Component>: <Content>',
        'regex': [r'(\d+\.){3}\d+', r'SUCCESS_ CONTAINER_ CLEANUP'],
        "lines": 394308,
        },

    'Spark': {
#        'log_file': 'Spark/Spark_2k.log',
        "log_file": "Spark/Spark_collected.log",
        'log_format': r'<Date> <Time> <Level> <Component>: <Content>',
        'regex': [r'(\d+\.){3}\d+', r'\b[KGTM]?B\b', r'([\w-]+\.){2,}[\w-]+'],
        "lines": 33236604,
        },

    'Zookeeper': {
#        'log_file': 'Zookeeper/Zookeeper_2k.log',
        "log_file": "Zookeeper/Zookeeper.log",
        'log_format': r'<Date> <Time> - <Level>  \[<Node>:<Component>@<Id>\] - <Content>',
        'regex': [r'(/|)(\d+\.){3}\d+(:\d+)?'],
        "lines": 74380,
        },

    'BGL': {
#        'log_file': 'BGL/BGL_2k.log',
        "log_file": "BGL/BGL.log",
        'log_format': r'<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>',
        'regex': [r'core\.\d+'],
        "lines": 4747963,
        },

    'HPC': {
#        'log_file': 'HPC/HPC_2k.log',
        "log_file": "HPC/HPC.log",
        'log_format': r'<LogId> <Node> <Component> <State> <Time> <Flag> <Content>',
        'regex': [],
        "lines": 433490,
        },

    'Thunderbird': {
#        'log_file': 'Thunderbird/Thunderbird_2k.log',
        "log_file": "Thunderbird/Thunderbird.log",
        'log_format': r'<Label> <Timestamp> <Date> <User> <Month> <Day> <Time> <Location> <Component>(\[<PID>\])?: <Content>',
        'regex': [r'(\d+\.){3}\d+'],
        "lines": 211212192,
        },

    'Windows': {
#        'log_file': 'Windows/Windows_2k.log',
        "log_file": "Windows/Windows.log",
        'log_format': r'<Date> <Time>, <Level>                  <Component>    <Content>',
        'regex': [r'0x.*?\s'],
        "lines": 114608388,
        },

    'Linux': {
#        'log_file': 'Linux/Linux_2k.log',
        "log_file": "Linux/Linux.log",
        'log_format': r'<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>',
        'regex': [r'(\d+\.){3}\d+', r'\d{2}:\d{2}:\d{2}',r'J([a-z]{2})'],
        "lines": 25567,
        },

    'Android': {
#        'log_file': 'Android/Android_2k.log',
        "log_file": "Android_v1/Android.log",
        'log_format': r'<Date> <Time>  <Pid>  <Tid> <Level> <Component>: <Content>',
        'regex': [r'(/[\w-]+)+', r'([\w-]+\.){2,}[\w-]+', r'\b(\-?\+?\d+)\b|\b0[Xx][a-fA-F\d]+\b|\b[a-fA-F\d]{4,}\b'],
        "lines": 1555005,
        },

    'HealthApp': {
#        'log_file': 'HealthApp/HealthApp_2k.log',
        "log_file": "HealthApp/HealthApp.log",
        'log_format': r'<Time>\|<Component>\|<Pid>\|<Content>',
        'regex': [],
        "lines": 253395,
        },

    'Apache': {
#        'log_file': 'Apache/Apache_2k.log',
        "log_file": "Apache/Apache.log",
        'log_format': r'\[<Time>\] \[<Level>\] <Content>',
        'regex': [r'(\d+\.){3}\d+'],
        "lines": 56482,
        },
    'Proxifier': {
#        'log_file': 'Proxifier/Proxifier_2k.log',
        "log_file": "Proxifier/Proxifier.log",
        'log_format': r'\[<Time>\] <Program> - <Content>',
        'regex': [r'<\d+\ssec', r'([\w-]+\.)+[\w-]+(:\d+)?', r'\d{2}:\d{2}(:\d{2})*', r'[KGTM]B'],
        "lines": 21329,
    },

    'OpenSSH': {
#        'log_file': 'OpenSSH/OpenSSH_2k.log',
        "log_file": "SSH/SSH.log",
        'log_format': r'<Date> <Day> <Time> <Component> sshd\[<Pid>\]: <Content>',
        'regex': [r'(\d+\.){3}\d+', r'([\w-]+\.){2,}[\w-]+'],
        "lines": 655147,
        },

    'OpenStack': {
#        'log_file': 'OpenStack/OpenStack_2k.log',
        "log_file": "OpenStack/OpenStack_collected.log",
        'log_format': r'<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>',
        'regex': [r'((\d+\.){3}\d+,?)+', r'/.+?\s ', r'\d+'],
        "lines": 207820,
        },

    'Mac': {
#        'log_file': 'Mac/Mac_2k.log',
        "log_file": "Mac/Mac.log",
        'log_format': r'<Month>  <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>',
        'regex': [r'([\w-]+\.){2,}[\w-]+'],
        "lines": 117283,
        },
}

import re
# from Brain (same as Drain)
def generate_logformat_regex(logformat:str)->tuple[list[str], re.Pattern]:
    """Build the regular expression that splits a raw log line into its header fields.

    ``logformat`` is a layout such as ``'<Date> <Time> <Level>: <Content>'``.
    Every ``<Name>`` becomes a lazy named group ``(?P<Name>.*?)``; the text
    between the fields is used as regex text, with each run of spaces replaced
    by ``\\s+``. The result is anchored with ``^`` and ``$``.

    Args:
        logformat: Layout of one log line.

    Returns:
        A tuple ``(headers, regex)``: the field names in order of appearance and
        the compiled pattern.
    """
    headers = []
    regex = ''
    # re.split with a capturing group alternates literal text (even positions)
    # and <Name> fields (odd positions).
    for k, part in enumerate(re.split(r'(<[^<>]+>)', logformat)):
        if k % 2 == 0:
            # raw replacement template: '\\' yields one literal backslash, so the
            # inserted text is \s+ (the former '\\\s+' relied on an invalid escape)
            regex += re.sub(' +', r'\\s+', part)
        else:
            header = part.strip('<').strip('>')
            regex += '(?P<%s>.*?)' % header
            headers.append(header)
    return headers, re.compile('^' + regex + '$')

# Collect **All** Templates

In [None]:
import datetime
import re
from util.logparsing import NumbVar

def skip_line(line:str)->bool:
    """Return True if a log message content carries no usable text.

    That is the case for ``None``, for blank strings and for the null-like
    placeholders 'none', 'null', 'nan', 'n/a' and 'na' (case-insensitive).
    """
    # The parameter itself must be tested; the former check looked at the
    # global name `content` instead.
    if line is None:
        return True
    if line.strip() == '':
        return True
    return line.lower() in ('none', 'null', 'nan', 'n/a', 'na')

# dataset name -> text file with all templates mined from the complete dataset
template_files={}

# The loop variable must not be named `config`: rebinding that name would replace
# the dataset configuration dict that the following cells iterate over.
for dataset, dataset_config in config.items():
    output_file=os.path.join(OUT_DIR,OUTPUT_FILE_ALL_TEMPLATES.format(dataset=dataset))
    template_files[dataset]=output_file

    # the template file acts as cache: mining a dataset is skipped if it exists
    if os.path.exists(output_file):
        print(f"File {output_file} already exists, skipping")
        continue

    log_file = os.path.join(INPUT_DIR, dataset_config["log_file"])
    log_format:str=dataset_config["log_format"]
    headers: list[str]
    format_regex: re.Pattern
    headers, format_regex = generate_logformat_regex(log_format)

    lines_count=int(dataset_config["lines"])
    # progress is reported about every 1% of the lines; at least 1 so that the
    # modulo below is defined for datasets with fewer than 100 lines
    progress_step=max(1, lines_count // 100)

    print(f"\nProcessing {dataset} ({log_file})")

    def get_content_from_line(cur_line:str)->str:
        """Return the <Content> field of a raw log line.

        Returns '' if the line does not match the dataset's log format or if
        ``skip_line`` rejects its content.
        """
        match = format_regex.search(cur_line.strip())
        if match is None:
            return ''
        content = match.group('Content')
        return '' if skip_line(content) else content

    starttime=datetime.datetime.now()

    numb_var=NumbVar()

    # i counts the parsed lines only; skipped lines do not advance the progress
    i=0
    with open(log_file, 'r', encoding='utf8', errors='ignore') as input_file:
        for line in input_file:
            content=get_content_from_line(line)
            if content=='':
                continue

            numb_var.parse(content)

            if i % progress_step == 0:
                print(f"Processed {i} lines ({i/lines_count*100:.2f}%), got {len(numb_var.get_templates_list())} templates")
                elapsed_time = datetime.datetime.now() - starttime
                elapsed_seconds = elapsed_time.total_seconds()
                # guard against a zero duration right after the start
                lines_per_second = i / elapsed_seconds if elapsed_seconds > 0 else 0.0
                print(f"  Elapsed time: {elapsed_time}, Lines per second: {lines_per_second:.2f}")
            i+=1

    # Written only after the whole log was processed, so an interrupted run
    # leaves no partial template file behind. A separate handle name keeps
    # `output_file` bound to the path.
    mined_templates = numb_var.get_templates_list()
    with open(output_file, "w", encoding="utf-8") as templates_out:
        for tmpl in mined_templates:
            templates_out.write(f"{tmpl}\n")

    t_count=len(mined_templates)
    print(f'Mined {t_count} templates')    

# HDFS with 11,175,629 lines takes more than 2 min 40 s
# Processing all 16 datasets takes approx. 9 hours

In [None]:
# Row order used when the per-dataset statistics are re-indexed
order_of_dss = [
    'HDFS', 'Hadoop', 'Spark', 'Zookeeper',
    'OpenStack', 'BGL', 'HPC', 'Thunderbird',
    'Windows', 'Linux', 'Mac', 'Android',
    'HealthApp', 'Apache', 'OpenSSH', 'Proxifier',
]

In [None]:
import pandas as pd
# dataset name -> number of mined templates (the template files hold one template per line)
templates_numb_var = {}
for dataset, file_path in template_files.items():
    with open(file_path, 'r', encoding='utf-8') as f:
        line_count = len(list(f))
    templates_numb_var[dataset] = line_count
templates_numb_var

In [None]:


# One row per dataset: total lines, total mined templates and lines per template
templates_numb_var_df=pd.DataFrame.from_dict(templates_numb_var, orient='index', columns=['total_templates'])
# Look the line count up by dataset name; filling the column positionally from
# config.items() silently relied on both dicts sharing the same order and length.
templates_numb_var_df['total_lines'] = [config[dataset_name]['lines'] for dataset_name in templates_numb_var_df.index]
templates_numb_var_df['lines_per_template']= (templates_numb_var_df['total_lines']/templates_numb_var_df['total_templates']).round().astype(int)
templates_numb_var_df = templates_numb_var_df[['total_lines', 'total_templates', 'lines_per_template']]
templates_numb_var_df = templates_numb_var_df.reindex(order_of_dss)
templates_numb_var_df

# Collect 10,000 Lines Each

In [None]:
import random
import pickle


def skip_line(line:str)->bool:
    """Return True if a log message content carries no usable text.

    That is the case for ``None``, for blank strings and for the null-like
    placeholders 'none', 'null', 'nan', 'n/a' and 'na' (case-insensitive).
    """
    # The parameter itself must be tested; the former check looked at the
    # global name `content` instead.
    if line is None:
        return True
    if line.strip() == '':
        return True
    return line.lower() in ('none', 'null', 'nan', 'n/a', 'na')


# dataset name -> pickle file with the sampled (dataset, line index, content) tuples
logs_10k={}

# The loop variable must not be named `config`: rebinding that name would replace
# the dataset configuration dict that other cells iterate over.
for dataset, dataset_config in config.items():
    output_file=os.path.join(OUT_DIR,OUTPUT_FILE_N_LINES.format(dataset=dataset, N=N, SEED=SEED))
    logs_10k[dataset]=output_file

    # the pickle acts as cache: sampling a dataset is skipped if it exists
    if os.path.exists(output_file):
        print(f"File {output_file} already exists, skipping")
        continue


    log_file = os.path.join(INPUT_DIR, dataset_config["log_file"])
    log_format:str=dataset_config["log_format"]
    headers: list[str]
    format_regex: re.Pattern
    headers, format_regex = generate_logformat_regex(log_format)

    lines_count=int(dataset_config["lines"])

    print(f"\nProcessing {dataset} ({log_file})")
    random.seed(SEED)

    log_content:list[tuple[str,int,str]]=[]
    # every line index that was already drawn, whether it was usable or not
    tried_indices:set[int]=set()
    retries=0
    runs=0

    # Draw random line indices until N usable lines were collected. Indices that
    # were drawn before are never read again, so a line cannot be collected twice
    # and the loop ends once every index was tried (fewer than N usable lines).
    while len(log_content)<N and len(tried_indices)<lines_count:
        lines_to_collect = min(N-len(log_content), lines_count)
        selected_indices = set(random.sample(range(lines_count), lines_to_collect)) - tried_indices
        tried_indices |= selected_indices
        if not selected_indices:
            # only already tried indices were drawn; draw again without reading the file
            continue

        runs+=1
        last_index = max(selected_indices)
        skipped_lines=0
        with open(log_file, 'r', encoding='utf8', errors='ignore') as input_file:
            for i,cur_line in enumerate(input_file):
                if i > last_index:
                    # no selected line can follow, stop reading the file
                    break
                if i not in selected_indices:
                    continue
                match = format_regex.search(cur_line.strip())
                content = match.group('Content') if match is not None else None
                if content is None or skip_line(content):
                    skipped_lines+=1
                    continue
                log_content.append((dataset,i,content))

        retries+=skipped_lines

    with open(output_file, "wb") as pickle_file:
        pickle.dump(log_content, pickle_file)

    print(f'{len(log_content)} lines collected')
    print(f"Retried {retries} lines in {runs} runs")
    


In [None]:
import pandas as pd
import pickle

# Load the sampled lines of every dataset into one frame.
# NOTE(security): pickle.load can execute arbitrary code; these files are expected
# to be the ones written by the sampling cell of this notebook -- do not point
# logs_10k at files from an untrusted source.
example_records=[]
for dataset, file in logs_10k.items():
    with open(file, "rb") as pickle_file:
        example_records.extend(pickle.load(pickle_file))

# Build the frame once from all records instead of concatenating inside the loop;
# this also gives every row a unique index label.
examples_df=pd.DataFrame(example_records, columns=['Dataset','Line','Example'])
examples_df


# Collect Templates on 10k Lines

In [None]:
from util.logparsing import NumbVar

#mined_templates:list[tuple[str,str]]=[]
# Mine templates on the sampled lines, separately for each dataset, and store
# template and cluster id next to every example.
mined_templates_count=0

examples_df['Template'] = ''
examples_df['ClusterId'] = ''

# dataset name -> number of distinct templates found in the sample
templates_in_10k={}
# Iterate over the datasets present in the sample (order of first appearance);
# this does not depend on the `config` name.
for dataset in examples_df['Dataset'].unique():
    print(f"Processing {dataset}", end=' ')

    dataset_mask=examples_df['Dataset']==dataset
    log_contents=examples_df.loc[dataset_mask, 'Example'].tolist()
    
    templates:list[tuple[str,int]]=[]
    # a fresh miner per dataset, so cluster ids are only unique within a dataset
    numb_var=NumbVar()
    for content in log_contents:
        template, cluster_id = numb_var.parse(content)
        templates.append((template, cluster_id))
    
    examples_df.loc[dataset_mask, ['Template', 'ClusterId']] = templates
    
    # Count distinct cluster ids instead of taking the largest id: the count does
    # not depend on whether ids start at 0 or 1 and is 0 for an empty sample.
    templates_count=len({cluster_id for _, cluster_id in templates})

    templates_in_10k[dataset]=templates_count
    print(f'Mined {templates_count} templates')
    mined_templates_count+=templates_count

print(f"Total mined templates: {mined_templates_count}")
# An explicit Series is aligned on the dataset names of the index; how a plain
# dict is assigned depends on the pandas version.
templates_numb_var_df['templates_in_10k']=pd.Series(templates_in_10k)

examples_df

In [None]:
# Display the per-dataset statistics table
templates_numb_var_df

In [None]:
print(f"Examples: {len(examples_df)}")
empty_examples = examples_df['Example'].eq('')
print(f"Empty: {int(empty_examples.sum())}")

# keep a single row per (dataset, cluster id) pair
examples_df_unique=examples_df.drop_duplicates(subset=['Dataset','ClusterId'])
print(f"Different Templates: {len(examples_df_unique)}")

# drop empty examples
examples_df_unique=examples_df_unique.loc[examples_df_unique['Example'].ne('')]
print(f"Without empty: {len(examples_df_unique)}")

# drop examples that are exactly the string 'null' or 'nan'
examples_df_unique=examples_df_unique.loc[~examples_df_unique['Example'].isin(['null', 'nan'])]
print(f"Without null/nan: {len(examples_df_unique)}")

In [None]:
import pandas as pd
# Write the unique examples to OUTPUT_FILE as CSV (index column omitted) and display them
print(f"Save to '{OUTPUT_FILE}'")
examples_df_unique.to_csv(path_or_buf=OUTPUT_FILE, index=False)
examples_df_unique

In [None]:
# Display the number of templates mined from the sample of each dataset
templates_in_10k

In [None]:
# ratio_lines: sample size N relative to all lines of the dataset
templates_numb_var_df['ratio_lines']=templates_numb_var_df['total_lines'].rdiv(N)
# ratio_templates: templates found in the sample relative to all templates of the dataset
templates_numb_var_df['ratio_templates']=templates_numb_var_df['templates_in_10k'].div(templates_numb_var_df['total_templates'])
templates_numb_var_df

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Grouped bar chart: line ratio and template ratio side by side for every dataset
fig, ax = plt.subplots(figsize=(12, 6))

# width of a single bar; the two bars of a dataset are centred on its tick
width = 0.35

# one tick position per dataset
x = np.arange(len(templates_numb_var_df.index))

# (horizontal shift, column, legend label) of the two bar series
bar_series = [
    (-width/2, 'ratio_lines', 'Ratio Lines'),
    (width/2, 'ratio_templates', 'Ratio Templates'),
]
for shift, column, label in bar_series:
    ax.bar(x + shift, templates_numb_var_df[column], width, label=label)

# axis labels, title, ticks and legend
ax.set(xlabel='Dataset', ylabel='Ratio', title='Ratio Lines and Ratio Templates by Dataset')
ax.set_xticks(x)
ax.set_xticklabels(templates_numb_var_df.index, rotation=45, ha='right')
ax.legend()

# render
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Horizontal bars: the two ratios on the primary x-axis, the absolute totals on a second x-axis
fig, ax = plt.subplots(figsize=(12, 8))

# thickness of a single bar
height = 1.0 / 5

# one tick position per dataset
y = np.arange(len(templates_numb_var_df.index))

ratio_lines = templates_numb_var_df['ratio_lines']
ratio_templates = templates_numb_var_df['ratio_templates']
total_lines = templates_numb_var_df['total_lines']
total_templates = templates_numb_var_df['total_templates']

# ratios below the tick position
ax.barh(y - 2*height, ratio_lines, height, label='Ratio Lines')
ax.barh(y - height, ratio_templates, height, label='Ratio Templates')

# totals above the tick position, on a secondary x-axis sharing the y-axis
ax2 = ax.twiny()
ax2.barh(y + height, total_lines, height, color='gray', alpha=0.3, label='Total Lines')
ax2.barh(y + 2*height, total_templates, height, color='green', alpha=0.3, label='Total Templates')

# write the value at the end of every bar
text_style = dict(va='center', ha='left', fontsize=7)
for row, (r_lines, r_templates, n_lines, n_templates) in enumerate(zip(ratio_lines, ratio_templates, total_lines, total_templates)):
    ax.text(r_lines, y[row] - 2*height, f'{r_lines*100:.1f}%', **text_style)
    ax.text(r_templates, y[row] - height, f'{r_templates*100:.1f}%', **text_style)
    ax2.text(n_lines, y[row] + height, f'{n_lines:,}', **text_style)
    ax2.text(n_templates, y[row] + 2*height, f'{n_templates:,}', **text_style)

# labels, title and legend of the primary axes
ax.set(
    xlabel='Ratio',
    ylabel='Dataset',
    title='Horizontal Bar Chart: Ratio Lines, Ratio Templates, and Total Lines by Dataset',
)
ax.set_yticks(y)
ax.set_yticklabels(templates_numb_var_df.index)
ax.legend(loc='lower right')

# label and legend of the secondary axes
ax2.set_xlabel('Total Lines')
ax2.legend(loc='upper right')

# render
plt.tight_layout()
plt.show()

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

import matplotlib.pyplot as plt

# Pie chart of the total line counts, with a table of the absolute numbers next to it
fig, ax = plt.subplots(figsize=(12, 8))
ax.pie(
    templates_numb_var_df['total_lines'],
    labels=templates_numb_var_df.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.tab20.colors,  # tab20 colormap
)
ax.set_title('Distribution of Total Lines Across Datasets')

# separate axes to the right of the pie; it only hosts the table, so its frame is hidden
ax_table = make_axes_locatable(ax).append_axes("right", size="40%", pad=0.1)
ax_table.axis('off')

# table content: dataset name, total lines, total templates
table_data = templates_numb_var_df[['total_lines', 'total_templates']].reset_index()
table_data.columns = ['Dataset', 'Total Lines', 'Total Templates']

table = ax_table.table(
    cellText=table_data.values,
    colLabels=table_data.columns,
    loc='center',
    cellLoc='center',
)
# fixed font size, column widths fitted to the content
table.auto_set_font_size(False)
table.set_fontsize(10)
table.auto_set_column_width(col=list(range(len(table_data.columns))))

# render
plt.tight_layout()
plt.show()

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

import matplotlib.pyplot as plt

# Pie chart of the total template counts, with a table of the absolute numbers next to it
fig, ax = plt.subplots(figsize=(12, 8))
ax.pie(
    templates_numb_var_df['total_templates'],
    labels=templates_numb_var_df.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.tab20.colors,  # tab20 colormap
)
ax.set_title('Distribution of Total Templates Across Datasets')

# separate axes to the right of the pie; it only hosts the table, so its frame is hidden
ax_table = make_axes_locatable(ax).append_axes("right", size="40%", pad=0.1)
ax_table.axis('off')

# table content: dataset name, total lines, total templates
table_data = templates_numb_var_df[['total_lines', 'total_templates']].reset_index()
table_data.columns = ['Dataset', 'Total Lines', 'Total Templates']

table = ax_table.table(
    cellText=table_data.values,
    colLabels=table_data.columns,
    loc='center',
    cellLoc='center',
)
# fixed font size, column widths fitted to the content
table.auto_set_font_size(False)
table.set_fontsize(10)
table.auto_set_column_width(col=list(range(len(table_data.columns))))

# render
plt.tight_layout()
plt.show()

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

import matplotlib.pyplot as plt

# Pie chart of the templates mined from the samples, with a sorted table next to it
fig, ax = plt.subplots(figsize=(12, 8))
ax.pie(
    templates_numb_var_df['templates_in_10k'],
    labels=templates_numb_var_df.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.tab20.colors,  # tab20 colormap
)
ax.set_title('Distribution of Mined Templates (10k each)')

# separate axes to the right of the pie; it only hosts the table, so its frame is hidden
ax_table = make_axes_locatable(ax).append_axes("right", size="40%", pad=0.1)
ax_table.axis('off')

# table content: dataset name and template count, largest count first
table_data = templates_numb_var_df['templates_in_10k'].reset_index()
table_data.columns = ['Dataset', 'Templates in 10k']
table_data = table_data.sort_values(by='Templates in 10k', ascending=False)

table = ax_table.table(
    cellText=table_data.values,
    colLabels=table_data.columns,
    loc='center',
    cellLoc='center',
)
# fixed font size, column widths fitted to the content
table.auto_set_font_size(False)
table.set_fontsize(10)
table.auto_set_column_width(col=list(range(len(table_data.columns))))

# render
plt.tight_layout()
plt.show()