In [43]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

import os
import pandas as pd
import numpy as np
from typing import Dict, Set, Tuple, List, Union, Callable
from collections import defaultdict
import random
import itertools
import re
from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
def read_dataframes_from_folder(folder_path: str, file_limit: int) -> List[Tuple[pd.DataFrame, str]]:
    """
    Read dataframes from CSV files in the given folder.

    Args:
        folder_path (str): Path to the folder containing CSV files.
        file_limit (int): Maximum number of files to be read.

    Returns:
        List[Tuple[pd.DataFrame, str]]: A list of tuples containing the dataframes and their corresponding filenames.
    """
    tuple_dataframes = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            if len(tuple_dataframes) < file_limit:
                file_path = os.path.join(folder_path, filename)
                df = pd.read_csv(file_path)
                tuple_dataframes.append((df, os.path.basename(file_path)))
            else:
                break

    return tuple_dataframes

In [5]:
def clean_dataframes(tuple_dataframes: List[Tuple[pd.DataFrame, str]]) -> None:
    """
    Clean the dataframes by renaming columns, dropping the 'Unnamed: 0' column, and dropping empty columns.

    Args:
        tuple_dataframes (List[Tuple[pd.DataFrame, str]]): List of tuples containing the dataframes and their corresponding filenames.
    """
    
    # inner function
    def drop_empty_columns_in_dataframes(dataframes: List[Tuple[pd.DataFrame, str]]) -> None:
        for df, _ in dataframes:
            empty_columns = df.columns[df.isnull().all()].tolist()  # get the list of empty columns
            df.drop(empty_columns, axis=1, inplace=True)  # drop the empty columns
    
    # renaming columns, deleting 'Unnamed: 0', and removing empty dataframes
    indices_to_delete = []
    for index, (df, _) in enumerate(tuple_dataframes):
        df.rename(columns=lambda x: x.replace('col', ''), inplace=True)
        
        if 'Unnamed: 0' in df.columns:
            df.drop('Unnamed: 0', axis=1, inplace=True)  # delete the "Unnamed: 0" column
        
        if df.empty:
            indices_to_delete.append(index)  # mark dataframe for deletion if empty
    
    # reverse sort the indices and delete the empty dataframes
    for index in sorted(indices_to_delete, reverse=True):
        del tuple_dataframes[index]

    # call inner funtion
    drop_empty_columns_in_dataframes(tuple_dataframes)

In [6]:
def drop_columns_without_label(dataframes: List[Tuple[pd.DataFrame, str]], label_df: pd.DataFrame) -> List[Tuple[pd.DataFrame, str]]:
    """
    Drop columns for which no gold standard exists.

    Args:
        dataframes (List[Tuple[pd.DataFrame, str]]): List of tuples containing the dataframes and their corresponding filenames.
        label_df (pd.DataFrame): DataFrame containing the gold standard annotations.

    Returns:
        List[Tuple[pd.DataFrame, str]]: List of tuples containing the cleaned dataframes and their corresponding filenames.
    """
    
    # make a copy
    tuple_dataframes_copy = [(df.copy(), name) for df, name in dataframes]

    # indices of dataframes to delete
    to_delete = []

    # iterate over dataframes
    for table_index, (df, filename) in enumerate(tuple_dataframes_copy):
        for column_index, col in enumerate(df.columns):
            if not any((label_df['table_id'] == filename[:-4] + "_dbpedia") & (label_df['target_column'] == column_index)):
                df.drop(col, axis=1, inplace=True)
        if df.empty:
            to_delete.append(table_index)
        else:
            tuple_dataframes_copy[table_index] = (df, filename)

    # delete empty dataframes from the list in reverse order to prevent index shifting
    for index in reversed(to_delete):
        del tuple_dataframes_copy[index]

    return tuple_dataframes_copy


In [7]:
def sample_columns(tuple_dataframes: List[Tuple[pd.DataFrame, str]], n: int) -> List[Tuple[pd.DataFrame, str]]:
    """
    Sample 'n' columns from all DataFrames combined, delete non-sampled columns, and filter out empty DataFrames.

    Args:
        tuple_dataframes (List[Tuple[pd.DataFrame, str]]): A list of tuples containing the dataframes and their corresponding filenames.
        n (int): Number of columns to sample in total.

    Returns:
        List[Tuple[pd.DataFrame, str]]: A list of tuples containing the non-empty dataframes and their corresponding filenames.
    """
    # add prefixes to ensure unique column names in combined DataFrame
    all_dfs = []
    for idx, (df, _) in enumerate(tuple_dataframes):
        df_prefixed = df.add_prefix(f"df{idx}_")
        all_dfs.append(df_prefixed)

    combined_df = pd.concat(all_dfs, axis=1)

    # sample 'n' columns from the combined DataFrame
    if n >= len(combined_df.columns):
        sampled_columns = combined_df.columns
    else:
        sampled_columns = random.sample(list(combined_df.columns), n)

    # filter combined DataFrame to keep only the sampled columns
    sampled_combined_df = combined_df[sampled_columns]

    filtered_dataframes = []

    # split the combined DataFrame back into individual DataFrames and filter out any empty DataFrames
    for idx, (_, filename) in enumerate(tuple_dataframes):
        relevant_cols = [col for col in sampled_combined_df.columns if col.startswith(f"df{idx}_")]
        df_sampled = sampled_combined_df[relevant_cols].rename(columns=lambda x: x.split("_", 1)[1])
        if not df_sampled.empty:
            filtered_dataframes.append((df_sampled, filename))

    return filtered_dataframes

In [8]:
def create_ground_truth_map(label_df: pd.DataFrame, tuple_dataframes: List[Tuple[pd.DataFrame, str]]) -> Dict[str, Set[int]]:
    """
    Create a ground truth map for evaluation.

    Args:
        label_df (pd.DataFrame): DataFrame containing true labels for evaluation.
        tuple_dataframes (List[Tuple[pd.DataFrame, str]]): List of tuples containing the dataframes and their corresponding filenames.

    Returns:
        Dict[str, Set[int]]: A dictionary mapping label names to sets of global indices that belong to each label.
    """
    ground_truth = defaultdict(set)

    for table_index, (df, filename) in enumerate(tuple_dataframes):
        for new_col_index, old_col_index in enumerate(df.columns):
            label = label_df.query(f"table_id == '{filename[:-4] + '_dbpedia'}' and target_column == {old_col_index}")['annotation_label']

            if label.empty:
                continue

            global_index = sum(df.shape[1] for df, _ in tuple_dataframes[:table_index]) + new_col_index
            ground_truth[label.iloc[0]].add(global_index)

            if len(label.values) != 1:
                raise ValueError("Unexpected number of label values")

    for tuple_dataframe in tuple_dataframes:
        tuple_dataframe[0].columns = range(len(tuple_dataframe[0].columns))

    return ground_truth

In [9]:
def hierarchical_clustering(distance_matrix: np.ndarray, n_clusters: int) -> Dict[int, Set[int]]:
    """
    Perform hierarchical clustering on the distance matrix.

    Args:
        distance_matrix (np.ndarray): Pairwise distance matrix.
        n_clusters (int): Number of clusters to create.

    Returns:
        Dict[int, Set[int]]: A dictionary mapping cluster labels to sets of global indices that belong to each cluster.
    """
    # create an AgglomerativeClustering model
    model = AgglomerativeClustering(n_clusters=n_clusters, metric='precomputed', linkage='average')

    # fit the model to the distance matrix
    model.fit(distance_matrix)

    # get the cluster labels
    labels = model.labels_

    # create a dictionary that maps each cluster label to the set of indices that belong to that cluster
    clusters = defaultdict(set)
    for idx, label in enumerate(labels):
        clusters[label].add(idx)

    return clusters

In [12]:
def evaluate_micro(truth_dict: Dict[str, Set[int]], result_dict: Dict[int, Set[int]]) -> Tuple[float, float]:
    """
    Evaluate clustering performance using micro-average precision, recall.

    Args:
        truth_dict (Dict[str, Set[int]]): Ground truth mapping label names to sets of global indices.
        result_dict (Dict[int, Set[int]]): Resulting clustering mapping cluster labels to sets of indices.

    Returns:
        Tuple[float, float]: Micro-average precision and recall.
    """
    # creating the contingency matrix
    contingency_matrix = np.zeros((len(truth_dict), len(result_dict)))

    truth_labels = list(truth_dict.keys())
    result_labels = list(result_dict.keys())

    for i, truth_label in enumerate(truth_labels):
        for j, result_label in enumerate(result_labels):
            # length of intersection between clusters
            # contingency_matrix[i, j] is the number of (global) indices assigned to both clusters
            contingency_matrix[i, j] = len(truth_dict[truth_label] & result_dict[result_label])

    # calculate precision, recall, and f1-score for each label and average them
    total_precision = 0
    total_recall = 0
    total_f1_score = 0
    total_weight = 0

    for i, _ in enumerate(result_labels):
        tp = np.max(contingency_matrix[:, i])  # maximum element in column i (True Positives)
        best_match_index = np.argmax(contingency_matrix[:, i])  # get the index of the best match (TP - index)
        tp_plus_fp = np.sum(contingency_matrix[best_match_index, :])  # sum of elements in row of best match (TP + FP)
        tp_plus_fn = np.sum(contingency_matrix[:, i])  # sum of elements in column i (TP + FN)

        weight = tp_plus_fn  # the weight for each label is the total number of true positives plus false negatives

        if tp_plus_fp > 0:
            precision = tp / tp_plus_fp
            total_precision += weight * precision
        if tp_plus_fn > 0:
            recall = tp / tp_plus_fn
            total_recall += weight * recall

        total_weight += weight

    avg_precision = total_precision / total_weight
    avg_recall = total_recall / total_weight

    return avg_precision, avg_recall

In [14]:
def cosine_distance(emb1: List[float], emb2: List[float]) -> float:
    """
    Compute the cosine distance between two embeddings.

    Args:
        emb1 (List[float]): First embedding.
        emb2 (List[float]): Second embedding.

    Returns:
        float: Cosine distance between the two embeddings. Ranges between 0 (identical) and 2 (completely opposite).
    """
    return 1 - np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))


In [28]:
# 1. Number of Entries
def number_of_entries(column: pd.Series) -> int:
    return len(column)

# 2. Maximum Entry Length
def max_entry_length(column: pd.Series) -> int:
    return column.astype(str).str.len().max()

# 3. Average Alphabetic Characters per Entry
def avg_alpha_characters(column: pd.Series) -> float:
    return column.apply(lambda x: sum(c.isalpha() for c in str(x))).mean()

# 4. Proportion of Entries Containing Numbers
def proportion_of_numeric_entries(column: pd.Series) -> float:
    return column.apply(lambda x: str(x).replace(".", "", 1).isdigit()).mean()

# 5. Column Entropy
def column_entropy(column: pd.Series) -> float:
    value_counts = column.value_counts(normalize=True)
    return -np.sum(value_counts * np.log2(value_counts))

# 6. Proportion of Entries with Letters
def proportion_of_entries_with_letters(column: pd.Series) -> float:
    return column.astype(str).apply(lambda x: any(c.isalpha() for c in x)).mean()

# 7. Number of Empty Entries
def number_of_empty_entries(column: pd.Series) -> int:
    return column.isna().sum()

# 8. Average Length of Entries
def average_length_of_entries(column: pd.Series) -> float:
    return column.astype(str).str.len().mean()

# 9. Proportion of Distinct Values
def ratio_of_unique_entries(column: pd.Series) -> float:
    return column.nunique() / len(column)

# 10. Average Number of Numerical Characters per Entry
def avg_number_of_numerical_characters(column: pd.Series) -> float:
    return column.apply(lambda x: sum(c.isdigit() for c in str(x))).mean()

# 11-13. Mean, Median and Standard Deviation of Numeric Entries
def get_numeric_entries(column: pd.Series) -> pd.Series:
    numeric_entries = column.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    return numeric_entries[numeric_entries.notna()]

def mean_of_numeric_entries(column: pd.Series) -> float:
    numeric_entries = get_numeric_entries(column)
    return numeric_entries.mean()

def median_of_numeric_entries(column: pd.Series) -> float:
    numeric_entries = get_numeric_entries(column)
    return numeric_entries.median()

def std_dev_of_numeric_entries(column: pd.Series) -> float:
    numeric_entries = get_numeric_entries(column)
    return numeric_entries.std()

# 14. Proportion of Alphanumerical Entries
def proportion_of_alphanumeric_entries(column: pd.Series) -> float:
    alphanumeric_pattern = re.compile(r'\w')
    return column.astype(str).apply(lambda x: bool(alphanumeric_pattern.match(x))).mean()

# 15. Proportion of Numeric-only Entries
def proportion_of_integer_entries(column: pd.Series) -> float:
    return column.apply(lambda x: str(x).isdigit()).mean()

# 16. Proportion of Data Entries
def proportion_of_date_entries(column: pd.Series) -> float:
    date_pattern = re.compile(r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b')
    return column.astype(str).apply(lambda x: bool(date_pattern.match(x))).mean()

# 17. Proportion of Entries with Special Characters
def proportion_of_entries_with_special_characters(column: pd.Series) -> float:
    return column.astype(str).apply(lambda x: any(not c.isalnum() and not c.isspace() for c in x)).mean()

# 18. Proportion of Entries Containing Spaces
def proportion_of_entries_with_spaces(column: pd.Series) -> float:
    return column.astype(str).apply(lambda x: ' ' in x).mean()

# 19. Proportion of Link Entries
def proportion_of_link_entries(column: pd.Series) -> float:
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return column.astype(str).apply(lambda x: bool(url_pattern.search(x))).mean()

# 20. Proportion of Entries Initiating with Capital Letters
def proportion_of_capitalized_entries(column: pd.Series) -> float:
    return column.astype(str).apply(lambda x: x[0].isupper() if x else False).mean()

# 21. Proportion of Entries in Uppercase
def proportion_of_fully_capitalized_entries(column: pd.Series) -> float:
    return column.astype(str).apply(lambda x: x.isupper()).mean()

# 22. Proportion of Words Starting with a Capital Letter
def proportion_of_capitalized_words(column: pd.Series) -> float:
    return column.astype(str).apply(lambda x: sum(1 for word in x.split() if word.istitle())).mean()

feature_extractors = {
    "Number of Entries": number_of_entries,
    "Maximum Entry Length": max_entry_length,
    "Average Alphabetic Characters per Entry": avg_alpha_characters,
    "Proportion of Entries Containing Numbers": proportion_of_numeric_entries,
    "Column Entropy": column_entropy,
    "Proportion of Entries with Letters": proportion_of_entries_with_letters,
    "Number of Empty Entries": number_of_empty_entries,
    "Average Length of Entries": average_length_of_entries,
    "Proportion of Distinct Values": ratio_of_unique_entries,
    "Average Number of Numerical Characters per Entry": avg_number_of_numerical_characters,
    "Mean Numeric Value": mean_of_numeric_entries,
    "Median of Numeric Values": median_of_numeric_entries,
    "Spread of Numeric Values (Standard Deviation)": std_dev_of_numeric_entries,
    "Proportion of Alphanumerical Entries": proportion_of_alphanumeric_entries,
    "Proportion of Numeric-only Entries": proportion_of_integer_entries,
    "Proportion of Data Entries": proportion_of_date_entries,
    "Proportion of Entries with Special Characters": proportion_of_entries_with_special_characters,
    "Proportion of Entries Containing Spaces": proportion_of_entries_with_spaces,
    "Proportion of Link Entries": proportion_of_link_entries,
    "Proportion of Entries Initiating with Capital Letters": proportion_of_capitalized_entries,
    "Proportion of Entries in Uppercase": proportion_of_fully_capitalized_entries,
    "Proportion of Words Starting with a Capital Letter": proportion_of_capitalized_words
}

def extract_features(dataframes: list) -> pd.DataFrame:
    """
    Extracts features
    
    Args:
       dataframes (list): data from which features are extracted.
       
    Returs:
        pd.DataFrame: feature dataframe
    """
    # initialize an empty DataFrame to store the features
    features = pd.DataFrame(columns=feature_extractors.keys())
    
    for df in dataframes:
        
        # dataframe for features of current df
        temp_features = pd.DataFrame(columns=feature_extractors.keys())  
        
        for column_name, column_data in df.items():
            
            # convert column_data to a pd.Series of strings
            column_data_series = pd.Series(map(str, column_data))
            
            for feature_name, feature_extractor in feature_extractors.items():
                
                # loc at (column_name, feature_name)
                temp_features.loc[column_name, feature_name] = feature_extractor(column_data_series)
                
        # rowwise concatenation        
        features = pd.concat([features, temp_features]) 
        
    return features

In [21]:
def z_normalize_features(feature_df: pd.DataFrame) -> np.ndarray:
    """
    Z-normalize a set of features.

    Args:
        feature_df (pd.DataFrame): Extracted features

    Returns:
        np.ndarray: The z-normalized features.
    """
    # compute the mean and standard deviation for each feature (column).
    means = np.mean(feature_df, axis=0)
    stds = np.std(feature_df, axis=0)

    # z-normalize each feature in the feature set.
    z_normalized_features = (feature_df - means) / stds

    return z_normalized_features

In [24]:
def pre_clustering(feature_df: pd.DataFrame, n_clusters: int) -> Dict[int, List[int]]:
    """
    Pre-Clustering using Birch.
    
    Args:
        feature_df (pd.DataFrame): Z-normalized features.
        n_clusters (int): number of pre-clusters.
        
    Returns: 
        
        Dict[int, List[int]]: Dict[int, Set[int]]: A dictionary mapping cluster labels to sets of global indices that belong to each cluster.
    """
    
    # initializing Birch and fit it to the data
    birch_model = Birch(n_clusters=n_clusters)
    birch_model.fit(feature_df)

    # get the cluster labels
    labels = birch_model.labels_

    # create a dictionary that maps each cluster label to a list of column indices
    cluster_to_columns = defaultdict(list)

    for idx, label in enumerate(labels):
        cluster_to_columns[label].append(idx) # globale index

    return dict(cluster_to_columns)

In [32]:
def calculate_column_similarity(cluster: List[List], s) -> np.ndarray:
    """
    inner function
    """
    # convert cluster columns to dataframes for uniform processing
    dfs = [pd.DataFrame(col) for col in cluster]
    
    column_embeddings = {}
    count = 0
    for df in dfs:
        for col in df.columns:
            # sample a subset of entries from the column
            samples = df[col].dropna().astype(str)
            samples = samples.sample(min(len(samples), s))

            # Embed
            sample_embeddings = model.encode(samples.tolist())
            column_embeddings[count] = np.mean(sample_embeddings, axis=0)
            count += 1

    # compute the distance
    embedding_matrix = np.vstack(list(column_embeddings.values()))
    distance = 1 - cosine_similarity(embedding_matrix)
    
    return distance
        
    
def find_centroid(cluster: List[List], s) -> Tuple[int, np.ndarray]:
    """
    inner function
    """
    distance_matrix = calculate_column_similarity(cluster, s)
    centroid_intern_index = np.argmin(np.sum(distance_matrix, axis=0))
    return centroid_intern_index, distance_matrix

def compute_cluster_centroids(dataframes: List[pd.DataFrame],
                              cluster_to_columns: Dict[int, List[int]],
                              s: int,
                              index_to_tuple: Callable[[int], Tuple[int, int]]) -> Dict[int, Tuple[int, np.ndarray]]:
    """
    Compute the (pre-)cluster centroids and saves internal distance matrices 

    Args:
        dataframes (List[pd.DataFrame]): List of dataframes containing the data.
        cluster_to_columns (Dict[int, List[int]]): Mapping of global cluster IDs to lists of global column indices.
        s (int): sampling parameter
        index_to_tuple (Callable[[int], Tuple[int, int]]): global to local index map

    Returns:
        Dict[int, Tuple[int, np.ndarray]]: Dictionary where keys are cluster IDs and values are tuples consisting of:
            - The global index of the centroid column.
            - A numpy array (presumed to be a distance matrix for the centroid).

    """
    
    centroids = {}

    for cluster_id, column_indices in cluster_to_columns.items():
        cluster_columns = [dataframes[df_index].iloc[:, col_index].tolist() for df_index, col_index in map(index_to_tuple, column_indices)]
        centroid_intern_index, distance_matrix = find_centroid(cluster_columns, s)
        centroid_global_index = column_indices[centroid_intern_index]
        centroids[cluster_id] = (centroid_global_index, distance_matrix)

    return centroids

In [36]:
def compute_approximated_distance_matrix_optimized(dataframes: List[pd.DataFrame],
                                                   cluster_to_columns: Dict[int, List[int]], 
                                                   centroids: Dict[int, Tuple[int, np.ndarray]],
                                                   s: int,
                                                  index_to_tuple) -> np.ndarray:
    
    """
    Approximates general distance matrix
    
    Args:
        dataframes (List[pd.DataFrame]): List of dataframes containing the data.
        cluster_to_columns (Dict[int, List[int]]): Mapping of global cluster IDs to lists of global column indices.
        s (int): sampling parameter.
        index_to_tuple (Callable[[int], Tuple[int, int]]): global to local index map.
        
    Returns:
        np.ndarray: Approximated distance matrix. 
    """

    
    # initialize model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # create a dictionary to store the embeddings for each centroid column
    centroid_embeddings = {}
    
    # get embeddings for all centroid columns
    for cluster_id, (centroid_index, _) in centroids.items():
        double_index = index_to_tuple(centroid_index)
        centroid_column = dataframes[double_index[0]].iloc[:, double_index[1]]
        
        # sample s entries
        filtered_column = centroid_column.dropna()
        samples = filtered_column.sample(min(len(filtered_column), s)).astype(str)

        centroid_embeddings[cluster_id] = np.mean(model.encode(samples.tolist()), axis=0)
    
    # prepare the embedding matrix for centroids
    embedding_matrix = np.vstack(list(centroid_embeddings.values()))
    
    
    # calculate cosine distance for centroids
    centroid_distance_matrix = 1 - cosine_similarity(embedding_matrix)
    
    # initialization of approximated distance matrix
    n = sum(df.shape[1] for df in dataframes)
    approx_distance_matrix = np.zeros((n, n))
    
    # fill in the intra-cluster distances first
    for cluster_id, (_, distance_matrix) in centroids.items():
        cluster_columns = cluster_to_columns[cluster_id]
        approx_distance_matrix[np.ix_(cluster_columns, cluster_columns)] = distance_matrix
    
    # now handle the inter-cluster distances using the centroid distances
    for cluster_id1, cluster_id2 in itertools.combinations(centroids.keys(), 2):
        cluster_columns1 = cluster_to_columns[cluster_id1]
        cluster_columns2 = cluster_to_columns[cluster_id2]

        distance_between_centroids = centroid_distance_matrix[cluster_id1, cluster_id2]

        approx_distance_matrix[np.ix_(cluster_columns1, cluster_columns2)] = distance_between_centroids
        approx_distance_matrix[np.ix_(cluster_columns2, cluster_columns1)] = distance_between_centroids
    
    return approx_distance_matrix

In [39]:
def execute_script() -> dict:
    
    metrics = {}
    
    # path to the folder containing CSV files 
    base_dir = os.path.dirname(os.path.abspath("Baseline.ipynb"))
    folder_path = os.path.join(base_dir, "..","Data", "GitTables", "tables")

    # maximum number of files to be read
    file_limit = 1000

    # read dataframes from the folder
    tuple_dataframes = read_dataframes_from_folder(folder_path, file_limit)

    # reading in true labels for evaluation
    base_dir = os.path.dirname(os.path.abspath("Baseline.ipynb"))
    label_path = os.path.join(base_dir, "..", "Data", "GitTables", "dbpedia_gt.csv")
    label_df = pd.read_csv(label_path)
    
    # clean the dataset and drop columns without gold standard annotations
    clean_dataframes(tuple_dataframes)
    tuple_dataframes_copy = drop_columns_without_label(tuple_dataframes, label_df)

    random_sample = sample_columns(tuple_dataframes_copy, 2000)

    ground_truth_map = create_ground_truth_map(label_df, random_sample)
    
    dataframes = [df for df, _ in random_sample]

    
    index_lookup = {}
    count = 0
    for i in range(len(dataframes)):
        for j in range(len(dataframes[i].columns)):
            index_lookup[count] = (i, j)
            count += 1

    def index_to_tuple(index):
        return index_lookup[index]

    # execute SemJET

    s = 12
    num_pre_cluster = 60
    
    f = extract_features(dataframes)
    f_normalized = z_normalize_features(f.fillna(0))
    
    pre_cl = pre_clustering(f_normalized.fillna(0), num_pre_cluster)
    cluster_centroids = compute_cluster_centroids(dataframes, pre_cl, s, index_to_tuple)
    approx_d = compute_approximated_distance_matrix_optimized(dataframes, pre_cl, cluster_centroids, s, index_to_tuple)                 
    
    
    # clustering hierarchically
    num_hierarchical_clusters = len(ground_truth_map.keys())

    cl = hierarchical_clustering(approx_d, num_hierarchical_clusters)
    
    
    # evaluate clustering performance
    precision, recall = evaluate_micro(ground_truth_map, cl)
    metrics['precision'] = precision
    metrics['recall'] = recall
    metrics['f1_score'] = 2*(precision*recall)/(precision + recall)
    metrics['calculated_embeddings'] = s*(0.5 * num_pre_cluster*(1-num_pre_cluster) 
                                          + sum(0.5 * len(value) * (len(value) - 1) for value in pre_cl.values()))

    return metrics

In [1]:
# EXECUTE HERE

In [44]:
# ADJUST ACCORDING TO PREFERENCE
# FOR STABLE RESULT CONISDER num_iteraitons = 10,
# FOR QUICK EXECUTION num_itertaions = 1
num_iterations = 1

results = [execute_script() for _ in range(num_iterations)]
averages = {key: sum([result[key] for result in results]) / num_iterations for key in results[0]}
print(averages)

{'precision': 0.5674710390754522, 'recall': 0.5166163141993958, 'f1_score': 0.5408508746762136, 'calculated_embeddings': 12857916.0}
