# Evaluation of Frames in Semantic Space
This notebook is used to analyze whether the semantic vector spaces generated through sentence embeddings can capture semantic frames well.

# Import libraries

In [1]:
import os
import functools
import operator
import multiprocessing as mp

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from sentence_transformers import SentenceTransformer
from numpy import dot
from numpy.linalg import norm
from sklearn import metrics
from sklearn.metrics import pairwise_distances 

# Library functions and constants

In [2]:
FRAMENET_DATA_DIR = "../data/parsed_data/parsed_framenet.p"
EMBEDDINGS_DATA_DIR = "./embeddings"
ANALYSIS_DIR = "./analysis"

N_CORES = mp.cpu_count()

In [3]:
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

# Import FrameNet data

In [4]:
data_df = pd.read_pickle(FRAMENET_DATA_DIR)
data_df.head()

Unnamed: 0,corpus,document,sentence,word,phrase_type,semantic_frame
0,LUCorpus-v0.3,enron-thread-159550,"I have completed the invoices for April, May a...",complete,verb,Activity_finish
1,LUCorpus-v0.3,enron-thread-159550,"I have completed the invoices for April, May a...",May,noun,Calendric_unit
2,LUCorpus-v0.3,enron-thread-159550,"I have completed the invoices for April, May a...",June,noun,Calendric_unit
3,LUCorpus-v0.3,enron-thread-159550,"I have completed the invoices for April, May a...",month,noun,Calendric_unit
4,LUCorpus-v0.3,enron-thread-159550,"I have completed the invoices for April, May a...",total,noun,Amounting_to


# Generate embeddings for different models
Pre-trained models can be found on [Github](https://github.com/UKPLab/sentence-transformers#pretrained-models).

In [5]:
def generate_embeddings(embedding_dir, model_name, model, sentences):
    """
    Creates and saves out embeddings for a given model, if embeddings are not found.
    
    Args:
        embedding_dir (string): directory where embedding files should be found.
        model_name (string): name of model to use to generate embeddings.
        model (SentneceTransformer): model to generate embeddings with.
        sentences (list of string): list of sentences to generate embeddings for.
        
    Returns:
        (Pandas DataFrame): Pandas DataFrame containing two columns: sentences, and embeddings.
    """
    # check if embedding file for model already exists
    embedding_filename = "{model}_embeddings.p".format(model=model_name)
    embedding_filepath = os.path.join(embedding_dir, embedding_filename)
    
    if os.path.isfile(embedding_filepath) and os.access(embedding_filepath, os.R_OK):
        print("Embeddings for {model_name} already exist at {file_path}...loading data.".format(model_name=model_name,
                                                                                                file_path=embedding_filepath))
        return pd.read_pickle(embedding_filepath)
    
    # generate embeddings for sentences
    embeddings = model.encode(sentences)
    
    # create dataframe from sentences and embeddings
    embeddings_df = pd.DataFrame({
        "sentence": sentences,
        "embedding": embeddings
    })
    
    # save to file
    embeddings_df.to_pickle(embedding_filepath)
    
    # return dataframe
    return embeddings_df

def get_embeddings(embedding_dir, model_name):
    """
    Returns embeddings for specified model name, or None if not there.
    
    Inputs:
        embedding_dir (string): directory where embedding files should be found.
        model_name (string): name of model to fetch embeddings for.
        
    Output:
        (Pandas DataFrame): Pandas DataFrame containing two columns: sentences, and embeddings.
    """
    # check if embedding file for model already exists
    embedding_filename = "{model}_embeddings.p".format(model=model_name)
    embedding_filepath = os.path.join(embedding_dir, embedding_filename)
    
    if os.path.isfile(embedding_filepath) and os.access(embedding_filepath, os.R_OK):
        return pd.read_pickle(embedding_filepath)
    
    print("Embeddinds for {model_name} not found at {file_path}.".format(model_name=model_name, file_path=embedding_filepath))
    return None

In [6]:
# generate embeddings using all models, if they don't already exist
bert_models = ["bert-base-nli-mean-tokens", "bert-large-nli-mean-tokens", "bert-base-nli-stsb-mean-tokens", "bert-large-nli-stsb-mean-tokens"]
sentences = data_df["sentence"].unique()

for model_str in bert_models:
    generate_embeddings(EMBEDDINGS_DATA_DIR, model_str, SentenceTransformer(model_str), sentences)

Embeddings for bert-base-nli-mean-tokens already exist at ./embeddings/bert-base-nli-mean-tokens_embeddings.p...loading data.
Embeddings for bert-large-nli-mean-tokens already exist at ./embeddings/bert-large-nli-mean-tokens_embeddings.p...loading data.
Embeddings for bert-base-nli-stsb-mean-tokens already exist at ./embeddings/bert-base-nli-stsb-mean-tokens_embeddings.p...loading data.
Embeddings for bert-large-nli-stsb-mean-tokens already exist at ./embeddings/bert-large-nli-stsb-mean-tokens_embeddings.p...loading data.


## Import embeddings

In [7]:
bert_base_nli_mean_tokens_embedding_df = get_embeddings(EMBEDDINGS_DATA_DIR, "bert-base-nli-mean-tokens")
bert_large_nli_mean_tokens_embedding_df = get_embeddings(EMBEDDINGS_DATA_DIR, "bert-large-nli-mean-tokens")
bert_base_nli_stsb_mean_tokens_embedding_df = get_embeddings(EMBEDDINGS_DATA_DIR, "bert-base-nli-stsb-mean-tokens")
bert_large_nli_stsb_mean_tokens_embedding_df = get_embeddings(EMBEDDINGS_DATA_DIR, "bert-large-nli-stsb-mean-tokens")

In [8]:
bert_models = ["bert-base-nli-mean-tokens", "bert-large-nli-mean-tokens", "bert-base-nli-stsb-mean-tokens", "bert-large-nli-stsb-mean-tokens"]
model_df_dict = { name: get_embeddings(EMBEDDINGS_DATA_DIR, name) for name in bert_models }

## Cluster Analysis Code

In [9]:
def intra_cluster_analysis(X, labels, metric):
    """
    Performs intra cluster analysis, returning avg and max of distances of points within a cluster.
    
    Args:
        X: list of all sentences
        labels: list of all cluster labels corresponding to the sentence in X 
            (examples: ["ANC", "Miscellaneous", ...] OR [<list of semantic frames>]
        metric: metric string as understood by sklearn (manhattan / cosine / euclidean)

    Returns:
        dict with {cluster_label : {avg, max}}
    """
    # make sure labels are unique so we don't duplicate computation
    unique_labels = np.unique(labels)
    result = {label : {} for label in unique_labels}
    
    # add all sentences that fit a label
    classified_sentences = {label : [] for label in unique_labels}
    for x, label in zip(X, labels):
        classified_sentences[label].append(x)
        
    # compute intra-cluster distances
    for label, sentences in classified_sentences.items():
        # pairwise distance calculation
        distances = pairwise_distances(sentences, sentences, metric=metric, n_jobs=-1)
        
        # compute average diameter distance
        n_sentences = len(sentences)
        num_combinations = n_sentences * (n_sentences - 1)
        result[label]["avg"] = np.sum(distances) / num_combinations
        
        # compute complute diameter distance
        result[label]["max"] = np.max(distances)

    return result

def compute_cluster_similarity(framenet_df, embeddings_df, filter_col, metric, output_dir, output_filename):
    """
    Computes measures of cluster similarity within and between corpuses.
    
    Args: 
        framenet_df (Pandas DataFrame): data from FrameNet that contains (1) corpus; (2) documents; (3) semantic frames; and (4) sentences.
        embeddings_df (Pandas DataFrame): sentences and their embeddings.
        filter_col (string): column to do clustering on.
        metric (string): distane meteric to use when computing similarity as understood by sklearn (manhattan / cosine / euclidean).
        output_dir (string): directory to save analysis file to.
        output_filepath (string): filename to save csv to.
    
    Returns:
        (Pandas DataFrame): DataFrame of evaluation metrics for each cluster.
    """
    # create a DataFrame with corpus, sentences, and embeddings
    data_df = pd.merge(framenet_df[[filter_col, "sentence"]].drop_duplicates(),
                       embeddings_df, on="sentence", how="inner")
    X = np.vstack(data_df["embedding"].values)
    labels = data_df[filter_col].values
    
    # compute silhouette value overall and for each corpus
    data_df["silhouette"] = metrics.silhouette_samples(X, labels, metric=metric, n_jobs=-1)
        
    overall_silhouette = np.mean(data_df["silhouette"])
    silhouette_bycorpus = data_df.groupby(filter_col).agg({"silhouette": "mean"}).reset_index()
    
    # compute intra-cluster measures
    intra_cluster_measures = pd.DataFrame(intra_cluster_analysis(X, labels, metric)).T.reset_index()
    
    # combine data into one table
    output = silhouette_bycorpus.merge(intra_cluster_measures, left_on=filter_col, right_on="index",
                                       how="inner")
    del output["index"]
    
    # add overall silhouette
    output = pd.concat([output, pd.DataFrame({
        filter_col: ["OVERALL"],
        "silhouette": [overall_silhouette],
        "avg": [0],
        "max": [0]
    })], ignore_index=True)
    
    # cleanup
    output.rename(columns={"avg": "average_diameter_dist", "max": "complete_diameter_dist"}, inplace=True)
    
    # save out file and return
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)        
    output.to_csv(os.path.join(output_dir, output_filename) + ".csv", index=False)
    
    return output

def fetch_analysis(input_dir, input_filename):
    """
    Fetches a csv file with analysis data in it.
    
    Args:
        input_dir (string): directory where file is located. 
        input_filename (string): name of file to pull.
        
    Returns:
        (Pandas DataFrame): dataframe of parsed csv file specified. 
        
    """
    return pd.read_csv(os.path.join(input_dir, input_filename) + ".csv")

# Analysis: Corpus-Level Clusters

In [10]:
facet = "corpus"
for model_name, model_df in tqdm(model_df_dict.items()):
    compute_cluster_similarity(data_df, model_df, facet, "manhattan", ANALYSIS_DIR,
                               "{0}_{1}-cluster-similarity".format(model_name, facet))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [11]:
# fetch data from biggest model
filename = "bert-large-nli-stsb-mean-tokens_corpus-cluster-similarity"
corpus_cluster_measures_df = fetch_analysis(ANALYSIS_DIR, filename)
corpus_cluster_measures_df

Unnamed: 0,corpus,silhouette,average_diameter_dist,complete_diameter_dist
0,ANC,0.008203,662.529536,836.551697
1,KBEval,-0.004573,660.564252,826.244263
2,LUCorpus-v0.3,-0.011673,674.170072,844.236816
3,Miscellaneous,-0.015831,665.564501,827.955383
4,NTI,0.028401,637.321185,833.647095
5,PropBank,0.0079,660.21151,828.987671
6,WikiTexts,0.016205,656.214338,811.79657
7,OVERALL,0.006814,0.0,0.0


# Analysis: Document-Level Clusters

In [12]:
facet = "document"
for model_name, model_df in tqdm(model_df_dict.items()):
    compute_cluster_similarity(data_df, model_df, facet, "manhattan", ANALYSIS_DIR,
                               "{0}_{1}-cluster-similarity".format(model_name, facet))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [13]:
# fetch data from biggest model
filename = "bert-large-nli-stsb-mean-tokens_document-cluster-similarity"
corpus_cluster_measures_df = fetch_analysis(ANALYSIS_DIR, filename)
corpus_cluster_measures_df

Unnamed: 0,document,silhouette,average_diameter_dist,complete_diameter_dist
0,110CYL067,-0.052127,639.936235,805.591003
1,110CYL068,-0.012957,617.426221,764.979309
2,110CYL069,-0.033120,626.213563,766.615051
3,110CYL070,-0.016470,616.122651,754.894592
4,110CYL072,-0.053550,645.051373,762.660522
...,...,...,...,...
98,utd-icsi,-0.010833,607.506115,807.416504
99,workAdvances,0.024419,577.842899,714.478333
100,wsj_1640.mrg-NEW,0.024678,589.971900,721.458984
101,wsj_2465,-0.025633,614.856052,776.315796


# Analysis: Frame-Level Clusters

In [14]:
facet = "semantic_frame"
for model_name, model_df in tqdm(model_df_dict.items()):
    compute_cluster_similarity(data_df, model_df, facet, "manhattan", ANALYSIS_DIR,
                               "{0}_{1}-cluster-similarity".format(model_name, facet))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [15]:
# fetch data from biggest model
filename = "bert-large-nli-stsb-mean-tokens_semantic_frame-cluster-similarity"
corpus_cluster_measures_df = fetch_analysis(ANALYSIS_DIR, filename)
corpus_cluster_measures_df

Unnamed: 0,semantic_frame,silhouette,average_diameter_dist,complete_diameter_dist
0,Abandonment,-0.249115,610.718471,676.590942
1,Abounding_with,-0.196890,652.660740,774.395813
2,Abundance,-0.255572,653.900269,653.900269
3,Abusing,0.061572,466.111694,466.111694
4,Accompaniment,-0.242196,671.516797,764.844360
...,...,...,...,...
793,Win_prize,0.000000,,0.000000
794,Withdraw_from_participation,-0.060547,515.155699,646.293091
795,Within_distance,0.000000,,0.000000
796,Work,-0.186387,636.969460,765.061157
