# Evaluation of Frames in Semantic Space







## Import libraries

In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn import metrics

I0528 18:28:48.515056 4590175680 file_utils.py:39] PyTorch version 1.5.0 available.
  from ._conv import register_converters as _register_converters


## Library functions

## Import FrameNet data

In [17]:
# Locations of the two pickled inputs used throughout the notebook.
# NOTE(review): despite the `_DIR` suffix, both constants are paths to pickle
# files, not directories.
FRAMENET_DATA_DIR = "../data/parsed_data/parsed_framenet.p"
EMBEDDINGS_DATA_DIR = "./results/embeddings.p"

# Parsed FrameNet table. Later cells read its "sentence", "corpus", "document"
# and "semantic_frame" columns.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
data_df = pd.read_pickle(FRAMENET_DATA_DIR)

## Import models and save embeddings

In [4]:
# Disabled embedding-generation step. When enabled it encodes every unique
# sentence in data_df with a Sentence-BERT model and writes a DataFrame with
# "sentence" and "embedding" columns to EMBEDDINGS_DATA_DIR, which is the file
# the next cell reads back.
# NOTE(review): presumably left commented out because encoding is slow and the
# result is cached on disk -- confirm before re-enabling or deleting.

# sentence_bert = SentenceTransformer('bert-large-nli-stsb-mean-tokens')
# sentences = data_df["sentence"].unique()
# embeddings = sentence_bert.encode(sentences)

# embeddings_df = pd.DataFrame({
#     "sentence": sentences,
#     "embedding": embeddings
# })
# embeddings_df.to_pickle(EMBEDDINGS_DATA_DIR)

## Import embeddings

In [18]:
# Load the cached sentence embeddings from disk rather than re-encoding.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
embeddings_df = pd.read_pickle(EMBEDDINGS_DATA_DIR)
# Preview the first rows (one "sentence" and one "embedding" column per row).
embeddings_df.head()

Unnamed: 0,sentence,embedding
0,"I have completed the invoices for April, May a...","[-0.09920554, 0.4483571, 0.41333553, -0.380651..."
1,I am waiting to hear back from Patti on May an...,"[0.61390066, 0.5339698, 0.13365825, 0.5345548,..."
2,Do you want me to pay Pasadena on Friday for t...,"[0.42727828, 0.29827163, 0.90462327, 0.5840608..."
3,"Again , I do not have all of the information f...","[1.0356739, 0.5165068, 1.3579259, -0.06432419,..."
4,If I go by what is currently in the system as ...,"[-0.09869083, 1.3356009, 0.105985045, -0.58211..."


## Corpus analysis

In [7]:
########### NAIVE SILHOUETTE COEFFICIENT ##########

# Cosine distance depends only on the angle between two vectors, not on their
# lengths, so the embeddings do not have to be normalized before being passed
# to the silhouette functions with metric="cosine".

# Label every embedded sentence with the corpus (or corpora) it appears in.
sentence_corpus_pairs = data_df[["corpus", "sentence"]].drop_duplicates()
corpus_df = pd.merge(embeddings_df, sentence_corpus_pairs, on="sentence", how="inner")
print(corpus_df["corpus"].unique())
# Raw component sum of the first embedding (sanity peek at the vector values).
print(sum(embeddings_df["embedding"].values[0]))

# Variants with one corpus left out, to see how much that corpus drives the score.
corpus_df_no_misc = corpus_df[corpus_df["corpus"] != "Miscellaneous"]
corpus_df_no_nti = corpus_df[corpus_df["corpus"] != "NTI"]
print(corpus_df_no_misc["corpus"].unique())

# Per-sentence silhouettes for the full corpus. This is the expensive pairwise
# step, so it is computed exactly once; the overall silhouette score is by
# definition the mean of these per-sample values.
corpus_silhouettes = metrics.silhouette_samples(
    np.vstack(corpus_df["embedding"].values),
    corpus_df["corpus"].values,
    metric="cosine",
)
print(np.mean(corpus_silhouettes))

# The leave-one-corpus-out variants have different cluster sets, so each needs
# its own computation.
print(metrics.silhouette_score(
    np.vstack(corpus_df_no_misc["embedding"].values),
    corpus_df_no_misc["corpus"].values,
    metric="cosine",
))
print(metrics.silhouette_score(
    np.vstack(corpus_df_no_nti["embedding"].values),
    corpus_df_no_nti["corpus"].values,
    metric="cosine",
))

print(len(corpus_silhouettes))

corpus_df["silhouette"] = corpus_silhouettes

# Mean silhouette per corpus.
corpus_df.groupby("corpus").agg({"silhouette": "mean"})


['LUCorpus-v0.3' 'ANC' 'WikiTexts' 'NTI' 'PropBank' 'Miscellaneous'
 'KBEval']
2.1818687996128574
['LUCorpus-v0.3' 'ANC' 'WikiTexts' 'NTI' 'PropBank' 'KBEval']
0.010872502
0.019707711
0.007942511
4967


Unnamed: 0_level_0,silhouette
corpus,Unnamed: 1_level_1
ANC,0.017961
KBEval,-0.010111
LUCorpus-v0.3,-0.02241
Miscellaneous,-0.031589
NTI,0.043976
PropBank,0.015435
WikiTexts,0.038064


## Document Analysis

In [8]:
# Restrict to the ANC corpus and label each embedded sentence with its document.
only_ANC = data_df[data_df["corpus"] == "ANC"][["sentence", "document"]].drop_duplicates()
document_df = pd.merge(embeddings_df, only_ANC, on="sentence", how="inner")
print(document_df["document"].unique())

# Per-sentence silhouettes with documents as clusters. Computed once: the
# overall silhouette score is the mean of the per-sample values, so there is no
# need to run the pairwise-distance computation a second time.
document_silhouettes = metrics.silhouette_samples(
    np.vstack(document_df["embedding"].values),
    document_df["document"].values,
    metric="cosine",
)
print(np.mean(document_silhouettes))

document_df["silhouette"] = document_silhouettes

# Mean silhouette and number of sentences per document.
document_df.groupby("document").agg({"silhouette": "mean", "sentence": "count"})


['HistoryOfLasVegas' '110CYL200' 'chapter1_911report' '112C-L013'
 'journal_christine' '110CYL070' '110CYL067' 'StephanopoulosCrimes'
 '110CYL072' 'HistoryOfGreece' 'HistoryOfJerusalem' '110CYL069'
 'WhereToHongKong' '110CYL068' 'IntroOfDublin' 'EntrepreneurAsMadonna'
 'IntroHongKong' 'WhatToHongKong' 'IntroJamaica']
0.015043921


Unnamed: 0_level_0,silhouette,sentence
document,Unnamed: 1_level_1,Unnamed: 2_level_1
110CYL067,-0.072538,39
110CYL068,0.01521,30
110CYL069,-0.040206,39
110CYL070,-0.020467,18
110CYL072,-0.087091,12
110CYL200,0.029374,20
112C-L013,0.009391,25
EntrepreneurAsMadonna,0.012219,30
HistoryOfGreece,0.045248,134
HistoryOfJerusalem,0.086931,113


## Frame analysis: WhereToHongKong document, frames with count > 20

In [15]:
# (sentence, frame) rows from the WhereToHongKong document only.
hongkong_frames = data_df[data_df["document"] == "WhereToHongKong"][["sentence", "semantic_frame"]]

# Number of rows per frame; keep the frames that occur more than 20 times.
count_df = hongkong_frames.groupby("semantic_frame").agg("count")
frame_names = count_df[count_df["sentence"] > 20].index.values

frequent_frames = hongkong_frames[hongkong_frames["semantic_frame"].isin(frame_names)]

print(frame_names)

# Attach the sentence embedding to every remaining (sentence, frame) row.
frames_df = pd.merge(embeddings_df, frequent_frames, how="inner", on="sentence")

# Per-row silhouettes with frames as clusters. Computed once: the overall
# silhouette score is the mean of the per-sample values.
# NOTE(review): this cell uses metric="manhattan" while the corpus and document
# analyses use "cosine" -- confirm the difference is intentional.
frame_silhouettes = metrics.silhouette_samples(
    np.vstack(frames_df["embedding"].values),
    frames_df["semantic_frame"].values,
    metric="manhattan",
)
print(np.mean(frame_silhouettes))
frames_df["silhouette"] = frame_silhouettes

# Mean silhouette per semantic frame.
frames_df.groupby("semantic_frame").agg({"silhouette": "mean"})

['Age' 'Aggregate' 'Arriving' 'Building' 'Buildings' 'Calendric_unit'
 'Cardinal_numbers' 'Desirability' 'Existence' 'Food' 'Frequency'
 'Increment' 'Interior_profile_relation' 'Leadership' 'Locale'
 'Locale_by_use' 'Locative_relation' 'Measure_duration' 'Natural_features'
 'Origin' 'Part_orientational' 'People' 'Physical_artworks'
 'Political_locales' 'Possibility' 'Quantified_mass' 'Roadways'
 'Self_motion' 'Size' 'Spatial_contact' 'Temporal_collocation' 'Touring'
 'Vehicle']
-0.04239978413401968


Unnamed: 0_level_0,silhouette
semantic_frame,Unnamed: 1_level_1
Age,-0.040933
Aggregate,-0.064323
Arriving,-0.043442
Building,-0.050173
Buildings,-0.050338
Calendric_unit,-0.043079
Cardinal_numbers,-0.05728
Desirability,-0.05824
Existence,-0.065526
Food,-0.016235


## Intersection analysis