In [1]:
# Declare whether you are on Colab or local.
# True  -> the setup cell mounts Google Drive, sets data_path and installs packages.
# False -> the cells that branch on this flag read from relative ../Data paths instead.
colab = True

In [None]:
if colab==True:
  
  #Mount drive
  from google.colab import drive
  drive.mount('/content/drive')

  #set path to data in Google Drive
  data_path = "/content/drive/MyDrive/2022_Analytics Lab Student Projects/Data/All Topics"

  #install required packages
  !pip install duckdb
  !pip install -U sentence-transformers

In [3]:
# Import sentence_transformers 
from sentence_transformers import SentenceTransformer, util

In [4]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import duckdb
import sqlalchemy
#%load_ext sql

### Load data

#### Sentences

In [148]:
# Sentences (text & id)
# data_path is only defined when colab is True, so use the same Colab/local
# switch as the other data loaders instead of failing with a NameError locally.
if colab:
  sentences = pd.read_csv(data_path+"/Fulltext of Corpus/sentences.csv")
else:
  # NOTE(review): local folder name assumed from the naming pattern of the other
  # local paths in this notebook (spaces removed) - confirm against the local data folder.
  sentences = pd.read_csv("../Data/FulltextofCorpus/sentences.csv")

# Ignore sentence types that carry no text of interest (open question: which other types can be ignored?)
sentences_clean = sentences.loc[~sentences.sentence_type.isin(["EMPTY", "TAG"])]
del sentences  # release the raw frame, only the cleaned one is used from here on

# Drop duplicate sentences (we only care about different sentences)
sentences_clean = sentences_clean.drop_duplicates(subset="sentence")

In [149]:
# Restrict the cleaned sentences to the columns of interest
columns_of_interest = ["article_id", "sentence_id", "sentence"]
sentences_clean = sentences_clean[columns_of_interest]

In [7]:
# Keep only the sentences that belong to article 1
is_article_1 = sentences_clean["article_id"].eq(1)
sentences_art1 = sentences_clean.loc[is_article_1]

In [8]:
# create list to work with similarity function 
# all_sentences = sentences_art1.sentence.tolist()

In [None]:
# sentence ids to decode sentence id to text 
# sent_ids = sentences_art1.sentence_id.to_list()

#### Ontology (entities + synonyms)

In [10]:
# Choose the ontology CSV for the current environment, then load it once
# (the first CSV column is used as the index).
if colab==True:
  ontology_csv = data_path+"/Synonyms in Ontology/isontology.csv"
else:
  ontology_csv = "../Data/SynonymsinOntology/isontology.csv"
ontology_syn = pd.read_csv(ontology_csv, index_col=0)
ontology_syn.shape

(397946, 4)

In [11]:
# Number of distinct synonyms per entity_id, largest first
synonyms_per_entity = ontology_syn.groupby("entity_id")[["synonym"]].nunique()
ent_sy_group = synonyms_per_entity.sort_values(by="synonym", ascending=False)

In [24]:
# Downsample entities with more than MAX_SYNONYMS synonyms to MAX_SYNONYMS synonyms
# (the row whose synonym equals the entity_id, i.e. the "main entity_id", is always kept).
MAX_SYNONYMS = 100
SAMPLE_SEED = 42  # fixed seed so that Restart & Run All reproduces the same subsample

idx = ent_sy_group[ent_sy_group["synonym"]>MAX_SYNONYMS].index.tolist()

dfs = []
for entity in idx:
  entity_rows = ontology_syn[ontology_syn.entity_id==entity]
  is_main = entity_rows.synonym==entity
  t1 = entity_rows[is_main]  # row(s) where the synonym is the entity name itself
  t2 = entity_rows[~is_main].sample(MAX_SYNONYMS-1, random_state=SAMPLE_SEED)  # random subset of the remaining synonyms
  dfs.append(pd.concat([t1,t2]))

# Entities that are not downsampled are appended unchanged
dfs.append(ontology_syn.loc[~ontology_syn["entity_id"].isin(idx)])

# Combine all dataframes
ontology_syn_ds = pd.concat(dfs).reset_index(drop=True)

In [31]:
# Show every row whose synonym occurs more than once in the downsampled ontology, ordered by synonym
shared_synonym_mask = ontology_syn_ds["synonym"].duplicated(keep=False)
ontology_syn_ds.loc[shared_synonym_mask].sort_values(by="synonym")

Unnamed: 0,entity_id,category,label,synonym
25235,Pearson's chi squared test of independence bet...,methodological entity,ANALYSIS_METHOD,Chi2 test for independence
25519,Pearson's Chi square test,methodological entity,ANALYSIS_METHOD,Chi2 test for independence
25230,Pearson's chi squared test of independence bet...,methodological entity,ANALYSIS_METHOD,Chi2 test for independences
25514,Pearson's Chi square test,methodological entity,ANALYSIS_METHOD,Chi2 test for independences
12174,systems operations,domain specific entity,TOPIC,EDP operation
...,...,...,...,...
30010,website,domain specific entity,TECHNOLOGY,web sites
25206,chi squared test,methodological entity,ANALYSIS_METHOD,χ 2
9371,Chi square metric,methodological entity,METRIC,χ 2
25209,chi squared test,methodological entity,ANALYSIS_METHOD,χ 2S


In [32]:
# Downsampled ontology with one row per synonym: when a synonym occurs several times
# (there are some duplicates in there), only its first occurrence is kept.
is_repeated_synonym = ontology_syn_ds["synonym"].duplicated(keep="first")
ontology_syn_ds_unique = ontology_syn_ds[~is_repeated_synonym]

#### Entities (sentence_id + tagged entity_id) from Roland's script

In [33]:
# Open the DuckDB database with the extracted entities.
# Note: DBeaver must be closed before this cell is run!
if colab==True:
  duckdb_file = data_path+'/Extracted Information of Corpus/isrecon.duckdb'
else:
  duckdb_file = '../Data/ExtractedInformationofCorpus/isrecon.duckdb'
con = duckdb.connect(database=duckdb_file, read_only=False)

In [34]:
# Fetch all tagged entities of the first article as a DataFrame
article_1_query = """
                       SELECT *
                       FROM entities e
                       WHERE e.article_id = 1"""
entities_sample_article = con.execute(article_1_query).fetchdf()

entities_sample_article.shape

(132, 46)

In [35]:
# Peek at the first five tagged entities of article 1
entities_sample_article.head(n=5)

Unnamed: 0,article_id,para_id,sentence_start,sentence_id,section_title,subsection_title,label,ent_id,level_1,level_2,...,attr_double,attr_stage,attr_type,attr_temporal,attr_assumption,attr_dsr,attr_paired,attr_setting,attr_level,attr_tool
0,1,1_3,322,1_322_354,Introduction,,TOPIC,knowledge repository,domain specific entity,IS topic,...,0,0,0,0,0,0,0,0,0,0
1,1,1_3,467,1_467_502,Introduction,,TOPIC,digital platform,domain specific entity,IS topic,...,0,0,0,0,0,0,0,0,0,0
2,1,1_3,502,1_502_527,Introduction,,TECHNOLOGY,Salesforce.com,domain specific entity,IS technology,...,0,0,0,0,0,0,0,0,0,0
3,1,1_4,530,1_530_551,Introduction,,TOPIC,participatory design,domain specific entity,IS topic,...,0,0,0,0,0,0,0,0,0,0
4,1,1_5,861,1_861_886,Introduction,,PARTICIPANTS,group participant,domain specific entity,study object,...,0,0,0,0,0,0,0,0,1,0


### Load model

In [36]:
# Pre-trained sentence-embedding model used for all similarity computations below
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

### Create similarity functions

The sentence-transformers package offers different functions to compute cosine similarity:
- `util.semantic_search`
- `util.cos_sim`

In [62]:
# create function based on semantic_search
def _column_or_position(frame, name, position):
  '''Return column `name` of `frame` as a NumPy array; if no column has that name, use the column at `position`.'''
  if name in frame.columns:
    return frame[name].to_numpy()
  return frame.iloc[:, position].to_numpy()


def sem_search_sent_ent(model, sentences, entities, top_k = 1):
  '''Find the `top_k` most similar ontology synonyms for every sentence via `util.semantic_search`.

  Parameters
  ----------
  model : object with an `encode(texts, convert_to_tensor=..., show_progress_bar=...)` method
  sentences : DataFrame with a `sentence` column (text to embed) and the sentence ids
      (column `sentence_id`; if absent, the column at position 1 is used)
  entities : DataFrame with a `synonym` column (text to embed) and the main entity ids
      (column `entity_id`; if absent, the column at position 0 is used)
  top_k : number of best-matching synonyms returned per sentence

  Returns
  -------
  DataFrame sorted by descending score with the columns
  `score`, `sentence_id`, `entity_id` (text of the matched synonym), `sentence_txt`
  and `main_entity_id` (entity the matched synonym belongs to).
  An empty `sentences` frame yields an empty frame with these columns.
  '''
  result_columns = ["score", "sentence_id", "entity_id", "sentence_txt", "main_entity_id"]

  sentence_texts = sentences["sentence"].to_list()
  synonym_texts = entities["synonym"].to_list()

  # Nothing to match: return an empty result instead of failing later on
  if len(sentence_texts) == 0:
    return pd.DataFrame(columns=result_columns)

  #Compute embeddings
  embeddings_sentences = model.encode(sentence_texts, convert_to_tensor=True, show_progress_bar=True)
  embeddings_entities = model.encode(synonym_texts, convert_to_tensor=True, show_progress_bar=True)

  #Compute cosine-similarities for each sentence with all entities and return top k per sentence
  hits_per_sentence = util.semantic_search(embeddings_sentences, embeddings_entities, top_k=top_k)

  # Flatten the per-sentence hit lists into parallel position/score sequences
  # (one entry per sentence/synonym match, in the order returned by semantic_search)
  sentence_pos = np.asarray([pos for pos, hits in enumerate(hits_per_sentence) for _ in hits], dtype=np.intp)
  entity_pos = np.asarray([hit["corpus_id"] for hits in hits_per_sentence for hit in hits], dtype=np.intp)
  scores = [hit["score"] for hits in hits_per_sentence for hit in hits]

  # Look up ids and texts by column name so that the returned text is always the text that was embedded
  sentence_ids = _column_or_position(sentences, "sentence_id", 1)
  main_entity_ids = _column_or_position(entities, "entity_id", 0)
  sentence_txt = sentences["sentence"].to_numpy()
  synonyms = entities["synonym"].to_numpy()

  matches = pd.DataFrame({
      "score": scores,
      "sentence_id": sentence_ids[sentence_pos],
      "entity_id": synonyms[entity_pos],
      "sentence_txt": sentence_txt[sentence_pos],
      "main_entity_id": main_entity_ids[entity_pos],
  }, columns=result_columns)

  # sort by score and reset index
  return matches.sort_values(by="score", ascending=False, ignore_index=True)

In [78]:
def cos_sim_sent_ent(model, sentences, entities):
  '''Compute the cosine similarity between every sentence and every entity synonym via `util.cos_sim`.

  Parameters
  ----------
  model : object with an `encode(texts, convert_to_tensor=..., show_progress_bar=...)` method
  sentences : DataFrame with a `sentence` column
  entities : DataFrame with a `synonym` column

  Returns
  -------
  DataFrame with one row per sentence/synonym pair, sorted by descending score, with the columns
  `score`, `sentence_id` (row position of the sentence within `sentences`, not its id string),
  `entity_id` (text of the synonym) and `txt_sent` (text of the sentence).
  '''

  sentences_list = sentences.sentence.to_list()
  entities_list = entities.synonym.to_list()

  #Compute embeddings
  embeddings_sentences = model.encode(sentences_list, convert_to_tensor=True, show_progress_bar=True)
  embeddings_entities = model.encode(entities_list, convert_to_tensor=True, show_progress_bar=True)

  #Compute cosine-similarities for each sentence with each other entity
  cosine_scores = util.cos_sim(embeddings_sentences, embeddings_entities)
  cosine_scores = cosine_scores.cpu().detach().numpy() #convert to numpy

  # Grid positions (sentence row, entity column) of every cell of the score matrix
  no_rows, no_cols = cosine_scores.shape
  row, col = np.indices((no_rows, no_cols))

  # One row per comparison (similarity between one sentence and one entity), best matches first
  pairs_df = pd.DataFrame({
      "score": cosine_scores.reshape(-1).astype("float64"),
      "sentence_id": row.reshape(-1),
      "entity_id": col.reshape(-1),
  }).sort_values(by="score", ascending=False)

  # Recreate the texts with a vectorised positional lookup instead of a Python call per row
  sentence_texts = np.array(sentences_list, dtype=object)
  entity_texts = np.array(entities_list, dtype=object)
  pairs_df["txt_sent"] = sentence_texts[pairs_df["sentence_id"].to_numpy()]
  pairs_df["entity_id"] = entity_texts[pairs_df["entity_id"].to_numpy()]

  return pairs_df.reset_index(drop=True)

In [135]:
%%time
sim_scores_art1 = sem_search_sent_ent(model, sentences_art1, ontology_syn_ds_unique, top_k=3)

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/1111 [00:00<?, ?it/s]

CPU times: user 10.5 s, sys: 302 ms, total: 10.8 s
Wall time: 10.7 s


In [79]:
%%time
test2 = cos_sim_sent_ent(model, sentences_art1, ontology_syn_ds_unique)

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/1111 [00:00<?, ?it/s]

CPU times: user 25.4 s, sys: 1.54 s, total: 26.9 s
Wall time: 27.2 s


☝ Based on the execution time, the semantic search function should be preferred.

### Check performance of similarity scores

To determine the range of cosine similarity scores we want to keep for further processing (so that the user is only shown matches that are "fairly" similar to entities from the ontology), we compare the "closest" entity with the "true" label from Roland's & Sebastian's DB.

In [136]:
# Remove the column-width limit so that whole sentences are readable in displayed frames
pd.options.display.max_colwidth = None

In [137]:
# The two highest-scoring sentence/synonym matches
sim_scores_art1.head(n=2)

Unnamed: 0,score,sentence_id,entity_id,sentence_txt,main_entity_id
0,1.0,1_11252_11254,variable,Variable \n,variable
1,1.0,1_11663_11666,research question,Research Question \n,research question


In [138]:
# Attach the entity tagged in the DB to every match of the same sentence
# (left join: matches for sentences without a tagged entity are kept)
tagged_entities = entities_sample_article[["sentence_id", "ent_id"]]
sim_scores_art1_mapped = pd.merge(sim_scores_art1, tagged_entities, how="left", on="sentence_id")
sim_scores_art1_mapped = sim_scores_art1_mapped.rename(columns={'ent_id':'true_main_ent_id_Roland'})

In [139]:
# Flag matches whose main entity (found via cosine similarity) equals the entity tagged in the DB
predicted_entity = sim_scores_art1_mapped["main_entity_id"]
tagged_entity = sim_scores_art1_mapped["true_main_ent_id_Roland"]
sim_scores_art1_mapped["correct_match"] = predicted_entity.eq(tagged_entity)

In [140]:
# Share of correct matches per 0.1-wide similarity-score bin.
# observed=False is passed explicitly so that empty bins stay in the table
# regardless of the pandas version's default for categorical groupers.
score_bins = pd.cut(sim_scores_art1_mapped.score, np.arange(0.,1.1,0.1))
grouped_df = sim_scores_art1_mapped.groupby(score_bins, observed=False)[["correct_match"]]

# Aggregate once and reuse the results
no_correct = grouped_df.sum()["correct_match"]
total = grouped_df.count()["correct_match"]

corr_match_bins = pd.DataFrame({
    "pct_correct": np.round(no_correct/total, 2),  # NaN for bins without any match (0/0)
    "no_correct": no_correct,
    "total": total,
})

In [141]:
# Display share (pct_correct), number (no_correct) and total count of correct matches per score bin
corr_match_bins

Unnamed: 0_level_0,pct_correct,no_correct,total
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(0.0, 0.1]",,0,0
"(0.1, 0.2]",,0,0
"(0.2, 0.3]",,0,0
"(0.3, 0.4]",0.01,1,95
"(0.4, 0.5]",0.04,21,593
"(0.5, 0.6]",0.05,35,710
"(0.6, 0.7]",0.12,27,232
"(0.7, 0.8]",0.18,12,68
"(0.8, 0.9]",0.67,14,21
"(0.9, 1.0]",0.42,5,12


In [161]:
# For each lower bound 0.1, 0.2, ..., 0.9: count the matches a user would have to review.
# Note: the upper bound of the filter is 0.99 (scores above 0.99 are excluded),
# although the printed message says "to 1".
for threshold in np.arange(0.1,1.,0.1):
  in_range = sim_scores_art1_mapped["score"].between(threshold, 0.99)
  no_matches = sim_scores_art1_mapped.loc[in_range].shape[0]
  print("Number of matches a user would need to look at if we choose a similarity score range from",np.round(threshold,1),"to 1:", no_matches)

Number of matches a user would need to look at if we choose a similarity score range from 0.1 to 1: 1727
Number of matches a user would need to look at if we choose a similarity score range from 0.2 to 1: 1727
Number of matches a user would need to look at if we choose a similarity score range from 0.3 to 1: 1727
Number of matches a user would need to look at if we choose a similarity score range from 0.4 to 1: 1632
Number of matches a user would need to look at if we choose a similarity score range from 0.5 to 1: 1039
Number of matches a user would need to look at if we choose a similarity score range from 0.6 to 1: 329
Number of matches a user would need to look at if we choose a similarity score range from 0.7 to 1: 97
Number of matches a user would need to look at if we choose a similarity score range from 0.8 to 1: 29
Number of matches a user would need to look at if we choose a similarity score range from 0.9 to 1: 8


In [155]:
# Let's look at the lowest-scoring matches with a similarity score between 0.6 and 0.99 (both inclusive)
# to assess whether some synonyms/entities are mentioned in these sentences
high_similarity = sim_scores_art1_mapped["score"].between(0.6, 0.99)
sim_scores_art1_mapped.loc[high_similarity].tail(20)

Unnamed: 0,score,sentence_id,entity_id,sentence_txt,main_entity_id,true_main_ent_id_Roland,correct_match
313,0.603391,1_11512_11515,models,-Model 2 \n,model,,False
314,0.603149,1_1224_1247,business professional communities,Communities are established and managed by individuals or groups and consist of product users of one or more companies in different industries.,group participant,,False
315,0.603086,1_10068_10075,electronic marketplaces,The online user community of eBay Germany,electronic market,,False
316,0.603063,1_6285_6302,causality analyses,"Furthermore, our unique context features and data sources help us mitigate some reverse causality bias.",causality analysis,,False
317,0.602653,1_2898_2918,contribute knowledge,Such norms of reciprocity may facilitate knowledge contribution (as a form of social exchange) between community members.,knowledge sharing,,False
318,0.60252,1_1433_1455,global software developments,"We then investigate the hypotheses using data from the OUC of BMC, a global leader in producing innovative software solutions.",globalization of IS,research hypothesis,False
319,0.602502,1_9917_9918,construct and statistical conclusion validity,Conclusion,two validities,,False
320,0.602356,1_7848_7857,correlation coefficient <,Correlations less than 0.01 are rounded to 0.01.,Pearson correlation coefficient,,False
321,0.602176,1_1292_1333,online communities,Online communities used by the host firms as marketing instruments to know the thoughts of their product users and influence the product users' purchasing behavior Product users are brought together by common interest and inner admiration for a brand.,online community,online community,True
322,0.601971,1_7848_7857,significant positive correlations,Correlations less than 0.01 are rounded to 0.01.,positive correlation,,False


☝ At least for this sample article, it seems that some sentences might mention an existing entity but have not been labeled by Roland's script. This has to be investigated further, also taking into consideration fine-tuned models, other articles, ...