# KeyBERT
This notebook contains experiments using KeyBERT model and its various setups. It was run on Google Colab. List of keywords were generated in cooperation with ChatGPT and were checked by the Actaware Inc. employees.

In [3]:
import numpy as np
import pandas as pd
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.metrics import balanced_accuracy_score
import time

  from tqdm.autonotebook import tqdm, trange


In [None]:
path_dir=''

In [4]:
#prompt: "provide 100 key words in the area of incidents which are connected to topic 'human & employee rights, single words" for other topics, prompt was similar
list_keywords_human_employee_rights = list(set(["child labor", "forced labor", "human trafficking", "fair trade", "protecting employee rights", "employees fairly compensated for work",
                                       "discrimination", "harassment", "retaliation", "whistleblowing", "termination","bullying", "equality", "overtime", "safety", "forced",
                                       "child", "wage", "employment", "disability", "gender", "racial", "sexual", "age", "religious", "pregnancy", "unpaid", "theft", "unfair",
                                       "misclassification", "health", "hostile", "freedom", "unionize", "bargaining", "discharge", "privacy", "compensation", "leave",
                                       "FMLA", "OSHA", "FLSA", "ADA", "EEO", "NLRA", "practices", "protections", "violence", "trafficking", "benefits", "security", "unemployment", "redundancy", "grievances", "evaluations"]))

list_keywords_diversity_equity_inclusion=list(set(["gender equity", "racial diversity", "racial equity", "racial inclusion", "ethnic diversity", "ethnic equity", "ethnic inclusion",
                                      "black lives matter", "lgbt", "lgbtq+", "blue lives matter", "diversity", "equity", "inclusion", "belonging", "culture", "bias",
                                      "allyship", "intersectionality", "privilege", "representation", "affirmative", "opportunity", "anti-discrimination", "leadership",
                                      "training", "groups", "ERG","gender", "racial", "accessibility", "disability", "sensitivity", "pay", "workforce", "hiring", "practices", "identity",
                                      "orientation", "culture", "metrics", "audits", "language", "management","age", "ethnic", "policies", "programs",
                                      "workplaces", "strategies", "social", "justice", "unconscious", "implicit", "microaggressions", "competence", "discrimination"]))

list_keywords_environment = list(set([
                                      "adaptation", "afforestation", "agriculture", "air", "aquatic", "biodiversity", "biofuel", "carbon",
                                      "carbon footprint", "climate", "composting", "conservation", "contamination", "deforestation", "degradation",
                                      "desertification", "ecosystem", "emissions", "energy", "erosion", "farming", "forestry", "fossil", "geothermal",
                                      "greenhouse", "habitat", "hazardous", "hydropower", "logging", "marine", "mining", "mitigation",
                                      "natural resource depletion", "nuclear", "nuclear energy", "ocean", "organic", "overfishing", "pollution",
                                      "recycling", "renewables", "resilience", "soil", "soil pollution", "solar", "sustainability", "toxins",
                                      "urbanization", "waste", "water", "water conservation", "water pollution", "wetlands", "wildlife", "wind"]))


list_keywords_animal_care = list(set(["animal testing", "ritual animal slaughter", "animal blood sport", "trophy hunting", "hunting", "fishing", "population control", "welfare", "rescue", "shelter", "adoption",
                                      "veterinary",  "spaying", "neutering", "grooming", "nutrition", "training", "behavior", "protection",
                                      "rights",  "sanctuary",   "fostering",   "breeding",   "rehabilitation", "cruelty", "compassion", "vaccination", "microchipping", "rehoming",
                                      "endangered", "poaching", "conservation", "habitat", "wildlife", "stray", "companionship", "assistance", "puppy", "kitten", "pet", "livestock",
                                      "zoo", "exotic", "circus", "testing", "experimentation", "humane", "enrichment", "hydration", "sheltering", "bonding", "socialization", "equine" "reptile",
                                      "amphibian", "bird"]))

list_keywords_corporate_transparency = list(set(["transparency in management", "transparency in operations", "clarity in management", "clarity in operations", "local tax",
                                      "state tax", "federal tax", "corporate political involvement", "lobbying", "pay gap", "local business", "global corporates",
                                      "small business", "disclosure", "reporting", "governance", "ethics", "accountability", "audit",
                                      "compliance", "integrity", "transparency", "stakeholders", "csr", "sustainability", "regulation", "risk", "oversight", "board", "directors", "strategy", "performance", "metrics", "benchmarking", "disclosure",
                                      "standards", "transparency", "trust", "financials", "annual", "report", "stakeholder", "engagement", "ethical", "integrity", "compliance",
                                      "governance", "esg", "ethics", "regulatory", "disclosure", "non-financial", "transparency", "accountability", "reporting", "disclosure", "oversight", "evaluation",
                                      "risk", "management"]))

list_keywords_business_involvement = list(set(["profit", "firearms", "abortion", "contraceptives", "stem cells", "cannabis", "tobacco", "alcohol", "adult enterntainment",
                                      "hiring undocumented workers", "prisons and immigration detention centers", "highly addictive medication",
                                      "doing business in Russia", "Israel-Hamas conflict", "stakeholder", "csr", "philanthropy", "sustainability", "outreach", "impact", "growth", "development",
                                      "leadership", "community", "support", "partnership", "sponsorship", "innovation", "investment", "engagement", "influence", "networking", "collaboration", "involvement",
                                      "contribution", "advocacy", "volunteering"]))

list_keywords_political_and_religious_views = list(set(["socially and fiscally liberal", "socially and fiscally conservative", "fiscally liberal", "socially conservative",
                                      "fiscally conservative", "socially liberal", "moderate", "atheis", "agnostic", "protestant", "catholic", "orthodox",
                                      "muslim", "jewish", "mormon", "busshist", "hindu", "jain", "sikh", "african traditional", "chinese traditional",
                                      "shintoist", "spiritist", "bahai", "conservative", "liberal", "progressive", "libertarian",
                                      "socialist", "democrat", "republican", "independent", "green", "religious",
                                      "christian", "muslim", "jewish", "hindu",  "buddhist", "sikh", "secular", "faith", "belief", "ideology", "doctrine",
                                      "evangelical", "fundamentalist", "charismatic", "reform"]))

# Find best score

In [None]:
def find_best_score_for_given_model(model_name, path_to_dataset, iter,
                                    list_of_keywords_all_categories,
                                    dataset_name,
                                    list_of_names=['political_and_religious_views', 'animal_care', 'business_involvement', 'corporate_transparency', 'environment', 'human_employee_rights', 'diversity_equity_inclusion'],
                                    path_to_save_results=path_dir,
                                    path_to_gt='articles_categories_my_gt_2.csv',
                                    list_thresholds=np.arange(1,100)/100):

  # Load KeyBERT model
  kw_model = KeyBERT(model=model_name)

  # Load the data
  with open(path_to_dataset, "r", encoding="utf8") as f:
    list_of_chosen = f.readlines()

  # Extract keywords from each document separately
  keywords_chosen=["" for _ in range(len(list_of_chosen))]
  for i in range(len(list_of_chosen)):
    keywords_chosen[i] = kw_model.extract_keywords(list_of_chosen[i], stop_words='english', top_n=20, keyphrase_ngram_range=(1,2), use_mmr=True, diversity=0.7)

  extracted_keywords_chosen=["" for _ in keywords_chosen]
  for i, keyword in enumerate(keywords_chosen):
    extracted_keywords_chosen[i]=[item[0] for item in keyword]

  # Load a pretrained Sentence Transformer model
  model = SentenceTransformer(model_name)

  # Calculate embeddings for extracted keywords and for list of keywords for each category
  embeddings_for_categories=["" for _ in range(len(list_of_keywords_all_categories))]
  embeddings_keywords = model.encode(extracted_keywords_chosen)
  for i in range(len(list_of_keywords_all_categories)):
    embeddings_for_categories[i]=model.encode(list_of_keywords_all_categories[i])

  # Calculate the embedding similarities (cosine similarities, according to documentation)
  similarities_for_categories=["" for _ in range(len(list_of_keywords_all_categories))]
  for i in range(len(list_of_keywords_all_categories)):
    similarities_for_categories[i]=model.similarity(embeddings_keywords, embeddings_for_categories[i])

  df_scores=pd.DataFrame((np.array(similarities_for_categories[i].T.sum(axis=0)/similarities_for_categories[i].shape[1]) for i in range(len(similarities_for_categories))))

  max_val=df_scores.max(axis=1).max()
  min_val=df_scores.min(axis=1).min()
  diff=max_val-min_val
  df_scores_normalized=(df_scores-min_val)/diff

  df_scores_with_names=df_scores_normalized.T
  df_scores_with_names.columns=list_of_names
  df_scores_with_names

  # save matrix with similarities score of all data to all given categories
  df_scores_with_names.to_csv(f"{path_to_save_results}keybert_similarity_scores_{model_name}_iter{iter}_{dataset_name}.csv")

  df_gt=pd.read_csv(f"{path_to_save_results}{path_to_gt}", sep=';')

  #find best score for given model and keywords, changing only threshold level, from which the record is classified as 'other'
  best_balanced=0
  best_threshold=5
  for threshold in list_thresholds:
    #get name of the column for which the value is the highest
    df_scores_with_names=pd.read_csv(f"{path_to_save_results}keybert_similarity_scores_{model_name}_iter{iter}_{dataset_name}.csv", index_col=0)
    final_predictions=[""for _ in range(df_scores_with_names.shape[0])]
    for i in range(df_scores_with_names.shape[0]):
      #if two highest scores differ by threshold, final_prediction[i]='other'
      scores=sorted(df_scores_with_names.iloc[i], reverse=True)[:2]
      if scores[0]-scores[1]<=threshold:
        final_predictions[i]='other'
      else:
        final_predictions[i]=df_scores_with_names.iloc[i].idxmax()
    df_scores_with_names['final_predictions']=final_predictions
    score=balanced_accuracy_score(df_gt['Category'], df_scores_with_names['final_predictions'])
    if score>best_balanced:
      best_balanced=score
      best_threshold=threshold
      print(threshold)

  #save best results
  df_scores_with_names=pd.read_csv(f"{path_to_save_results}keybert_similarity_scores_{model_name}_iter{iter}_{dataset_name}.csv", index_col=0)
  final_predictions=[""for _ in range(df_scores_with_names.shape[0])]
  for i in range(df_scores_with_names.shape[0]):
    scores=sorted(df_scores_with_names.iloc[i], reverse=True)[:2]
    if scores[0]-scores[1]<=best_threshold:
      final_predictions[i]='other'
    else:
      final_predictions[i]=df_scores_with_names.iloc[i].idxmax()
  df_scores_with_names['final_predictions']=final_predictions
  df_scores_with_names.to_csv(f"{path_to_save_results}keybert_similarity_scores_{model_name}_{best_threshold}_final_predictions_iter_{iter}_{dataset_name}.csv")

  print(f"Result: {model_name}, {best_balanced}, {best_threshold}")
  return model_name, best_balanced, best_threshold, df_scores_with_names, similarities_for_categories


In [None]:
list_of_keywords_all_categories=[list_keywords_political_and_religious_views,
                                    list_keywords_business_involvement,
                                    list_keywords_corporate_transparency,
                                    list_keywords_animal_care,
                                    list_keywords_environment,
                                    list_keywords_human_employee_rights,
                                    list_keywords_diversity_equity_inclusion]

In [None]:
model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset=path_dir+'chosen_articles.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    model_name, best_balanced, best_threshold, df_scores_with_names, similarities_for_categories=find_best_score_for_given_model(model_name, path_to_dataset, iter,
                                        list_of_keywords_all_categories, dataset_name="chosen",
                                        list_of_names=['political_and_religious_views', 'animal_care', 'business_involvement', 'corporate_transparency', 'environment', 'human_employee_rights', 'diversity_equity_inclusion'],
                                        path_to_save_results=path_dir,
                                        path_to_gt='articles_categories_my_gt_2.csv',
                                        )
    print(f"Time of execution: {time.time()-start_time}")

Model: all-distilroberta-v1, iteration: 0.
0.01
0.02
Result: all-distilroberta-v1, 0.29703733766233764, 0.02
Time of execution: 25.816896438598633
Model: all-distilroberta-v1, iteration: 1.
0.01
0.02
Result: all-distilroberta-v1, 0.29703733766233764, 0.02
Time of execution: 25.54226803779602
Model: all-distilroberta-v1, iteration: 2.
0.01
0.02
Result: all-distilroberta-v1, 0.29703733766233764, 0.02
Time of execution: 27.179518222808838
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.36028589466089467, 0.03
Time of execution: 50.03827691078186
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.36028589466089467, 0.03
Time of execution: 40.752455711364746
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.36028589466089467, 0.03
Time of execution: 35.04085063934326
Model: all-mpnet-base-v2, iteration: 0.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.01
Result: all-mpnet-base-v2, 0.40738185425685425, 0.01
Time of execution: 42.56565761566162
Model: all-mpnet-base-v2, iteration: 1.
0.01
Result: all-mpnet-base-v2, 0.40738185425685425, 0.01
Time of execution: 35.24252772331238
Model: all-mpnet-base-v2, iteration: 2.
0.01
Result: all-mpnet-base-v2, 0.40738185425685425, 0.01
Time of execution: 34.62218761444092
Model: all-MiniLM-L6-v2, iteration: 0.
0.01
Result: all-MiniLM-L6-v2, 0.34854347041847045, 0.01
Time of execution: 22.676592350006104
Model: all-MiniLM-L6-v2, iteration: 1.
0.01
Result: all-MiniLM-L6-v2, 0.34854347041847045, 0.01
Time of execution: 21.94022798538208
Model: all-MiniLM-L6-v2, iteration: 2.
0.01
Result: all-MiniLM-L6-v2, 0.34854347041847045, 0.01
Time of execution: 22.93700933456421


In [None]:
# chosen cleaned by 4o

model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset=path_dir+'chosen_articles_cleaned_4o.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    model_name, best_balanced, best_threshold, df_scores_with_names, similarities_for_categories=find_best_score_for_given_model(model_name, path_to_dataset, iter,
                                        list_of_keywords_all_categories, dataset_name='chosen_articles_cleaned_4o',
                                        list_of_names=['political_and_religious_views', 'animal_care', 'business_involvement', 'corporate_transparency', 'environment', 'human_employee_rights', 'diversity_equity_inclusion'],
                                        path_to_save_results=path_dir,
                                        path_to_gt='articles_categories_my_gt_2.csv',
                                        )
    print(f"Time of execution: {time.time()-start_time}")

Model: all-distilroberta-v1, iteration: 0.
0.01
0.02
Result: all-distilroberta-v1, 0.3072646103896104, 0.02
Time of execution: 29.119035482406616
Model: all-distilroberta-v1, iteration: 1.
0.01
0.02
Result: all-distilroberta-v1, 0.3072646103896104, 0.02
Time of execution: 26.534175634384155
Model: all-distilroberta-v1, iteration: 2.
0.01
0.02
Result: all-distilroberta-v1, 0.3072646103896104, 0.02
Time of execution: 23.403409719467163
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.3831168831168831, 0.03
Time of execution: 34.22502136230469
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.3831168831168831, 0.03
Time of execution: 30.664873123168945
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.3831168831168831, 0.03
Time of execution: 32.05384612083435
Model: all-mpnet-base-v2, iteration: 0.
0.01
0.04
Result: all-mpnet-base-v2

In [None]:
# chosen cleaned by me

model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset=path_dir+'chosen_articles_cleaned_by_me.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    model_name, best_balanced, best_threshold, df_scores_with_names, similarities_for_categories=find_best_score_for_given_model(model_name, path_to_dataset, iter,
                                        list_of_keywords_all_categories, dataset_name='chosen_articles_cleaned_by_me',
                                        list_of_names=['political_and_religious_views', 'animal_care', 'business_involvement', 'corporate_transparency', 'environment', 'human_employee_rights', 'diversity_equity_inclusion'],
                                        path_to_save_results=path_dir,
                                        path_to_gt='articles_categories_my_gt_2.csv',
                                        )
    print(f"Time of execution: {time.time()-start_time}")

Model: all-distilroberta-v1, iteration: 0.
0.01
0.02
Result: all-distilroberta-v1, 0.27967622655122654, 0.02
Time of execution: 25.43524980545044
Model: all-distilroberta-v1, iteration: 1.
0.01
0.02
Result: all-distilroberta-v1, 0.27967622655122654, 0.02
Time of execution: 24.030922651290894
Model: all-distilroberta-v1, iteration: 2.
0.01
0.02
Result: all-distilroberta-v1, 0.27967622655122654, 0.02
Time of execution: 24.45590043067932
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.35458152958152955, 0.03
Time of execution: 34.646382093429565
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.35458152958152955, 0.03
Time of execution: 34.032742500305176
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.35458152958152955, 0.03
Time of execution: 34.866448163986206
Model: all-mpnet-base-v2, iteration: 0.
0.01
Result: all-mpnet-base-v

In [None]:
# chosen cleaned regex

model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset=path_dir+'chosen_articles_cleaned_regex.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    model_name, best_balanced, best_threshold, df_scores_with_names, similarities_for_categories=find_best_score_for_given_model(model_name, path_to_dataset, iter,
                                        list_of_keywords_all_categories, dataset_name='chosen_articles_cleaned_regex',
                                        list_of_names=['political_and_religious_views', 'animal_care', 'business_involvement', 'corporate_transparency', 'environment', 'human_employee_rights', 'diversity_equity_inclusion'],
                                        path_to_save_results=path_dir,
                                        path_to_gt='articles_categories_my_gt_2.csv',
                                        )
    print(f"Time of execution: {time.time()-start_time}")

Model: all-distilroberta-v1, iteration: 0.
0.01
0.02
Result: all-distilroberta-v1, 0.29356511544011543, 0.02
Time of execution: 26.451321840286255
Model: all-distilroberta-v1, iteration: 1.
0.01
0.02
Result: all-distilroberta-v1, 0.29356511544011543, 0.02
Time of execution: 26.113743782043457
Model: all-distilroberta-v1, iteration: 2.
0.01
0.02
Result: all-distilroberta-v1, 0.29356511544011543, 0.02
Time of execution: 25.47131371498108
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.35681367243867246, 0.03
Time of execution: 36.26931023597717
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.35681367243867246, 0.03
Time of execution: 34.79012393951416
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.35681367243867246, 0.03
Time of execution: 34.92911219596863
Model: all-mpnet-base-v2, iteration: 0.
0.01
Result: all-mpnet-base-v2,

In [None]:
df_scores_with_names

Unnamed: 0,political_and_religious_views,animal_care,business_involvement,corporate_transparency,environment,human_employee_rights,diversity_equity_inclusion,final_predictions
0,0.077390,0.078388,0.099580,0.065689,0.055581,0.172107,0.122921,human_employee_rights
1,0.026886,0.079234,0.092243,0.042882,0.056896,0.083447,0.078912,business_involvement
2,0.086969,0.080755,0.070846,0.090336,0.043996,0.134752,0.102169,human_employee_rights
3,0.172227,0.096573,0.098061,0.107360,0.128381,0.122578,0.146630,political_and_religious_views
4,0.091778,0.188185,0.205480,0.115539,0.135787,0.171212,0.152866,business_involvement
...,...,...,...,...,...,...,...,...
95,0.079628,0.073181,0.075126,0.071921,0.062976,0.054749,0.057298,political_and_religious_views
96,-0.006955,0.055762,0.140844,-0.013600,0.013151,0.042177,0.051494,business_involvement
97,0.108668,0.119787,0.113463,0.095964,0.172967,0.107014,0.095658,environment
98,0.077760,0.091609,0.099716,0.082421,0.048733,0.111477,0.065519,human_employee_rights


# Check keywords from KeyBERT

In [12]:
def find_keywords_for_given_model(model_name, path_to_dataset, iter,
                                    dataset_name,
                                    path_to_save_results=path_dir,
                                  ):

  # Load KeyBERT model
  kw_model = KeyBERT(model=model_name)

  # Load the data
  with open(path_to_dataset, "r", encoding="utf8") as f:
    list_of_chosen = f.readlines()

  # Extract keywords from each document separately
  keywords_chosen=["" for _ in range(len(list_of_chosen))]
  for i in range(len(list_of_chosen)):
    keywords_chosen[i] = kw_model.extract_keywords(list_of_chosen[i], stop_words='english', top_n=20, keyphrase_ngram_range=(1,2), use_mmr=True, diversity=0.7)

  extracted_keywords_chosen=["" for _ in keywords_chosen]
  for i, keyword in enumerate(keywords_chosen):
    extracted_keywords_chosen[i]=[item[0] for item in keyword]

  with open(f"{path_to_save_results}_keybert_keywords_extracted_{model_name}_{iter}_{dataset_name}.txt", "w") as f:
    for keywords in extracted_keywords_chosen:
      f.write(str(keywords) + "\n")

  return extracted_keywords_chosen


## Save created embeddings

In [13]:
model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset=path_dir+'chosen_articles.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    extracted_keywords_chosen=find_keywords_for_given_model(model_name, path_to_dataset, iter, dataset_name='chosen_articles',
                                        path_to_save_results=path_dir,
                                        )
    print(f"Time of execution: {time.time()-start_time}")

Model: all-distilroberta-v1, iteration: 0.
Time of execution: 20.789751291275024
Model: all-distilroberta-v1, iteration: 1.
Time of execution: 22.551756381988525
Model: all-distilroberta-v1, iteration: 2.
Time of execution: 20.967788219451904
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Time of execution: 46.43827700614929
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
Time of execution: 31.723196983337402
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
Time of execution: 32.044692516326904
Model: all-mpnet-base-v2, iteration: 0.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Time of execution: 36.507548809051514
Model: all-mpnet-base-v2, iteration: 1.
Time of execution: 30.067622900009155
Model: all-mpnet-base-v2, iteration: 2.
Time of execution: 30.887082815170288
Model: all-MiniLM-L6-v2, iteration: 0.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Time of execution: 21.982617139816284
Model: all-MiniLM-L6-v2, iteration: 1.
Time of execution: 18.86784839630127
Model: all-MiniLM-L6-v2, iteration: 2.
Time of execution: 18.54068922996521


## Find scores for keywords extraction

In [69]:
def find_score_for_given_model_incidents(model_name, path_to_dataset, iter, dataset_name):

  # Load KeyBERT model
  kw_model = KeyBERT(model=model_name)

  # Load the data
  with open(path_to_dataset, "r", encoding="utf8") as f:
    list_of_chosen = f.readlines()

  # Extract keywords from each document separately
  keywords_chosen=["" for _ in range(len(list_of_chosen))]
  for i in range(len(list_of_chosen)):
    keywords_chosen[i] = kw_model.extract_keywords(list_of_chosen[i], stop_words='english', top_n=20, keyphrase_ngram_range=(1,2), use_mmr=True, diversity=0.7)

  extracted_keywords_chosen=["" for _ in keywords_chosen]
  for i, keyword in enumerate(keywords_chosen):
    extracted_keywords_chosen[i]=[item[0] for item in keyword]

  # Load a pretrained Sentence Transformer model
  model = SentenceTransformer(model_name)

  #calculate embeddings for each article
  embeddings_keywords = model.encode(extracted_keywords_chosen)
  embeddings_articles=["" for _ in range(len(list_of_chosen))]
  for i in range(len(list_of_chosen)):
    embeddings_articles[i]=model.encode(list_of_chosen[i])

  # Calculate the embedding similarities (cosine similarities, according to documentation)
  similarities_for_articles=["" for _ in range(len(embeddings_articles))]
  for i in range(len(embeddings_articles)):
    similarities_for_articles[i]=model.similarity(embeddings_keywords[i], embeddings_articles[i])
  return sum(similarities_for_articles)/len(similarities_for_articles)

In [64]:
#not preprocessed
model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset=path_dir+'chosen_articles.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    similarity_score=find_score_for_given_model_incidents(model_name, path_to_dataset, iter, dataset_name='chosen_test')
    print(f"Time of execution: {time.time()-start_time}")
    print(similarity_score)

Model: all-distilroberta-v1, iteration: 0.
Time of execution: 25.896260261535645
tensor([[0.6261]])
Model: all-distilroberta-v1, iteration: 1.
Time of execution: 27.04471707344055
tensor([[0.6261]])
Model: all-distilroberta-v1, iteration: 2.
Time of execution: 24.504000425338745
tensor([[0.6261]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.
Time of execution: 36.61450147628784
tensor([[29.5468]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
Time of execution: 36.18279480934143
tensor([[29.5468]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
Time of execution: 36.67815279960632
tensor([[29.5468]])
Model: all-mpnet-base-v2, iteration: 0.
Time of execution: 35.1727569103241
tensor([[0.6846]])
Model: all-mpnet-base-v2, iteration: 1.
Time of execution: 35.20647406578064
tensor([[0.6846]])
Model: all-mpnet-base-v2, iteration: 2.
Time of execution: 35.27614450454712
tensor([[0.6846]])
Model: all-MiniLM-L6-v2, iteration: 0.
Time of execution: 23.948878526687622
tensor([[0.6723]])


In [65]:
# cleaned by 4o
model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset=path_dir+'chosen_articles_cleaned_4o.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    similarity_score=find_score_for_given_model_incidents(model_name, path_to_dataset, iter, dataset_name='chosen_test')
    print(f"Time of execution: {time.time()-start_time}")
    print(similarity_score)

Model: all-distilroberta-v1, iteration: 0.
Time of execution: 22.657753705978394
tensor([[0.6199]])
Model: all-distilroberta-v1, iteration: 1.
Time of execution: 23.7476646900177
tensor([[0.6199]])
Model: all-distilroberta-v1, iteration: 2.
Time of execution: 22.77377152442932
tensor([[0.6199]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.
Time of execution: 32.54566240310669
tensor([[29.7002]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
Time of execution: 33.47027111053467
tensor([[29.7002]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
Time of execution: 33.023459672927856
tensor([[29.7002]])
Model: all-mpnet-base-v2, iteration: 0.
Time of execution: 32.19431805610657
tensor([[0.6863]])
Model: all-mpnet-base-v2, iteration: 1.
Time of execution: 33.48556399345398
tensor([[0.6863]])
Model: all-mpnet-base-v2, iteration: 2.
Time of execution: 31.997108221054077
tensor([[0.6863]])
Model: all-MiniLM-L6-v2, iteration: 0.
Time of execution: 19.258843898773193
tensor([[0.6809]])

In [66]:
# cleaned by human
model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset=path_dir+'chosen_articles_cleaned_by_me.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    similarity_score=find_score_for_given_model_incidents(model_name, path_to_dataset, iter, dataset_name='chosen_test')
    print(f"Time of execution: {time.time()-start_time}")
    print(similarity_score)

Model: all-distilroberta-v1, iteration: 0.
Time of execution: 24.94123363494873
tensor([[0.6263]])
Model: all-distilroberta-v1, iteration: 1.
Time of execution: 23.71804642677307
tensor([[0.6263]])
Model: all-distilroberta-v1, iteration: 2.
Time of execution: 24.35492515563965
tensor([[0.6263]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.
Time of execution: 35.934459924697876
tensor([[29.5001]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
Time of execution: 35.23258423805237
tensor([[29.5001]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
Time of execution: 37.415947914123535
tensor([[29.5001]])
Model: all-mpnet-base-v2, iteration: 0.
Time of execution: 35.24811315536499
tensor([[0.6796]])
Model: all-mpnet-base-v2, iteration: 1.
Time of execution: 34.753650188446045
tensor([[0.6796]])
Model: all-mpnet-base-v2, iteration: 2.
Time of execution: 34.5194411277771
tensor([[0.6796]])
Model: all-MiniLM-L6-v2, iteration: 0.
Time of execution: 20.835763216018677
tensor([[0.6736]])

In [67]:
# cleaned using regex
model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset=path_dir+'chosen_articles_cleaned_regex.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    similarity_score=find_score_for_given_model_incidents(model_name, path_to_dataset, iter, dataset_name='chosen_test')
    print(f"Time of execution: {time.time()-start_time}")
    print(similarity_score)

Model: all-distilroberta-v1, iteration: 0.
Time of execution: 25.59615206718445
tensor([[0.6277]])
Model: all-distilroberta-v1, iteration: 1.
Time of execution: 25.100911378860474
tensor([[0.6277]])
Model: all-distilroberta-v1, iteration: 2.
Time of execution: 24.0419762134552
tensor([[0.6277]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.
Time of execution: 37.6282103061676
tensor([[29.5711]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
Time of execution: 35.079519748687744
tensor([[29.5711]])
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
Time of execution: 36.23842477798462
tensor([[29.5711]])
Model: all-mpnet-base-v2, iteration: 0.
Time of execution: 34.56994605064392
tensor([[0.6850]])
Model: all-mpnet-base-v2, iteration: 1.
Time of execution: 35.258224964141846
tensor([[0.6850]])
Model: all-mpnet-base-v2, iteration: 2.
Time of execution: 34.55417513847351
tensor([[0.6850]])
Model: all-MiniLM-L6-v2, iteration: 0.
Time of execution: 21.264365434646606
tensor([[0.6743]])
