In [1]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.

In [2]:
import numpy as np
import pandas as pd
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import balanced_accuracy_score
import time

  from tqdm.autonotebook import tqdm, trange


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
#prompt: "provide 100 key words in the area of incidents which are connected to topic 'human & employee rights, single words"
list_keywords_human_employee_rights = list(set(["child labor", "forced labor", "human trafficking", "fair trade", "protecting employee rights", "employees fairly compensated for work",
                                       "discrimination", "harassment", "retaliation", "whistleblowing", "termination","bullying", "equality", "overtime", "safety", "forced",
                                       "child", "wage", "employment", "disability", "gender", "racial", "sexual", "age", "religious", "pregnancy", "unpaid", "theft", "unfair",
                                       "misclassification", "health", "hostile", "freedom", "unionize", "bargaining", "discharge", "privacy", "compensation", "leave",
                                       "FMLA", #Family and Medical Leave Act
                                       "OSHA", #Occupation Safety and Health Administration
                                       "FLSA", #Fair Labor Standards Act
                                       "ADA", #American with Disabilities Act
                                       "EEO", #Equal Employment Opportunity
                                       "NLRA", #National Labor Relation Act
                                       "practices", "protections", "violence", "trafficking", "benefits", "security", "unemployment", "redundancy", "grievances", "evaluations"]))

list_keywords_diversity_equity_inclusion=list(set(["gender equity", "racial diversity", "racial equity", "racial inclusion", "ethnic diversity", "ethnic equity", "ethnic inclusion",
                                      "black lives matter", "lgbt", "lgbtq+", "blue lives matter", "diversity", "equity", "inclusion", "belonging", "culture", "bias",
                                      "allyship", "intersectionality", "privilege", "representation", "affirmative", "opportunity", "anti-discrimination", "leadership",
                                      "training", "groups", "ERG", #Employee Resource Group
                                      "gender", "racial", "accessibility", "disability", "sensitivity", "pay", "workforce", "hiring", "practices", "identity",
                                      "orientation", "culture", "metrics", "audits", "language", "management","age", "ethnic", "policies", "programs",
                                      "workplaces", "strategies", "social", "justice", "unconscious", "implicit", "microaggressions", "competence", "discrimination"]))

list_keywords_environment = list(set([
                                      "adaptation", "afforestation", "agriculture", "air", "aquatic", "biodiversity", "biofuel", "carbon",
                                      "carbon footprint", "climate", "composting", "conservation", "contamination", "deforestation", "degradation",
                                      "desertification", "ecosystem", "emissions", "energy", "erosion", "farming", "forestry", "fossil", "geothermal",
                                      "greenhouse", "habitat", "hazardous", "hydropower", "logging", "marine", "mining", "mitigation",
                                      "natural resource depletion", "nuclear", "nuclear energy", "ocean", "organic", "overfishing", "pollution",
                                      "recycling", "renewables", "resilience", "soil", "soil pollution", "solar", "sustainability", "toxins",
                                      "urbanization", "waste", "water", "water conservation", "water pollution", "wetlands", "wildlife", "wind"]))


list_keywords_animal_care = list(set(["animal testing", "ritual animal slaughter", "animal blood sport", "trophy hunting", "hunting", "fishing", "population control", "welfare", "rescue", "shelter", "adoption",
                                      "veterinary",  "spaying", "neutering", "grooming", "nutrition", "training", "behavior", "protection",
                                      "rights",  "sanctuary",   "fostering",   "breeding",   "rehabilitation", "cruelty", "compassion", "vaccination", "microchipping", "rehoming",
                                      "endangered", "poaching", "conservation", "habitat", "wildlife", "stray", "companionship", "assistance", "puppy", "kitten", "pet", "livestock",
                                      "zoo", "exotic", "circus", "testing", "experimentation", "humane", "enrichment", "hydration", "sheltering", "bonding", "socialization", "equine" "reptile",
                                      "amphibian", "bird"]))

list_keywords_corporate_transparency = list(set(["transparency in management", "transparency in operations", "clarity in management", "clarity in operations", "local tax",
                                      "state tax", "federal tax", "corporate political involvement", "lobbying", "pay gap", "local business", "global corporates",
                                      "small business", "disclosure", "reporting", "governance", "ethics", "accountability", "audit",
                                      "compliance", "integrity", "transparency", "stakeholders", "csr", #Corporate Social Responsibility
                                      "sustainability", "regulation", "risk", "oversight", "board", "directors", "strategy", "performance", "metrics", "benchmarking", "disclosure",
                                      "standards", "transparency", "trust", "financials", "annual", "report", "stakeholder", "engagement", "ethical", "integrity", "compliance",
                                      "governance", "esg", #Environmental, Social responsibility, Corporate governance
                                      "ethics", "regulatory", "disclosure", "non-financial", "transparency", "accountability", "reporting", "disclosure", "oversight", "evaluation",
                                      "risk", "management"]))

list_keywords_business_involvement = list(set(["profit", "firearms", "abortion", "contraceptives", "stem cells", "cannabis", "tobacco", "alcohol", "adult enterntainment",
                                      "hiring undocumented workers", "prisons and immigration detention centers", "highly addictive medication",
                                      "doing business in Russia", "Israel-Hamas conflict", "stakeholder", "csr", "philanthropy", "sustainability", "outreach", "impact", "growth", "development",
                                      "leadership", "community", "support", "partnership", "sponsorship", "innovation", "investment", "engagement", "influence", "networking", "collaboration", "involvement",
                                      "contribution", "advocacy", "volunteering"]))

list_keywords_political_and_religious_views = list(set(["socially and fiscally liberal", "socially and fiscally conservative", "fiscally liberal", "socially conservative",
                                      "fiscally conservative", "socially liberal", "moderate", "atheis", "agnostic", "protestant", "catholic", "orthodox",
                                      "muslim", "jewish", "mormon", "busshist", "hindu", "jain", "sikh", "african traditional", "chinese traditional",
                                      "shintoist", "spiritist", "bahai", "conservative", "liberal", "progressive", "libertarian",
                                      "socialist", "democrat", "republican", "independent", "green", "religious",
                                      "christian", "muslim", "jewish", "hindu",  "buddhist", "sikh", "secular", "faith", "belief", "ideology", "doctrine",
                                      "evangelical", "fundamentalist", "charismatic", "reform"]))

In [6]:
def find_best_score_for_given_model(model_name, path_to_dataset, iter,
                                    list_of_keywords_all_categories,
                                    dataset_name,
                                    list_of_names=['political_and_religious_views', 'animal_care', 'business_involvement', 'corporate_transparency', 'environment', 'human_employee_rights', 'diversity_equity_inclusion'],
                                    path_to_save_results='/source_repository/',
                                    path_to_gt='articles_categories_my_gt_2.csv',
                                    list_thresholds=np.arange(1,100)/100):

  # Load KeyBERT model
  kw_model = KeyBERT(model=model_name)

  # Load the data
  with open(path_to_dataset, "r", encoding="utf8") as f:
    list_of_chosen = f.readlines()

  # Extract keywords from each document separately
  keywords_chosen=["" for _ in range(len(list_of_chosen))]
  for i in range(len(list_of_chosen)):
    keywords_chosen[i] = kw_model.extract_keywords(list_of_chosen[i], stop_words='english', top_n=20, keyphrase_ngram_range=(1,2), use_mmr=True, diversity=0.7)

  extracted_keywords_chosen=["" for _ in keywords_chosen]
  for i, keyword in enumerate(keywords_chosen):
    extracted_keywords_chosen[i]=[item[0] for item in keyword]

  # Load a pretrained Sentence Transformer model
  model = SentenceTransformer(model_name)

  # Calculate embeddings for extracted keywords and for list of keywords for each category
  embeddings_for_categories=["" for _ in range(len(list_of_keywords_all_categories))]
  embeddings_keywords = model.encode(extracted_keywords_chosen)
  for i in range(len(list_of_keywords_all_categories)):
    embeddings_for_categories[i]=model.encode(list_of_keywords_all_categories[i])

  # Calculate the embedding similarities (cosine similarities, according to documentation)
  similarities_for_categories=["" for _ in range(len(list_of_keywords_all_categories))]
  for i in range(len(list_of_keywords_all_categories)):
    similarities_for_categories[i]=model.similarity(embeddings_keywords, embeddings_for_categories[i])

  df_scores=pd.DataFrame((np.array(similarities_for_categories[i].T.sum(axis=0)/similarities_for_categories[i].shape[1]) for i in range(len(similarities_for_categories))))

  max_val=df_scores.max(axis=1).max()
  min_val=df_scores.min(axis=1).min()
  diff=max_val-min_val
  df_scores_normalized=(df_scores-min_val)/diff

  df_scores_with_names=df_scores_normalized.T
  df_scores_with_names.columns=list_of_names
  df_scores_with_names

  # save matrix with similarities score of all data to all given categories
  df_scores_with_names.to_csv(f"{path_to_save_results}keybert_similarity_scores_{model_name}_iter{iter}_{dataset_name}.csv")

  df_gt=pd.read_csv(f"{path_to_save_results}{path_to_gt}", sep=';')

  #find best score for given model and keywords, changing only threshold level, from which the record is classified as 'other'
  best_balanced=0
  best_threshold=5
  for threshold in list_thresholds:
    #get name of the column for which the value is the highest
    df_scores_with_names=pd.read_csv(f"{path_to_save_results}keybert_similarity_scores_{model_name}_iter{iter}_{dataset_name}.csv", index_col=0)
    final_predictions=[""for _ in range(df_scores_with_names.shape[0])]
    for i in range(df_scores_with_names.shape[0]):
      #if two highest scores differ by threshold, final_prediction[i]='other'
      scores=sorted(df_scores_with_names.iloc[i], reverse=True)[:2]
      if scores[0]-scores[1]<=threshold:
        final_predictions[i]='other'
      else:
        final_predictions[i]=df_scores_with_names.iloc[i].idxmax()
    df_scores_with_names['final_predictions']=final_predictions
    score=balanced_accuracy_score(df_gt['Category'], df_scores_with_names['final_predictions'])
    if score>best_balanced:
      best_balanced=score
      best_threshold=threshold
      print(threshold)

  #save best results
  df_scores_with_names=pd.read_csv(f"{path_to_save_results}keybert_similarity_scores_{model_name}_iter{iter}_{dataset_name}.csv", index_col=0)
  final_predictions=[""for _ in range(df_scores_with_names.shape[0])]
  for i in range(df_scores_with_names.shape[0]):
    scores=sorted(df_scores_with_names.iloc[i], reverse=True)[:2]
    if scores[0]-scores[1]<=best_threshold:
      final_predictions[i]='other'
    else:
      final_predictions[i]=df_scores_with_names.iloc[i].idxmax()
  df_scores_with_names['final_predictions']=final_predictions
  df_scores_with_names.to_csv(f"{path_to_save_results}keybert_similarity_scores_{model_name}_{best_threshold}_final_predictions_iter_{iter}_{dataset_name}.csv")

  print(f"Result: {model_name}, {best_balanced}, {best_threshold}")
  return model_name, best_balanced, best_threshold, df_scores_with_names, similarities_for_categories


In [7]:
list_of_keywords_all_categories=[list_keywords_political_and_religious_views,
                                    list_keywords_business_involvement,
                                    list_keywords_corporate_transparency,
                                    list_keywords_animal_care,
                                    list_keywords_environment,
                                    list_keywords_human_employee_rights,
                                    list_keywords_diversity_equity_inclusion]

In [9]:
model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset='/source_repository/chosen_articles.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    model_name, best_balanced, best_threshold, df_scores_with_names, similarities_for_categories=find_best_score_for_given_model(model_name, path_to_dataset, iter,
                                        list_of_keywords_all_categories, dataset_name='chosen_articles',
                                        list_of_names=['political_and_religious_views', 'animal_care', 'business_involvement', 'corporate_transparency', 'environment', 'human_employee_rights', 'diversity_equity_inclusion'],
                                        path_to_save_results='/source_repository/',
                                        path_to_gt='articles_categories_my_gt_2.csv',
                                        )
    print(f"Time of execution: {time.time()-start_time}")

Model: all-distilroberta-v1, iteration: 0.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.01
0.02
Result: all-distilroberta-v1, 0.3014565295815296, 0.02
Time of execution: 32.12618374824524
Model: all-distilroberta-v1, iteration: 1.
0.01
0.02
Result: all-distilroberta-v1, 0.3014565295815296, 0.02
Time of execution: 20.398741722106934
Model: all-distilroberta-v1, iteration: 2.
0.01
0.02
Result: all-distilroberta-v1, 0.3014565295815296, 0.02
Time of execution: 20.284314155578613
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.01
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.3637581168831169, 0.03
Time of execution: 34.15167212486267
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
0.01
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.3637581168831169, 0.03
Time of execution: 29.908243894577026
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
0.01
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.3637581168831169, 0.03
Time of execution: 29.4464910030365
Model: all-mpnet-base-v2, iteration: 0.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.01
Result: all-mpnet-base-v2, 0.38465458152958154, 0.01
Time of execution: 32.04374623298645
Model: all-mpnet-base-v2, iteration: 1.
0.01
Result: all-mpnet-base-v2, 0.38465458152958154, 0.01
Time of execution: 29.042776584625244
Model: all-mpnet-base-v2, iteration: 2.
0.01
Result: all-mpnet-base-v2, 0.38465458152958154, 0.01
Time of execution: 29.02233600616455
Model: all-MiniLM-L6-v2, iteration: 0.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.01
0.03
Result: all-MiniLM-L6-v2, 0.35454094516594514, 0.03
Time of execution: 19.520219326019287
Model: all-MiniLM-L6-v2, iteration: 1.
0.01
0.03
Result: all-MiniLM-L6-v2, 0.35454094516594514, 0.03
Time of execution: 17.704479455947876
Model: all-MiniLM-L6-v2, iteration: 2.
0.01
0.03
Result: all-MiniLM-L6-v2, 0.35454094516594514, 0.03
Time of execution: 17.057485103607178


In [8]:
# chosen cleaned by 4o

model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset='/source_repository/chosen_articles_cleaned_4o.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    model_name, best_balanced, best_threshold, df_scores_with_names, similarities_for_categories=find_best_score_for_given_model(model_name, path_to_dataset, iter,
                                        list_of_keywords_all_categories, dataset_name='chosen_articles_cleaned_4o',
                                        list_of_names=['political_and_religious_views', 'animal_care', 'business_involvement', 'corporate_transparency', 'environment', 'human_employee_rights', 'diversity_equity_inclusion'],
                                        path_to_save_results='/source_repository/',
                                        path_to_gt='articles_categories_my_gt_2.csv',
                                        )
    print(f"Time of execution: {time.time()-start_time}")

Model: all-distilroberta-v1, iteration: 0.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.01
Result: all-distilroberta-v1, 0.31957521645021647, 0.01
Time of execution: 41.25874853134155
Model: all-distilroberta-v1, iteration: 1.
0.01
Result: all-distilroberta-v1, 0.31957521645021647, 0.01
Time of execution: 20.093698501586914
Model: all-distilroberta-v1, iteration: 2.
0.01
Result: all-distilroberta-v1, 0.31957521645021647, 0.01
Time of execution: 20.770009517669678
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.01
0.02
0.03
0.04
0.05
Result: multi-qa-mpnet-base-dot-v1, 0.3995310245310245, 0.05
Time of execution: 32.92896842956543
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
0.01
0.02
0.03
0.04
0.05
Result: multi-qa-mpnet-base-dot-v1, 0.3995310245310245, 0.05
Time of execution: 30.456493854522705
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
0.01
0.02
0.03
0.04
0.05
Result: multi-qa-mpnet-base-dot-v1, 0.3995310245310245, 0.05
Time of execution: 29.388723611831665
Model: all-mpnet-base-v2, iteration: 0.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.01
0.04
0.05
Result: all-mpnet-base-v2, 0.3874323593073593, 0.05
Time of execution: 32.03138089179993
Model: all-mpnet-base-v2, iteration: 1.
0.01
0.04
0.05
Result: all-mpnet-base-v2, 0.3874323593073593, 0.05
Time of execution: 28.344942569732666
Model: all-mpnet-base-v2, iteration: 2.
0.01
0.04
0.05
Result: all-mpnet-base-v2, 0.3874323593073593, 0.05
Time of execution: 29.30491542816162
Model: all-MiniLM-L6-v2, iteration: 0.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.01
0.05
Result: all-MiniLM-L6-v2, 0.3213023088023088, 0.05
Time of execution: 21.20091414451599
Model: all-MiniLM-L6-v2, iteration: 1.
0.01
0.05
Result: all-MiniLM-L6-v2, 0.3213023088023088, 0.05
Time of execution: 17.507575273513794
Model: all-MiniLM-L6-v2, iteration: 2.
0.01
0.05
Result: all-MiniLM-L6-v2, 0.3213023088023088, 0.05
Time of execution: 17.62290668487549


In [10]:
# chosen cleaned by me

model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset='/source_repository/chosen_articles_cleaned_by_me.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    model_name, best_balanced, best_threshold, df_scores_with_names, similarities_for_categories=find_best_score_for_given_model(model_name, path_to_dataset, iter,
                                        list_of_keywords_all_categories, dataset_name='chosen_articles_cleaned_by_me',
                                        list_of_names=['political_and_religious_views', 'animal_care', 'business_involvement', 'corporate_transparency', 'environment', 'human_employee_rights', 'diversity_equity_inclusion'],
                                        path_to_save_results='/source_repository/',
                                        path_to_gt='articles_categories_my_gt_2.csv',
                                        )
    print(f"Time of execution: {time.time()-start_time}")

Model: all-distilroberta-v1, iteration: 0.
0.01
Result: all-distilroberta-v1, 0.2727317821067821, 0.01
Time of execution: 22.32935905456543
Model: all-distilroberta-v1, iteration: 1.
0.01
Result: all-distilroberta-v1, 0.2727317821067821, 0.01
Time of execution: 21.96389412879944
Model: all-distilroberta-v1, iteration: 2.
0.01
Result: all-distilroberta-v1, 0.2727317821067821, 0.01
Time of execution: 21.604864358901978
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.
0.01
0.02
Result: multi-qa-mpnet-base-dot-v1, 0.35110930735930734, 0.02
Time of execution: 31.68803095817566
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
0.01
0.02
Result: multi-qa-mpnet-base-dot-v1, 0.35110930735930734, 0.02
Time of execution: 31.642528772354126
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
0.01
0.02
Result: multi-qa-mpnet-base-dot-v1, 0.35110930735930734, 0.02
Time of execution: 31.109609842300415
Model: all-mpnet-base-v2, iteration: 0.
0.01
Result: all-mpnet-base-v2, 0.37909902597402595, 0.01
Time 

In [11]:
# chosen cleaned regex

model_3='all-MiniLM-L6-v2'
model_2='all-mpnet-base-v2'
model_1='multi-qa-mpnet-base-dot-v1'
model_0='all-distilroberta-v1'
model_list=[model_0, model_1, model_2, model_3]
path_to_dataset='/source_repository/chosen_articles_cleaned_regex.txt'

for model_name in model_list:
  for iter in range(3):
    print(f"Model: {model_name}, iteration: {iter}.")
    start_time=time.time()
    model_name, best_balanced, best_threshold, df_scores_with_names, similarities_for_categories=find_best_score_for_given_model(model_name, path_to_dataset, iter,
                                        list_of_keywords_all_categories, dataset_name='chosen_articles_cleaned_regex',
                                        list_of_names=['political_and_religious_views', 'animal_care', 'business_involvement', 'corporate_transparency', 'environment', 'human_employee_rights', 'diversity_equity_inclusion'],
                                        path_to_save_results='/source_repository/',
                                        path_to_gt='articles_categories_my_gt_2.csv',
                                        )
    print(f"Time of execution: {time.time()-start_time}")

Model: all-distilroberta-v1, iteration: 0.
0.01
0.02
0.05
Result: all-distilroberta-v1, 0.29906655844155844, 0.05
Time of execution: 23.032732009887695
Model: all-distilroberta-v1, iteration: 1.
0.01
0.02
0.05
Result: all-distilroberta-v1, 0.29906655844155844, 0.05
Time of execution: 21.70069670677185
Model: all-distilroberta-v1, iteration: 2.
0.01
0.02
0.05
Result: all-distilroberta-v1, 0.29906655844155844, 0.05
Time of execution: 23.51605248451233
Model: multi-qa-mpnet-base-dot-v1, iteration: 0.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.3637581168831169, 0.03
Time of execution: 31.64678406715393
Model: multi-qa-mpnet-base-dot-v1, iteration: 1.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.3637581168831169, 0.03
Time of execution: 31.743035316467285
Model: multi-qa-mpnet-base-dot-v1, iteration: 2.
0.01
0.02
0.03
Result: multi-qa-mpnet-base-dot-v1, 0.3637581168831169, 0.03
Time of execution: 32.096848249435425
Model: all-mpnet-base-v2, iteration: 0.
0.01
Result: all-m

In [12]:
df_scores_with_names

Unnamed: 0,political_and_religious_views,animal_care,business_involvement,corporate_transparency,environment,human_employee_rights,diversity_equity_inclusion,final_predictions
0,0.401775,0.415385,0.480775,0.369926,0.336862,0.728989,0.573706,human_employee_rights
1,0.245177,0.419855,0.456777,0.295328,0.341165,0.420632,0.416130,business_involvement
2,0.437144,0.405814,0.386792,0.450539,0.298972,0.589726,0.498862,human_employee_rights
3,0.714645,0.478279,0.475807,0.506222,0.574976,0.577940,0.649740,political_and_religious_views
4,0.452720,0.778988,0.827151,0.532973,0.599198,0.716796,0.658212,business_involvement
...,...,...,...,...,...,...,...,...
95,0.420488,0.403080,0.400790,0.390307,0.361050,0.316422,0.338499,other
96,0.132912,0.341683,0.615740,0.110586,0.198083,0.271530,0.319864,business_involvement
97,0.512845,0.546146,0.526181,0.468947,0.720806,0.495090,0.471015,environment
98,0.406666,0.462341,0.481220,0.424651,0.314466,0.522984,0.371589,human_employee_rights
