B.L.
This script assigns each term in your word list a domain-specificity score, computed from its similarity to the reference set we crafted manually.

In [1]:
import pandas as pd
import stanza
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Lemmatizer: fetch the English Stanza models, then build a tokenize+lemma pipeline
# that runs on the GPU only when CUDA is available.
stanza.download('en')
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,lemma',
    use_gpu=torch.cuda.is_available(),
)

# Encoder: ConfliBERT tokenizer and model, both loaded from the same checkpoint id.
CONFLIBERT_CHECKPOINT = "snowood1/ConfliBERT-cont-uncased"
tokenizer = AutoTokenizer.from_pretrained(CONFLIBERT_CHECKPOINT)
model = AutoModel.from_pretrained(CONFLIBERT_CHECKPOINT)
model.eval()  # inference mode (eval() returns the same module, so `model` is unchanged)

# Inference only: switch autograd off globally for the rest of the notebook.
torch.set_grad_enabled(False)

#Load the Excel workbook:
#  - first column: the reference set (provided in the GitHub repo)
#  - second column: the list of words from your dataset. You will have this list ready after running the
#    script "Create resouces, automatic annotation for domain,rare and comparison.ipynb", which is included in the GitHub repo.

file_path = r"List of domain specific (reference set and the extended one).xlsx"
df = pd.read_excel(file_path)

# Fail early with a readable message instead of an opaque positional IndexError below.
if df.shape[1] < 2:
    raise ValueError(
        f"Expected at least two columns in {file_path!r} "
        f"(reference set, terms to score); found {df.shape[1]}."
    )

#extract columns
# Reference terms are cast to str (numeric cells would otherwise reach the tokenizer as
# non-strings) and stripped; blank cells are dropped because dropna() only removes NaN,
# not empty or whitespace-only strings.
reference_column = df.iloc[:, 0].dropna().astype(str).str.strip()
reference_terms = reference_column[reference_column != ""].unique().tolist()
# Kept as a plain list for inspection; the scoring step reads the column from df directly.
domain_specific_terms = df.iloc[:, 1].tolist()

# Compute embeddings for reference terms (efficient batching)
def get_embeddings(terms, batch_size=32, max_length=512):
    """Encode terms into L2-normalised, mean-pooled embeddings.

    Each batch is tokenized with padding, passed through the module-level
    ``model``, and pooled by averaging ``last_hidden_state`` over the positions
    whose attention mask is 1 (padding is excluded). Every row is then divided
    by its L2 norm, so a dot product between two rows is their cosine similarity.

    Args:
        terms: Sequence of strings to encode.
        batch_size: Number of terms per forward pass (default 32).
        max_length: Token limit passed to the tokenizer together with
            ``truncation=True``. Without an explicit limit the tokenizer warns
            ("Asking to truncate to max_length ...") and truncation may not be
            applied. 512 is presumably the encoder's position limit -- TODO
            confirm against ``model.config.max_position_embeddings``.

    Returns:
        A tensor with one row per term, in input order.

    Raises:
        ValueError: If ``terms`` is empty or ``batch_size`` is smaller than 1.
    """
    terms = list(terms)
    if not terms:
        # torch.cat() on an empty list fails with a message that hides the real cause.
        raise ValueError("get_embeddings() received no terms to encode.")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}.")

    embeddings = []
    for i in tqdm(range(0, len(terms), batch_size), desc="Encoding terms"):
        batch = terms[i:i + batch_size]
        encoded = tokenizer(
            batch,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        output = model(**encoded)
        # (batch, seq, 1) mask so padded positions contribute nothing to the sum.
        mask = encoded['attention_mask'].unsqueeze(-1)
        summed = (output.last_hidden_state * mask).sum(dim=1)
        token_counts = mask.sum(dim=1)
        embeddings_batch = summed / token_counts
        embeddings_batch = embeddings_batch / embeddings_batch.norm(dim=1, keepdim=True)
        embeddings.append(embeddings_batch)
    return torch.cat(embeddings)

# Encode the reference set once up front; calculate_similarity() reads this matrix as a
# module-level global and compares every lemma embedding against it.
ref_embeddings = get_embeddings(reference_terms)

#similarity function using stanza lemmatization
def calculate_similarity(term):
    """Score one term against the reference embeddings.

    The term is lemmatized with the module-level Stanza pipeline ``nlp``. Each
    purely alphabetic lemma is lower-cased, encoded with ``tokenizer``/``model``
    (mean of ``last_hidden_state`` over all positions, then L2-normalised) and
    compared with every row of ``ref_embeddings``. The score is the mean, over
    lemmas, of each lemma's highest cosine similarity.

    Args:
        term: Cell value to score. Anything that is not a non-blank string
            (None, NaN, numbers, list-like cells, "") scores 0.0.

    Returns:
        The score rounded to 3 decimals, or 0.0 when the term is not usable or
        yields no alphabetic lemma.
    """
    # isinstance() already rejects None/NaN/numbers; unlike pd.isna() it never returns an
    # array for list-like cells, which would make the boolean test raise.
    if not isinstance(term, str) or not term.strip():
        return 0.0

    #lemmatize the term using stanza
    doc = nlp(term)
    lemmas = [
        word.lemma.lower()
        for sent in doc.sentences
        for word in sent.words
        # Skip words with no lemma (None/empty) instead of failing on .isalpha().
        if word.lemma and word.lemma.isalpha()
    ]
    if not lemmas:
        return 0.0

    lemma_embeddings = []
    for lemma in lemmas:
        # One sequence per call, so there is no padding and a plain mean over positions suffices.
        encoded = tokenizer(lemma, return_tensors="pt")
        output = model(**encoded)
        lemma_embedding = output.last_hidden_state.mean(dim=1)
        lemma_embedding = lemma_embedding / lemma_embedding.norm(p=2, dim=1, keepdim=True)
        lemma_embeddings.append(lemma_embedding[0])

    lemma_embeddings = torch.stack(lemma_embeddings)
    # Rows on both sides are unit length, so the matrix product gives cosine similarities.
    cosine_similarities = torch.mm(lemma_embeddings, ref_embeddings.T)
    max_sims = cosine_similarities.max(dim=1).values
    return round(max_sims.mean().item(), 3)

# Where the scored workbook is written.
output_file_path = r"domain_terms_similarity_scored.xlsx"

# Register progress_apply on pandas objects so the scoring loop shows a progress bar.
tqdm.pandas(desc="Calculating similarity scores")

# Score every entry of the second column and keep the result in a new column 'c'.
candidate_terms = df.iloc[:, 1]
df['c'] = candidate_terms.progress_apply(calculate_similarity)

# Persist the frame (without the index) and report where it went.
df.to_excel(output_file_path, index=False)
print(f"Similarity calculation complete. Results saved to {output_file_path}.")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-27 19:16:09 INFO: Downloaded file to C:\Users\brike\stanza_resources\resources.json
2025-05-27 19:16:09 INFO: Downloading default packages for language: en (English) ...
2025-05-27 19:16:11 INFO: File exists: C:\Users\brike\stanza_resources\en\default.zip
2025-05-27 19:16:14 INFO: Finished downloading models and saved to C:\Users\brike\stanza_resources
2025-05-27 19:16:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-27 19:16:15 INFO: Downloaded file to C:\Users\brike\stanza_resources\resources.json
2025-05-27 19:16:15 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| lemma     | combined_nocharlm |

2025-05-27 19:16:15 INFO: Using device: cpu
2025-05-27 19:16:15 INFO: Loading: tokenize
2025-05-27 19:16:15 INFO: Loading: mwt
2025-05-27 19:16:15 INFO: Loading: lemma
2025-05-27 19:16:16 INFO: Done loading processors!
Some weights of BertModel were not initialized from the model checkpoint at snowood1/ConfliBERT-cont-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding terms:   0%|                                                                           | 0/19 [00:00<?, ?it/s]Asking to truncate to max_length

Similarity calculation complete. Results saved to C:\Users\brike\OneDrive\Desktop\RA\Upload projekti\domain_terms_similarity_scored.xlsx.
