In [1]:

#from anaconda terminal: pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz

In [2]:
import spacy
import re
from scispacy.abbreviation import AbbreviationDetector
import pandas as pd
import json
from sklearn.model_selection import train_test_split

## Data Extraction

In [3]:
# Load the scored descriptions into a DataFrame.
with open('data/DoaScoring.json', encoding='utf-8-sig') as fh:
    data = json.load(fh)
data_df = pd.DataFrame(data)

# Hold out 10% of the rows for testing, then 20% of the remainder for
# validation; the fixed random_state keeps both splits reproducible.
X, X_test = train_test_split(data_df, test_size=0.1, random_state=42)
X_train, X_val = train_test_split(X, test_size=0.2, random_state=42)

# Report the size of each split.
for split_name, split_frame in (("test", X_test), ("train", X_train), ("val", X_val)):
    print(split_name, split_frame.shape)

test (39, 49)
train (276, 49)
val (70, 49)


# Functions

Given a list, returns a new list with the duplicates removed, keeping the first occurrence of each item.

In [4]:
def unique(item_list):
    """Return the items of ``item_list`` without duplicates, in first-seen order.

    Items are compared after stripping surrounding whitespace, and the stripped
    form is what gets stored. Storing the same form that is compared means
    ``" UN"`` and ``"UN"`` collapse into a single ``"UN"`` regardless of the
    order in which they appear.

    Args:
        item_list: iterable of strings.

    Returns:
        A new list of whitespace-stripped strings with duplicates removed.
    """
    seen = set()
    unique_list = []
    for item in item_list:
        key = item.strip()
        if key not in seen:
            seen.add(key)
            unique_list.append(key)
    return unique_list

Applies a regular expression to the input text to match the words that could potentially be acronyms.

In [5]:
def acronym_pattern(text):
    """Return every acronym-like token found in ``text``.

    A token qualifies when, starting at a word boundary, at least two
    uppercase letters can be reached while skipping only lowercase letters,
    digits and hyphens. The returned token is a run of letters/digits,
    optionally followed by further hyphen-joined runs (e.g. ``COVID-19``).

    Args:
        text: the string to scan.

    Returns:
        A list of matched substrings, in order of appearance (duplicates kept).
    """
    candidate_regex = r'\b(?=(?:[a-z\d-]*[A-Z]){2})[A-Za-z\d]+(?:-[A-Za-z\d]+)*\b'
    return re.compile(candidate_regex).findall(text)

Uses a custom heuristic to decide whether each candidate string is an acronym. The strings that qualify are collected in a list and returned.

In [6]:
def acronym_catcher (acronynm_unique_list):
    """Keep the candidates whose share of uppercase letters is high enough.

    A candidate is accepted when it contains at least one ASCII uppercase
    letter and its length (hyphens not counted) divided by the number of
    uppercase letters is below 3. An accepted candidate that ends in a
    lowercase ``s`` is stored without that final character (plural form).

    Args:
        acronynm_unique_list: iterable of candidate strings.

    Returns:
        A list with the accepted candidates, in input order.
    """
    accepted = []
    for candidate in acronynm_unique_list:
        capital_count = sum(1 for ch in candidate if 'A' <= ch <= 'Z')
        if capital_count == 0:
            continue
        # split("-") yields one more part than there are hyphens.
        hyphen_count = len(candidate.split("-")) - 1
        if (len(candidate) - hyphen_count) / capital_count >= 3:
            continue
        if re.search(r's$', candidate):
            # Drop the trailing plural "s".
            accepted.append(candidate.strip()[:-1])
        else:
            accepted.append(candidate)
    return accepted

Finds the acronyms in the input text that are accompanied by their expanded form and returns them in a list.

In [7]:
# Shared scispaCy pipeline. Loading the model is expensive, so it is built
# once on first use instead of once per document.
_ABBREVIATION_NLP = None


def _abbreviation_pipeline():
    """Return the cached ``en_core_sci_sm`` pipeline with the abbreviation detector added."""
    global _ABBREVIATION_NLP
    if _ABBREVIATION_NLP is None:
        nlp = spacy.load('en_core_sci_sm')
        nlp.add_pipe("abbreviation_detector")
        _ABBREVIATION_NLP = nlp
    return _ABBREVIATION_NLP


def acronym_expansion_detector(text):
    """Return the abbreviations that the abbreviation detector finds in ``text``.

    Args:
        text: the document to analyse.

    Returns:
        A list with ``str()`` of every entry in ``doc._.abbreviations``
        (duplicates are not removed here).
    """
    doc = _abbreviation_pipeline()(text)
    return [str(abrv) for abrv in doc._.abbreviations]

Runs spaCy NER over the text and returns the texts of the recognized entities. Acronyms that do not appear among these entities are later discarded: most common acronyms are not recognized by spaCy NER, so this is a way to remove common acronyms dynamically.

In [8]:
# Shared spaCy NER pipeline. Loading the model is expensive, so it is built
# once on first use instead of once per document.
_NER_NLP = None


def _ner_pipeline():
    """Return the cached ``en_core_web_sm`` pipeline."""
    global _NER_NLP
    if _NER_NLP is None:
        _NER_NLP = spacy.load("en_core_web_sm")
    return _NER_NLP


def spacy_ner(text):
    """Return the texts of all named entities in ``text`` as one string.

    Args:
        text: the document to analyse.

    Returns:
        A string in which every entity text is preceded by a single space
        (e.g. ``" UN UNICEF"``); the empty string when no entity is found.
    """
    doc = _ner_pipeline()(text)
    return "".join(f" {ent.text}" for ent in doc.ents)

Loads the list of ISO currency codes and uses it to weed out currency acronyms, since they count as common acronyms. Source: https://www.dst.dk/en/Statistik/dokumentation/nomenklaturer/valuta-iso

In [9]:
def currency_excluder (acronyms, currency_csv="data/iso-currency.csv"):
    """Drop the acronyms that are listed as currency codes.

    Args:
        acronyms: iterable of acronym strings.
        currency_csv: path of a ``;``-separated CSV file with a ``CODE``
            column holding the currency codes. Defaults to the file the
            notebook ships with.

    Returns:
        A list with the acronyms that are not in the ``CODE`` column, in
        input order. The comparison is exact (case-sensitive).
    """
    # Build the lookup once; membership tests then do not rescan the column
    # for every acronym.
    currency_codes = set(pd.read_csv(currency_csv, sep=";")["CODE"])
    return [acronym for acronym in acronyms if acronym not in currency_codes]

In [10]:

def acronym_detector(id,text):
    """Run the acronym pipeline on one document and report the results.

    Steps: regex candidate extraction, de-duplication, heuristic filtering,
    expanded-form detection, and a final filter that keeps only the acronyms
    occurring in the NER entity string. Progress is printed along the way.

    Args:
        id: identifier of the document; echoed back as ``opportunity_id``.
        text: the document text.

    Returns:
        A dict with the keys ``opportunity_id``, ``acronyms`` (the filtered
        acronyms) and ``expanded_acronyms`` (de-duplicated abbreviations
        reported by the expansion detector).
    """
    print(id)
    candidates = unique(acronym_pattern(text))
    filtered_acronym_unique_list = unique(acronym_catcher(candidates))

    print("acronyms found")
    print(filtered_acronym_unique_list)

    unique_expanded_acr = unique(acronym_expansion_detector(text))
    ner_string = spacy_ner(text)
    # Keep only the acronyms that show up in the NER output.
    # NOTE(review): this is a substring test on the joined entity texts, so a
    # short acronym is also kept when it is merely part of a longer entity
    # (e.g. "UN" inside "UNICEF") - confirm that this is intended.
    final_acronym_list = [
        acronym for acronym in filtered_acronym_unique_list if acronym in ner_string
    ]
    print("acronyms parsed")
    print(final_acronym_list)
    print("acronyms with expanded form")
    print(unique_expanded_acr)
    print("---")
    return {"opportunity_id":id,"acronyms":final_acronym_list,"expanded_acronyms":unique_expanded_acr}

In [11]:
# Run the acronym detector over every description in the detection dataset.
acronyms_processed = []
with open('data/DoaScoring-AcronymDetectionData.json', encoding='utf-8-sig') as fh:
    data = json.load(fh)
data_df = pd.DataFrame(data)

for index, doa in data_df.iterrows():
    # Join the free-text fields with single spaces into one document.
    doa_text = (
        f'{doa["task_description"]} {doa["organization_mission"]} {doa["context"]} '
        f'{doa["living_conditions"]} {doa["required_skill_experience"]}'
    )
    acronyms_processed.append(acronym_detector(doa["opportunity_id"], doa_text))

1720911047380224
acronyms found
['UN', 'AoR', 'UNICEF', 'CP', 'HRP', 'PIN', 'IMO', 'GBV', 'IM', 'CPIMS', 'AOR', 'KM', 'IDP', 'COVID-19', 'NGO']


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


acronyms parsed
['UN', 'AoR', 'UNICEF', 'CP', 'HRP', 'PIN', 'GBV', 'IM', 'CPIMS', 'AOR', 'KM', 'COVID-19', 'NGO']
acronyms with expanded form
['IDP']
---
1724754261095680
acronyms found
['UNV', 'UN', 'UNICEF', 'HQ', 'MOLISA', 'HIV', 'SDG', 'NGO']


  global_matches = self.global_matcher(doc)


acronyms parsed
['UNV', 'UN', 'UNICEF', 'HQ']
acronyms with expanded form
[]
---
1715975314313472
acronyms found
['UN', 'IPRCC', 'UNDP', 'UNV', 'IVD', 'COVID-19', 'SDG', 'SDG1', 'SDG5', 'VLA', '490RMB', 'NUNV']
acronyms parsed
['UN', 'IPRCC', 'UNDP', 'UNV', 'IVD', 'COVID-19', 'SDG', 'SDG1', 'SDG5', '490RMB', 'NUNV']
acronyms with expanded form
['UNDP', 'IPRCC', 'SDGs', 'VLA']
---
1715556939709696
acronyms found
['UN', 'ECM', 'CP', 'UNV', 'IVD', 'UNICEF', 'ICO', 'AGD', 'POCSO', 'CSO', 'VLA', 'INR', 'USD']
acronyms parsed
['UN', 'ECM', 'UNV', 'IVD', 'UNICEF', 'ICO', 'INR', 'USD']
acronyms with expanded form
['AGD', 'POCSO', 'VLA']
---
1742156550684928
acronyms found
['UNV', 'UNICEF', 'UNICEF-owned', 'ROSA', 'UN', 'AGD', 'HIV', 'AIDS', 'IT']
acronyms parsed
['UNV', 'UNICEF', 'ROSA', 'UN']
acronyms with expanded form
['AGD', 'ROSA']
---
1726265068252416
acronyms found
['KEY', 'END-RESULTS', 'EXPECTED', 'ADB', 'SP', 'WB', 'UN', 'UNV', 'UNICEF', 'COVID-19', 'ARTF', 'USD']
acronyms parsed
['W

## Check the result against the validation data

Import the file

In [12]:
# Load the acronym-detection labels that serve as validation data.
with open('data/DoaScoring-AcronymDetectionLabels.json', encoding='utf-8-sig') as labels_file:
    val_data = json.load(labels_file)

Extract the data needed for the validation.

In [13]:
# Build one validation record per labelled description.
val_acronyms = []
chatgpt_acronyms = []
for doa in val_data:
    ignored = doa["ignoreList"]
    # Filter instead of calling list.remove(): remove() drops only the first
    # occurrence and raises ValueError when the ignore list repeats an entry.
    final_acronyms_list = [acronym for acronym in doa["acronyms"] if acronym not in ignored]
    # Only the keys of the "chatgpt" mapping are kept.
    chatgpt_acronyms = list(doa["chatgpt"].keys())
    val_acronyms.append({"opportunity_id": doa["opportunity_id"],"acronyms":final_acronyms_list,"chatgpt":chatgpt_acronyms,"expanded_form":doa["expanded form"]})

Compare the results from the process with the validation data to get the accuracy and other metrics.

In [33]:
def _percentage(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    return (part / whole) * 100 if whole else 0.0


# Running totals over all validation records.
code_acc = 0
chatgpt_acc = 0
code_false_positive = 0
chatgpt_false_positive = 0
total_expanded_forms = 0
expanded_form_acc = 0

# Number of validation acronyms across all records; denominator of the
# "weighted" figures printed at the end.
total_acronyms = sum(len(doa["acronyms"]) for doa in val_acronyms)

for doa in val_acronyms:
    print(doa["opportunity_id"])
    # Number of validation acronyms in this record; denominator of the
    # per-record figures.
    total_acronyms_ar_mean = len(doa["acronyms"])
    acronyms_found = 0
    acronyms_found_chatgpt = 0
    acronyms_not_found = 0
    acronyms_not_found_chatgpt = 0
    print("validation acronyms",doa["acronyms"])
    # Acronyms are compared case-insensitively.
    doa_acronym_list = [acronym.upper() for acronym in doa["acronyms"]]
    # Each "expanded_form" entry is a mapping; its first key is used.
    doa_acronym_expanded_list = [list(entry.keys())[0] for entry in doa["expanded_form"]]
    total_expanded_forms += len(doa_acronym_expanded_list)

    for doa_processed in acronyms_processed:
        if doa_processed["opportunity_id"] != doa["opportunity_id"]:
            continue
        print("processed acronyms",doa_processed["acronyms"])
        for acronym in doa_processed["acronyms"]:
            if acronym.upper() in doa_acronym_list:
                acronyms_found +=1
            else:
                acronyms_not_found +=1
        for expanded_acronym in doa_processed["expanded_acronyms"]:
            for expanded_validation in doa_acronym_expanded_list:
                # Uppercase both sides, like the acronym comparison above;
                # otherwise mixed-case forms such as "SDGs" can never match.
                if expanded_acronym.upper() == expanded_validation.upper():
                    expanded_form_acc += 1

    for acronym in doa["chatgpt"]:
        if acronym.upper() in doa_acronym_list:
            acronyms_found_chatgpt +=1
        else:
            acronyms_not_found_chatgpt +=1

    code_acc += _percentage(acronyms_found, total_acronyms)
    chatgpt_acc += _percentage(acronyms_found_chatgpt, total_acronyms)
    code_false_positive += _percentage(acronyms_not_found, total_acronyms)
    chatgpt_false_positive += _percentage(acronyms_not_found_chatgpt, total_acronyms)

    # _percentage() guards records that have no validation acronyms.
    print(f"code arithmetic mean acc: {_percentage(acronyms_found, total_acronyms_ar_mean)}%")
    print(f"chatgpt arithmetic mean acc: {_percentage(acronyms_found_chatgpt, total_acronyms_ar_mean)}%")
    print(f"code arithmetic mean false positive: {_percentage(acronyms_not_found, total_acronyms_ar_mean)}%")
    print(f"chatgpt arithmetic mean false positive: {_percentage(acronyms_not_found_chatgpt, total_acronyms_ar_mean)}%")
    print("-----")
print(total_expanded_forms)
print(f"weighted arithmetic code accuracy {code_acc}%")
print(f"weighted arithmetic chatgpt accuracy {chatgpt_acc}%")
print(f"weighted arithmetic code false postives {code_false_positive}%")
print(f"weighted arithmetic chatgpt false positives {chatgpt_false_positive}%")
print(f"wighted arithmetic acronym expanded form detection accuracy {_percentage(expanded_form_acc, total_expanded_forms)}%")

1720911047380224
validation acronyms ['AoR', 'UNICEF', 'CP', 'HRP', 'PIN', 'IMO', 'UNICEF', 'GBV', 'IM', 'CPIMS+', '5Ws', 'IM/KM', 'IDP']
processed acronyms ['UN', 'AoR', 'UNICEF', 'CP', 'HRP', 'PIN', 'GBV', 'IM', 'CPIMS', 'AOR', 'KM', 'COVID-19', 'NGO']
code arithmetic mean acc: 61.53846153846154%
chatgpt arithmetic mean acc: 69.23076923076923%
code arithmetic mean false positive: 38.46153846153847%
chatgpt arithmetic mean false positive: 7.6923076923076925%
-----
1724754261095680
validation acronyms ['UNV', 'UNICEF', 'HQ', 'MOLISA']
processed acronyms ['UNV', 'UN', 'UNICEF', 'HQ']
code arithmetic mean acc: 75.0%
chatgpt arithmetic mean acc: 50.0%
code arithmetic mean false positive: 25.0%
chatgpt arithmetic mean false positive: 50.0%
-----
1715975314313472
validation acronyms ['IPRCC', 'UNDP', 'UNV', 'IVD', 'SDG', '(I)NGO', 'SDG1', 'SDG5', 'VLA']
processed acronyms ['UN', 'IPRCC', 'UNDP', 'UNV', 'IVD', 'COVID-19', 'SDG', 'SDG1', 'SDG5', '490RMB', 'NUNV']
code arithmetic mean acc: 77.