In [1]:
import pandas as pd
from TELF.pre_processing.Vulture import Vulture
from TELF.pre_processing.Vulture.modules import SimpleCleaner
from TELF.pre_processing.Vulture.modules import LemmatizeCleaner
from TELF.pre_processing.Vulture.modules import SubstitutionCleaner
from TELF.pre_processing.Vulture.modules import RemoveNonEnglishCleaner
from TELF.pre_processing.Vulture.default_stop_words import  STOP_WORDS
from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES

from TELF.pre_processing.Vulture.tokens_analysis.acronyms import find_acronyms

In [2]:
# Long multi-acronym paragraph; it is appended as the last entry of `text_list_varying`.
# Every acronym is introduced as "Long Form (ACRONYM)". The n-gram sizes quoted in the
# prose count the acronym token itself (e.g. "United States (US)" is called a 3-gram).
sample_text = """In the fast-paced modern world, acronyms serve as concise references to complex phrases, 
spanning various domains and industries. Let's delve into a collection of these abbreviations, 
highlighting their diverse applications. Starting with geography and sovereignty, the "United States (US)" is commonly abbreviated, 
representing a 3-gram. In the intricate world of computing and electronics, we encounter the "Central Processing Unit (CPU)," 
an essential 4-gram, alongside the "Random Access Memory (RAM)" and the "Global Positioning System (GPS)," each a 4-gram as well. 
The realm of science gives us the "Light Amplification by Stimulated Emission of Radiation (LASER)," a detailed 8-gram explaining a technology 
used in countless applications. Diving enthusiasts might recognize the "Self-Contained Underwater Breathing Apparatus (SCUBA)," 
a 6-gram describing essential diving gear. Navigation and detection technology are encapsulated in "Radio Detection And Ranging (RADAR)," 
a concise 5-gram. The digital marketing sphere often references "Search Engine Optimization (SEO)," a 4-gram critical for enhancing online visibility. 
Everyday financial transactions might involve an "Automated Teller Machine (ATM)," also a 4-gram, which has simplified banking operations. 
The building block of life is encoded in "Deoxyribonucleic Acid (DNA)," a 3-gram fundamental to genetics. 
Communication technologies have been revolutionized by "Voice over Internet Protocol (VoIP)," a 5-gram facilitating voice communication over the internet.
In the medical field, "Magnetic Resonance Imaging (MRI)" is a 4-gram that provides unparalleled insights into the human body's interior. 
The "Light Emitting Diode (LED)" technology, a 4-gram, is widely adopted for its efficiency in lighting.
The gaming and graphic design industries heavily rely on the "Graphics Processing Unit (GPU)," 
a 4-gram critical for rendering high-quality images. Lastly, the development of dynamic web applications is often 
powered by "Asynchronous JavaScript and XML (AJAX)," a 5-gram that has significantly enhanced web interactivity.
"""

In [3]:
# Test corpus: short sentences that mention each acronym in varying forms
# ("Long Form (ACR)", "ACR, or Long Form", bare "ACR"), followed by the long
# multi-acronym paragraph held in `sample_text`.
text_list_varying = [
    # Central Processing Unit / CPU
    "The Central Processing Unit (CPU) is the brain of any computer, tasked with executing commands from software applications.",
    "Every CPU, or Central Processing Unit, has a clock speed, measured in GHz, which influences how fast it can process information.",
    "In modern computing, the efficiency of a Central Processing Unit (CPU) , the Central Processing Unit, significantly impacts overall system performance.",
    # Random Access Memory / RAM
    "Random Access Memory (RAM) provides the necessary workspace for your computer's CPU to process data, affecting speed and multitasking capabilities.",
    "Upgrading the RAM, or Random Access Memory, can lead to noticeable improvements in a computer's responsiveness and ability to run multiple applications simultaneously.",
    # Global Positioning System / GPS
    "The Global Positioning System (GPS) allows devices to determine their exact location on Earth, crucial for navigation and mapping services.",
    # Light Emitting Diode(s) / LED(s)
    "Light Emitting Diodes (LEDs) are widely used in displays, signage, and general lighting due to their energy efficiency and longevity.",
    "Compared to traditional bulbs, LEDs, or Light Emitting Diodes, consume significantly less electricity, making them a cost-effective lighting solution.",
    "The versatility of LED technology, or Light Emitting Diodes, has seen its application extend from simple indicators to full-scale digital displays.",
    "Innovations in LED, Light Emitting Diode technology, continue to push the boundaries of what is possible with lighting, impacting various industries.",
    # Deoxyribonucleic Acid / DNA
    "Deoxyribonucleic Acid (DNA) holds the genetic blueprint for the development, functioning, and reproduction of living organisms.",
    "Recent advancements in DNA sequencing technology have revolutionized our understanding of genetics, enabling detailed analysis of the Deoxyribonucleic Acid in various species.",
    # Long paragraph covering many acronyms at once
    sample_text,
]

# One row per text, in list order, under a single 'Text' column.
df = pd.DataFrame({'Text': text_list_varying})

In [4]:
# Shared Vulture instance used by every vulture.clean(...) call in this notebook.
# It is constructed with n_jobs=-1 and verbose=True; the recorded logs of the later
# cleaning cells report 12 parallel workers.
vulture = Vulture(n_jobs=-1, verbose=True)
# Disabled alternative: clean the 'Text' column through the DataFrame API
# (vulture.clean_dataframe) instead of the dict-based vulture.clean(...) used later.
# It is commented out, so `df` is NOT modified by this cell.
# dataframe_clean_args = {
#     "df": df,
#     "steps": vulture.DEFAULT_PIPELINE,
#     "substitutions": None,
#     "columns": ['Text'],
#     "append_to_original_df": True,
#     "concat_cleaned_cols": True,
# }
# df = vulture.clean_dataframe(**dataframe_clean_args) 
# Summarize the still-raw DataFrame (the recorded output shows 13 rows and one 'Text' column).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    13 non-null     object
dtypes: object(1)
memory usage: 236.0+ bytes


In [5]:
from TELF.pre_processing.Vulture.modules import AcronymCleaner


In [6]:
# Key every raw text by its position in the corpus: {doc_id: text}.
documents = dict(enumerate(text_list_varying))

# Stage 1: RemoveNonEnglishCleaner -> SimpleCleaner -> LemmatizeCleaner.
# NOTE(review): in the recorded run, documents 3 and 7 come out of this stage as
# empty strings, and "LED" ends up as "lead" in documents 8 and 9 -- worth
# confirming that this is intended before relying on the acronym results.
steps = [
    RemoveNonEnglishCleaner(ascii_ratio=0.9, stopwords_ratio=0.25),
    SimpleCleaner(
        stop_words=STOP_WORDS,
        stop_phrases=STOP_PHRASES,
        # Names of the SimpleCleaner operations, in the sequence handed to `order`.
        order=[
            'standardize_hyphens',
            'isolate_frozen',
            'remove_copyright_statement',
            'remove_stop_phrases',
            'make_lower_case',
            'remove_formulas',
            'normalize',
            'remove_next_line',
            'remove_email',
            'remove_()',
            'remove_[]',
            'remove_special_characters',
            'remove_nonASCII_boundary',
            'remove_nonASCII',
            'remove_tags',
            'remove_stop_words',
            'remove_standalone_numbers',
            'remove_extra_whitespace',
            'min_characters',
        ],
    ),
    LemmatizeCleaner('spacy'),
]
cleaned_documents = vulture.clean(documents, steps=steps)

# Stage 2: acronym pass over the cleaned text.
# NOTE(review): despite its name, the recorded value of `acronyms` is a
# {doc_id: text} dict of documents, not a mapping of acronyms.
steps_2 = [
    AcronymCleaner(merge_sim_threshold=0.9, verbose=True),
]
acronyms = vulture.clean(cleaned_documents, steps=steps_2)

# TODO(review): a third stage built on SubstitutionCleaner(acronyms, lemmatize=False)
# was drafted here as commented-out code. The draft was not valid Python (an
# assignment inside a list literal) and re-ran `steps` instead of its own step
# list, so it has been dropped until it is implemented properly.
 


[Vulture]: Cleaning 13 documents
  0%|          | 0/3 [00:00<?, ?it/s][Vulture]: Running RemoveNonEnglishCleaner module
[Parallel(n_jobs=12)]: Using backend MultiprocessingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   4 out of  13 | elapsed:   25.8s remaining:   58.0s
[Parallel(n_jobs=12)]: Done  13 out of  13 | elapsed:  1.2min finished
 33%|███▎      | 1/3 [01:11<02:22, 71.35s/it][Vulture]: Running SimpleCleaner module
[Parallel(n_jobs=12)]: Using backend MultiprocessingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   4 out of  13 | elapsed:   30.2s remaining:  1.1min
[Parallel(n_jobs=12)]: Done  13 out of  13 | elapsed:  1.3min finished
 67%|██████▋   | 2/3 [02:31<01:16, 76.77s/it][Vulture]: Running LemmatizeCleaner module
[Parallel(n_jobs=12)]: Using backend MultiprocessingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   4 out of  13 | elapsed:   32.7s remaining:  1.2min
[Parallel(n_jobs=12)]: Done  13 out of  13 | elapsed:  1.4

run sub starts:  (0, 'central processing unit cpu computer task execute command software')
run sub starts:  {0: 'central processing unit cpu computer task execute command software'}
single sub else:  0
before words documents {0: 'central processing unit cpu computer task execute command software'}
before words doc_id 0


100%|██████████| 1/1 [00:00<00:00, 11096.04it/s]
100%|██████████| 1/1 [00:00<00:00, 15650.39it/s]
100%|██████████| 1/1 [00:00<00:00, 18558.87it/s]
100%|██████████| 1/1 [00:00<00:00, 18477.11it/s]
100%|██████████| 1/1 [00:00<00:00, 23301.69it/s]
100%|██████████| 1/1 [00:00<00:00, 24244.53it/s]
100%|██████████| 1/1 [00:00<00:00, 19239.93it/s]


run sub starts:  (1, 'cpu central processing unit clock speed ghz influence information')
run sub starts:  {1: 'cpu central processing unit clock speed ghz influence information'}
single sub else:  1
empty 2


100%|██████████| 1/1 [00:00<00:00, 20068.44it/s]
100%|██████████| 1/1 [00:00<00:00, 33554.43it/s]
100%|██████████| 1/1 [00:00<00:00, 38130.04it/s]
100%|██████████| 1/1 [00:00<00:00, 30393.51it/s]
100%|██████████| 1/1 [00:00<00:00, 37449.14it/s]
100%|██████████| 1/1 [00:00<00:00, 41527.76it/s]
100%|██████████| 1/1 [00:00<00:00, 43240.25it/s]


run sub starts:  (2, 'modern computing efficiency central processing unit cpu central processing unit impact')
run sub starts:  {2: 'modern computing efficiency central processing unit cpu central processing unit impact'}
single sub else:  2
before words documents {2: 'modern computing efficiency central processing unit cpu central processing unit impact'}
before words doc_id 2


100%|██████████| 1/1 [00:00<00:00, 19691.57it/s]
100%|██████████| 1/1 [00:00<00:00, 30393.51it/s]
100%|██████████| 1/1 [00:00<00:00, 32263.88it/s]
100%|██████████| 1/1 [00:00<00:00, 33288.13it/s]
100%|██████████| 1/1 [00:00<00:00, 41527.76it/s]
100%|██████████| 1/1 [00:00<00:00, 43240.25it/s]
100%|██████████| 1/1 [00:00<00:00, 47662.55it/s]


run sub starts:  (3, '')
run sub starts:  {3: ''}
single sub else:  3
empty 2


100%|██████████| 1/1 [00:00<00:00, 25731.93it/s]
100%|██████████| 1/1 [00:00<00:00, 52428.80it/s]
100%|██████████| 1/1 [00:00<00:00, 49932.19it/s]
100%|██████████| 1/1 [00:00<00:00, 45590.26it/s]
100%|██████████| 1/1 [00:00<00:00, 59074.70it/s]
100%|██████████| 1/1 [00:00<00:00, 61680.94it/s]
100%|██████████| 1/1 [00:00<00:00, 65536.00it/s]
[Parallel(n_jobs=12)]: Done   4 out of  13 | elapsed:   31.3s remaining:  1.2min


run sub starts:  (4, 'upgrade ram random access memory noticeable improvement computer responsiveness ability multiple simultaneously')
run sub starts:  {4: 'upgrade ram random access memory noticeable improvement computer responsiveness ability multiple simultaneously'}
single sub else:  4
empty 2


100%|██████████| 1/1 [00:00<00:00, 20867.18it/s]
100%|██████████| 1/1 [00:00<00:00, 30393.51it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]
100%|██████████| 1/1 [00:00<00:00, 34379.54it/s]
100%|██████████| 1/1 [00:00<00:00, 41943.04it/s]
100%|██████████| 1/1 [00:00<00:00, 43240.25it/s]
100%|██████████| 1/1 [00:00<00:00, 47662.55it/s]


run sub starts:  (5, 'global positioning gps device exact location earth crucial navigation mapping service')
run sub starts:  {5: 'global positioning gps device exact location earth crucial navigation mapping service'}
single sub else:  5
empty 2


100%|██████████| 1/1 [00:00<00:00, 20460.02it/s]
100%|██████████| 1/1 [00:00<00:00, 31300.78it/s]
100%|██████████| 1/1 [00:00<00:00, 37117.73it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]
100%|██████████| 1/1 [00:00<00:00, 41943.04it/s]
100%|██████████| 1/1 [00:00<00:00, 43690.67it/s]
100%|██████████| 1/1 [00:00<00:00, 47662.55it/s]


run sub starts:  (6, 'emit diode led display signage lighting efficiency longevity')
run sub starts:  {6: 'emit diode led display signage lighting efficiency longevity'}
single sub else:  6
empty 2


100%|██████████| 1/1 [00:00<00:00, 19239.93it/s]
100%|██████████| 1/1 [00:00<00:00, 32017.59it/s]
100%|██████████| 1/1 [00:00<00:00, 38130.04it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]
100%|██████████| 1/1 [00:00<00:00, 45590.26it/s]
100%|██████████| 1/1 [00:00<00:00, 50533.78it/s]
100%|██████████| 1/1 [00:00<00:00, 52428.80it/s]


run sub starts:  (7, '')
run sub starts:  {7: ''}
single sub else:  7
empty 2


100%|██████████| 1/1 [00:00<00:00, 15650.39it/s]
100%|██████████| 1/1 [00:00<00:00, 29330.80it/s]
100%|██████████| 1/1 [00:00<00:00, 53092.46it/s]
100%|██████████| 1/1 [00:00<00:00, 43690.67it/s]
100%|██████████| 1/1 [00:00<00:00, 62601.55it/s]
100%|██████████| 1/1 [00:00<00:00, 71089.90it/s]
100%|██████████| 1/1 [00:00<00:00, 67650.06it/s]


run sub starts:  (8, 'versatility lead technology emit diode extend indicator digital display')
run sub starts:  {8: 'versatility lead technology emit diode extend indicator digital display'}
single sub else:  8
empty 2


100%|██████████| 1/1 [00:00<00:00, 21399.51it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]
100%|██████████| 1/1 [00:00<00:00, 36792.14it/s]
100%|██████████| 1/1 [00:00<00:00, 43690.67it/s]
100%|██████████| 1/1 [00:00<00:00, 47662.55it/s]
100%|██████████| 1/1 [00:00<00:00, 53092.46it/s]


run sub starts:  (9, 'innovation lead emit diode technology continue push boundary light impact industry')
run sub starts:  {9: 'innovation lead emit diode technology continue push boundary light impact industry'}
single sub else:  9
empty 2


100%|██████████| 1/1 [00:00<00:00, 18477.11it/s]
100%|██████████| 1/1 [00:00<00:00, 33554.43it/s]
100%|██████████| 1/1 [00:00<00:00, 35544.95it/s]
100%|██████████| 1/1 [00:00<00:00, 34663.67it/s]
100%|██████████| 1/1 [00:00<00:00, 41943.04it/s]
100%|██████████| 1/1 [00:00<00:00, 45590.26it/s]
100%|██████████| 1/1 [00:00<00:00, 47662.55it/s]


run sub starts:  (10, 'deoxyribonucleic acid dna hold genetic blueprint development function reproduction living organism')
run sub starts:  {10: 'deoxyribonucleic acid dna hold genetic blueprint development function reproduction living organism'}
single sub else:  10
empty 2


100%|██████████| 1/1 [00:00<00:00, 20971.52it/s]
100%|██████████| 1/1 [00:00<00:00, 30393.51it/s]
100%|██████████| 1/1 [00:00<00:00, 31300.78it/s]
100%|██████████| 1/1 [00:00<00:00, 28532.68it/s]
100%|██████████| 1/1 [00:00<00:00, 37117.73it/s]
100%|██████████| 1/1 [00:00<00:00, 41527.76it/s]
100%|██████████| 1/1 [00:00<00:00, 41527.76it/s]
100%|██████████| 1/1 [00:00<00:00, 4185.93it/s]
100%|██████████| 1/1 [00:00<00:00, 8355.19it/s]
100%|██████████| 1/1 [00:00<00:00, 8774.69it/s]
100%|██████████| 1/1 [00:00<00:00, 8943.08it/s]
100%|██████████| 1/1 [00:00<00:00, 4609.13it/s]
100%|██████████| 1/1 [00:00<00:00, 3279.36it/s]
100%|██████████| 1/1 [00:00<00:00, 4519.72it/s]
100%|██████████| 1/1 [00:00<00:00, 12985.46it/s]
100%|██████████| 1/1 [00:00<00:00, 20460.02it/s]
100%|██████████| 1/1 [00:00<00:00, 21399.51it/s]
100%|██████████| 1/1 [00:00<00:00, 31068.92it/s]
100%|██████████| 1/1 [00:00<00:00, 37117.73it/s]
100%|██████████| 1/1 [00:00<00:00, 38479.85it/s]
100%|██████████| 1/1 [00:00

run sub starts:  (11, 'advancement dna sequence technology revolutionize understand genetic enable detailed analysis deoxyribonucleic acid specie')
run sub starts:  (12, 'modern acronym serve concise complex phrase span domain industry delve collection abbreviation highlight diverse start geography sovereignty united states commonly abbreviate -gram intricate computing electronic encounter central processing unit cpu essential -gram alongside random access memory ram global positioning gps -gram realm science amplification stimulate emission radiation laser detail -gram explain technology countless diving enthusiast recognize underwater breathing apparatus scuba -gram describe essential diving gear navigation technology encapsulate radio range radar concise -gram digital marketing sphere search engine optimization seo -gram critical enhance online visibility everyday financial transaction involve automated teller machine atm -gram simplify banking operation building block life encode d




In [7]:
# Stand-alone acronym pass over `cleaned_documents`: builds a fresh AcronymCleaner
# and rebinds both `steps_2` and `acronyms`.
steps_2 = [AcronymCleaner(merge_sim_threshold=0.9, verbose=True)]
acronyms = vulture.clean(cleaned_documents, steps=steps_2)

[Vulture]: Cleaning 13 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running AcronymCleaner module
[Parallel(n_jobs=12)]: Using backend MultiprocessingBackend with 12 concurrent workers.


run sub starts:  (0, 'central processing unit cpu computer task execute command software')
run sub starts:  {0: 'central processing unit cpu computer task execute command software'}
single sub else:  0
before words documents {0: 'central processing unit cpu computer task execute command software'}
before words doc_id 0


100%|██████████| 1/1 [00:00<00:00, 20867.18it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]
100%|██████████| 1/1 [00:00<00:00, 36792.14it/s]
100%|██████████| 1/1 [00:00<00:00, 35544.95it/s]
100%|██████████| 1/1 [00:00<00:00, 43690.67it/s]
100%|██████████| 1/1 [00:00<00:00, 45100.04it/s]
100%|██████████| 1/1 [00:00<00:00, 49932.19it/s]


run sub starts:  (1, 'cpu central processing unit clock speed ghz influence information')
run sub starts:  {1: 'cpu central processing unit clock speed ghz influence information'}
single sub else:  1
empty 2


100%|██████████| 1/1 [00:00<00:00, 19972.88it/s]
100%|██████████| 1/1 [00:00<00:00, 33026.02it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]
100%|██████████| 1/1 [00:00<00:00, 34379.54it/s]
100%|██████████| 1/1 [00:00<00:00, 43240.25it/s]
100%|██████████| 1/1 [00:00<00:00, 47662.55it/s]
100%|██████████| 1/1 [00:00<00:00, 53092.46it/s]


run sub starts:  (2, 'modern computing efficiency central processing unit cpu central processing unit impact')
run sub starts:  {2: 'modern computing efficiency central processing unit cpu central processing unit impact'}
single sub else:  2
before words documents {2: 'modern computing efficiency central processing unit cpu central processing unit impact'}
before words doc_id 2


100%|██████████| 1/1 [00:00<00:00, 22192.08it/s]
100%|██████████| 1/1 [00:00<00:00, 31300.78it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]
100%|██████████| 1/1 [00:00<00:00, 34379.54it/s]
100%|██████████| 1/1 [00:00<00:00, 41527.76it/s]
100%|██████████| 1/1 [00:00<00:00, 43690.67it/s]
100%|██████████| 1/1 [00:00<00:00, 47662.55it/s]


run sub starts:  (3, '')
run sub starts:  {3: ''}
single sub else:  3
empty 2


100%|██████████| 1/1 [00:00<00:00, 26379.27it/s]
100%|██████████| 1/1 [00:00<00:00, 47127.01it/s]
100%|██████████| 1/1 [00:00<00:00, 52428.80it/s]
100%|██████████| 1/1 [00:00<00:00, 45100.04it/s]
100%|██████████| 1/1 [00:00<00:00, 59074.70it/s]
100%|██████████| 1/1 [00:00<00:00, 62601.55it/s]
100%|██████████| 1/1 [00:00<00:00, 71089.90it/s]
[Parallel(n_jobs=12)]: Done   4 out of  13 | elapsed:   32.0s remaining:  1.2min


run sub starts:  (4, 'upgrade ram random access memory noticeable improvement computer responsiveness ability multiple simultaneously')
run sub starts:  {4: 'upgrade ram random access memory noticeable improvement computer responsiveness ability multiple simultaneously'}
single sub else:  4
empty 2


100%|██████████| 1/1 [00:00<00:00, 17331.83it/s]
100%|██████████| 1/1 [00:00<00:00, 33554.43it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]
100%|██████████| 1/1 [00:00<00:00, 33288.13it/s]
100%|██████████| 1/1 [00:00<00:00, 41527.76it/s]
100%|██████████| 1/1 [00:00<00:00, 43690.67it/s]
100%|██████████| 1/1 [00:00<00:00, 45100.04it/s]


run sub starts:  (5, 'global positioning gps device exact location earth crucial navigation mapping service')
run sub starts:  {5: 'global positioning gps device exact location earth crucial navigation mapping service'}
single sub else:  5
empty 2


100%|██████████| 1/1 [00:00<00:00, 20867.18it/s]
100%|██████████| 1/1 [00:00<00:00, 32263.88it/s]
100%|██████████| 1/1 [00:00<00:00, 36792.14it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]
100%|██████████| 1/1 [00:00<00:00, 41943.04it/s]
100%|██████████| 1/1 [00:00<00:00, 45590.26it/s]
100%|██████████| 1/1 [00:00<00:00, 47127.01it/s]


run sub starts:  (6, 'emit diode led display signage lighting efficiency longevity')
run sub starts:  {6: 'emit diode led display signage lighting efficiency longevity'}
single sub else:  6
empty 2


100%|██████████| 1/1 [00:00<00:00, 10894.30it/s]
100%|██████████| 1/1 [00:00<00:00, 13934.56it/s]
100%|██████████| 1/1 [00:00<00:00, 18157.16it/s]
100%|██████████| 1/1 [00:00<00:00, 18893.26it/s]
100%|██████████| 1/1 [00:00<00:00, 23172.95it/s]
100%|██████████| 1/1 [00:00<00:00, 21399.51it/s]
100%|██████████| 1/1 [00:00<00:00, 22795.13it/s]


run sub starts:  (7, '')
run sub starts:  {7: ''}
single sub else:  7
empty 2


100%|██████████| 1/1 [00:00<00:00, 27776.85it/s]
100%|██████████| 1/1 [00:00<00:00, 49932.19it/s]
100%|██████████| 1/1 [00:00<00:00, 53092.46it/s]
100%|██████████| 1/1 [00:00<00:00, 43240.25it/s]
100%|██████████| 1/1 [00:00<00:00, 59074.70it/s]
100%|██████████| 1/1 [00:00<00:00, 67650.06it/s]
100%|██████████| 1/1 [00:00<00:00, 71089.90it/s]


run sub starts:  (8, 'versatility lead technology emit diode extend indicator digital display')
run sub starts:  {8: 'versatility lead technology emit diode extend indicator digital display'}
single sub else:  8
empty 2


100%|██████████| 1/1 [00:00<00:00, 12520.31it/s]
100%|██████████| 1/1 [00:00<00:00, 14665.40it/s]
100%|██████████| 1/1 [00:00<00:00, 16320.25it/s]
100%|██████████| 1/1 [00:00<00:00, 18558.87it/s]
100%|██████████| 1/1 [00:00<00:00, 22671.91it/s]
100%|██████████| 1/1 [00:00<00:00, 24966.10it/s]
100%|██████████| 1/1 [00:00<00:00, 27060.03it/s]


run sub starts:  (9, 'innovation lead emit diode technology continue push boundary light impact industry')
run sub starts:  {9: 'innovation lead emit diode technology continue push boundary light impact industry'}
single sub else:  9
empty 2


100%|██████████| 1/1 [00:00<00:00, 11244.78it/s]
100%|██████████| 1/1 [00:00<00:00, 14665.40it/s]
100%|██████████| 1/1 [00:00<00:00, 14122.24it/s]
100%|██████████| 1/1 [00:00<00:00, 15887.52it/s]
100%|██████████| 1/1 [00:00<00:00, 19599.55it/s]
100%|██████████| 1/1 [00:00<00:00, 23172.95it/s]
100%|██████████| 1/1 [00:00<00:00, 25115.59it/s]


run sub starts:  (10, 'deoxyribonucleic acid dna hold genetic blueprint development function reproduction living organism')
run sub starts:  {10: 'deoxyribonucleic acid dna hold genetic blueprint development function reproduction living organism'}
single sub else:  10
empty 2


100%|██████████| 1/1 [00:00<00:00, 18236.10it/s]
100%|██████████| 1/1 [00:00<00:00, 28532.68it/s]
100%|██████████| 1/1 [00:00<00:00, 23696.63it/s]
100%|██████████| 1/1 [00:00<00:00, 26214.40it/s]
100%|██████████| 1/1 [00:00<00:00, 32263.88it/s]
100%|██████████| 1/1 [00:00<00:00, 33026.02it/s]
100%|██████████| 1/1 [00:00<00:00, 36792.14it/s]
100%|██████████| 1/1 [00:00<00:00, 3919.91it/s]
100%|██████████| 1/1 [00:00<00:00, 4583.94it/s]
100%|██████████| 1/1 [00:00<00:00, 4583.94it/s]
100%|██████████| 1/1 [00:00<00:00, 4760.84it/s]
100%|██████████| 1/1 [00:00<00:00, 5159.05it/s]
100%|██████████| 1/1 [00:00<00:00, 3266.59it/s]
100%|██████████| 1/1 [00:00<00:00, 10894.30it/s]
100%|██████████| 1/1 [00:00<00:00, 4485.89it/s]
100%|██████████| 1/1 [00:00<00:00, 17848.10it/s]
100%|██████████| 1/1 [00:00<00:00, 21732.15it/s]
100%|██████████| 1/1 [00:00<00:00, 21845.33it/s]
100%|██████████| 1/1 [00:00<00:00, 26379.27it/s]
100%|██████████| 1/1 [00:00<00:00, 28532.68it/s]
100%|██████████| 1/1 [00:00

run sub starts:  (11, 'advancement dna sequence technology revolutionize understand genetic enable detailed analysis deoxyribonucleic acid specie')
run sub starts:  (12, 'modern acronym serve concise complex phrase span domain industry delve collection abbreviation highlight diverse start geography sovereignty united states commonly abbreviate -gram intricate computing electronic encounter central processing unit cpu essential -gram alongside random access memory ram global positioning gps -gram realm science amplification stimulate emission radiation laser detail -gram explain technology countless diving enthusiast recognize underwater breathing apparatus scuba -gram describe essential diving gear navigation technology encapsulate radio range radar concise -gram digital marketing sphere search engine optimization seo -gram critical enhance online visibility everyday financial transaction involve automated teller machine atm -gram simplify banking operation building block life encode d




In [8]:
# Run AcronymCleaner directly on the raw `documents` (no prior cleaning steps).
# NOTE(review): this rebinds `cleaned_documents`, replacing the output of the
# three-step cleaning pipeline; cells that read `cleaned_documents` will see
# different input if they are re-run after this one.
cleaned_documents = vulture.clean(
    documents,
    # verbose is deliberately forced to False for this run
    steps=[AcronymCleaner(merge_sim_threshold=0.9, verbose=False)],
)

[Vulture]: Cleaning 13 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running AcronymCleaner module
[Parallel(n_jobs=12)]: Using backend MultiprocessingBackend with 12 concurrent workers.


run sub starts:  (0, 'The Central Processing Unit (CPU) is the brain of any computer, tasked with executing commands from software applications.')
run sub starts:  {0: 'The Central Processing Unit (CPU) is the brain of any computer, tasked with executing commands from software applications.'}
single sub else:  0
empty 2


100%|██████████| 1/1 [00:00<00:00, 11915.64it/s]
100%|██████████| 1/1 [00:00<00:00, 27776.85it/s]
100%|██████████| 1/1 [00:00<00:00, 24385.49it/s]
100%|██████████| 1/1 [00:00<00:00, 27060.03it/s]
100%|██████████| 1/1 [00:00<00:00, 32263.88it/s]
100%|██████████| 1/1 [00:00<00:00, 37117.73it/s]
100%|██████████| 1/1 [00:00<00:00, 36792.14it/s]


run sub starts:  (1, 'Every CPU, or Central Processing Unit, has a clock speed, measured in GHz, which influences how fast it can process information.')
run sub starts:  {1: 'Every CPU, or Central Processing Unit, has a clock speed, measured in GHz, which influences how fast it can process information.'}
single sub else:  1
empty 2


100%|██████████| 1/1 [00:00<00:00, 16710.37it/s]
100%|██████████| 1/1 [00:00<00:00, 25731.93it/s]
100%|██████████| 1/1 [00:00<00:00, 30393.51it/s]
100%|██████████| 1/1 [00:00<00:00, 29330.80it/s]
100%|██████████| 1/1 [00:00<00:00, 31068.92it/s]
100%|██████████| 1/1 [00:00<00:00, 35544.95it/s]
100%|██████████| 1/1 [00:00<00:00, 37117.73it/s]


run sub starts:  (2, 'In modern computing, the efficiency of a Central Processing Unit (CPU) , the Central Processing Unit, significantly impacts overall system performance.')
run sub starts:  {2: 'In modern computing, the efficiency of a Central Processing Unit (CPU) , the Central Processing Unit, significantly impacts overall system performance.'}
single sub else:  2
empty 2


100%|██████████| 1/1 [00:00<00:00, 17848.10it/s]
100%|██████████| 1/1 [00:00<00:00, 24385.49it/s]
100%|██████████| 1/1 [00:00<00:00, 25731.93it/s]
100%|██████████| 1/1 [00:00<00:00, 22192.08it/s]
100%|██████████| 1/1 [00:00<00:00, 26379.27it/s]
100%|██████████| 1/1 [00:00<00:00, 28728.11it/s]
100%|██████████| 1/1 [00:00<00:00, 31300.78it/s]


run sub starts:  (3, "Random Access Memory (RAM) provides the necessary workspace for your computer's CPU to process data, affecting speed and multitasking capabilities.")
run sub starts:  {3: "Random Access Memory (RAM) provides the necessary workspace for your computer's CPU to process data, affecting speed and multitasking capabilities."}
single sub else:  3
empty 2


100%|██████████| 1/1 [00:00<00:00, 10512.04it/s]
100%|██████████| 1/1 [00:00<00:00, 25731.93it/s]
100%|██████████| 1/1 [00:00<00:00, 27776.85it/s]
100%|██████████| 1/1 [00:00<00:00, 28532.68it/s]
100%|██████████| 1/1 [00:00<00:00, 30174.85it/s]
100%|██████████| 1/1 [00:00<00:00, 34663.67it/s]
100%|██████████| 1/1 [00:00<00:00, 35544.95it/s]
[Parallel(n_jobs=12)]: Done   4 out of  13 | elapsed:  1.0min remaining:  2.3min


run sub starts:  (4, "Upgrading the RAM, or Random Access Memory, can lead to noticeable improvements in a computer's responsiveness and ability to run multiple applications simultaneously.")
run sub starts:  {4: "Upgrading the RAM, or Random Access Memory, can lead to noticeable improvements in a computer's responsiveness and ability to run multiple applications simultaneously."}
single sub else:  4
empty 2


100%|██████████| 1/1 [00:00<00:00, 15363.75it/s]
100%|██████████| 1/1 [00:00<00:00, 23831.27it/s]
100%|██████████| 1/1 [00:00<00:00, 26214.40it/s]
100%|██████████| 1/1 [00:00<00:00, 24818.37it/s]
100%|██████████| 1/1 [00:00<00:00, 27962.03it/s]
100%|██████████| 1/1 [00:00<00:00, 32263.88it/s]
100%|██████████| 1/1 [00:00<00:00, 29537.35it/s]


run sub starts:  (5, 'The Global Positioning System (GPS) allows devices to determine their exact location on Earth, crucial for navigation and mapping services.')
run sub starts:  {5: 'The Global Positioning System (GPS) allows devices to determine their exact location on Earth, crucial for navigation and mapping services.'}
single sub else:  5
empty 2


100%|██████████| 1/1 [00:00<00:00, 11748.75it/s]
100%|██████████| 1/1 [00:00<00:00, 28728.11it/s]
100%|██████████| 1/1 [00:00<00:00, 26886.56it/s]
100%|██████████| 1/1 [00:00<00:00, 26214.40it/s]
100%|██████████| 1/1 [00:00<00:00, 31300.78it/s]
100%|██████████| 1/1 [00:00<00:00, 34379.54it/s]
100%|██████████| 1/1 [00:00<00:00, 37117.73it/s]


run sub starts:  (6, 'Light Emitting Diodes (LEDs) are widely used in displays, signage, and general lighting due to their energy efficiency and longevity.')
run sub starts:  {6: 'Light Emitting Diodes (LEDs) are widely used in displays, signage, and general lighting due to their energy efficiency and longevity.'}
single sub else:  6
empty 2


100%|██████████| 1/1 [00:00<00:00, 14513.16it/s]
100%|██████████| 1/1 [00:00<00:00, 24385.49it/s]
100%|██████████| 1/1 [00:00<00:00, 30393.51it/s]
100%|██████████| 1/1 [00:00<00:00, 26886.56it/s]
100%|██████████| 1/1 [00:00<00:00, 31300.78it/s]
100%|██████████| 1/1 [00:00<00:00, 37117.73it/s]
100%|██████████| 1/1 [00:00<00:00, 38479.85it/s]


run sub starts:  (7, 'Compared to traditional bulbs, LEDs, or Light Emitting Diodes, consume significantly less electricity, making them a cost-effective lighting solution.')
run sub starts:  {7: 'Compared to traditional bulbs, LEDs, or Light Emitting Diodes, consume significantly less electricity, making them a cost-effective lighting solution.'}
single sub else:  7
empty 2


100%|██████████| 1/1 [00:00<00:00, 16644.06it/s]
100%|██████████| 1/1 [00:00<00:00, 27776.85it/s]
100%|██████████| 1/1 [00:00<00:00, 29330.80it/s]
100%|██████████| 1/1 [00:00<00:00, 30174.85it/s]
100%|██████████| 1/1 [00:00<00:00, 33288.13it/s]
100%|██████████| 1/1 [00:00<00:00, 34100.03it/s]
100%|██████████| 1/1 [00:00<00:00, 38836.15it/s]


run sub starts:  (8, 'The versatility of LED technology, or Light Emitting Diodes, has seen its application extend from simple indicators to full-scale digital displays.')
run sub starts:  {8: 'The versatility of LED technology, or Light Emitting Diodes, has seen its application extend from simple indicators to full-scale digital displays.'}
single sub else:  8
empty 2


100%|██████████| 1/1 [00:00<00:00, 11881.88it/s]
100%|██████████| 1/1 [00:00<00:00, 21732.15it/s]
100%|██████████| 1/1 [00:00<00:00, 25731.93it/s]
100%|██████████| 1/1 [00:00<00:00, 26214.40it/s]
100%|██████████| 1/1 [00:00<00:00, 32263.88it/s]
100%|██████████| 1/1 [00:00<00:00, 33288.13it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]


run sub starts:  (9, 'Innovations in LED, Light Emitting Diode technology, continue to push the boundaries of what is possible with lighting, impacting various industries.')
run sub starts:  {9: 'Innovations in LED, Light Emitting Diode technology, continue to push the boundaries of what is possible with lighting, impacting various industries.'}
single sub else:  9
empty 2


100%|██████████| 1/1 [00:00<00:00, 18315.74it/s]
100%|██████████| 1/1 [00:00<00:00, 27776.85it/s]
100%|██████████| 1/1 [00:00<00:00, 29330.80it/s]
100%|██████████| 1/1 [00:00<00:00, 27060.03it/s]
100%|██████████| 1/1 [00:00<00:00, 33288.13it/s]
100%|██████████| 1/1 [00:00<00:00, 34663.67it/s]
100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]


run sub starts:  (10, 'Deoxyribonucleic Acid (DNA) holds the genetic blueprint for the development, functioning, and reproduction of living organisms.')
run sub starts:  {10: 'Deoxyribonucleic Acid (DNA) holds the genetic blueprint for the development, functioning, and reproduction of living organisms.'}
single sub else:  10
empty 2


100%|██████████| 1/1 [00:00<00:00, 17623.13it/s]
100%|██████████| 1/1 [00:00<00:00, 32017.59it/s]
100%|██████████| 1/1 [00:00<00:00, 31068.92it/s]
100%|██████████| 1/1 [00:00<00:00, 31068.92it/s]
100%|██████████| 1/1 [00:00<00:00, 38479.85it/s]
100%|██████████| 1/1 [00:00<00:00, 39945.75it/s]
100%|██████████| 1/1 [00:00<00:00, 41943.04it/s]
100%|██████████| 1/1 [00:00<00:00, 2898.62it/s]
100%|██████████| 1/1 [00:00<00:00, 2951.66it/s]
100%|██████████| 1/1 [00:00<00:00, 2421.65it/s]
100%|██████████| 1/1 [00:00<00:00, 3379.78it/s]
100%|██████████| 1/1 [00:00<00:00, 3300.00it/s]
100%|██████████| 1/1 [00:00<00:00, 3279.36it/s]
100%|██████████| 1/1 [00:00<00:00, 3175.10it/s]
100%|██████████| 1/1 [00:00<00:00, 10106.76it/s]
100%|██████████| 1/1 [00:00<00:00, 17924.38it/s]
100%|██████████| 1/1 [00:00<00:00, 19599.55it/s]
100%|██████████| 1/1 [00:00<00:00, 18808.54it/s]
100%|██████████| 1/1 [00:00<00:00, 22795.13it/s]
100%|██████████| 1/1 [00:00<00:00, 27060.03it/s]
100%|██████████| 1/1 [00:00

run sub starts:  (11, 'Recent advancements in DNA sequencing technology have revolutionized our understanding of genetics, enabling detailed analysis of the Deoxyribonucleic Acid in various species.')
run sub starts:  (12, 'In the fast-paced modern world, acronyms serve as concise references to complex phrases, \nspanning various domains and industries. Let\'s delve into a collection of these abbreviations, \nhighlighting their diverse applications. Starting with geography and sovereignty, the "United States (US)" is commonly abbreviated, \nrepresenting a 3-gram. In the intricate world of computing and electronics, we encounter the "Central Processing Unit (CPU)," \nan essential 4-gram, alongside the "Random Access Memory (RAM)" and the "Global Positioning System (GPS)," each a 4-gram as well. \nThe realm of science gives us the "Light Amplification by Stimulated Emission of Radiation (LASER)," a detailed 8-gram explaining a technology \nused in countless applications. Diving enthusias




In [9]:
 acronyms

{0: 'central processing unit cpu computer task execute command software',
 1: 'cpu central processing unit clock speed ghz influence information',
 2: 'modern computing efficiency central processing unit cpu central processing unit impact',
 3: '',
 4: 'upgrade ram random access memory noticeable improvement computer responsiveness ability multiple simultaneously',
 5: 'global positioning gps device exact location earth crucial navigation mapping service',
 6: 'emit diode led display signage lighting efficiency longevity',
 7: '',
 8: 'versatility lead technology emit diode extend indicator digital display',
 9: 'innovation lead emit diode technology continue push boundary light impact industry',
 10: 'deoxyribonucleic acid dna hold genetic blueprint development function reproduction living organism',
 11: 'advancement dna sequence technology revolutionize understand genetic enable detailed analysis deoxyribonucleic acid specie',
 12: 'modern acronym serve concise complex phrase span d