In [5]:
import pandas as pd
from TELF.pre_processing.Vulture import Vulture
from TELF.pre_processing.Vulture.modules import SimpleCleaner
from TELF.pre_processing.Vulture.modules import LemmatizeCleaner
from TELF.pre_processing.Vulture.modules import SubstitutionCleaner
from TELF.pre_processing.Vulture.modules import RemoveNonEnglishCleaner
from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
from TELF.pre_processing.Vulture.tokens_analysis.acronyms import find_acronyms

In [None]:
# Sample paragraph for the acronym demo. Each acronym is introduced in the
# form "Long Form (ACRONYM)", and the prose states an n-gram size for each
# pair that counts the long-form words plus the acronym token itself
# (e.g. "United States (US)" is described as a 3-gram).
# NOTE(review): trailing spaces at the ends of lines are part of the string
# literal; keep them if byte-for-byte reproducibility of the output matters.
sample_text = """In the fast-paced modern world, acronyms serve as concise references to complex phrases, 
spanning various domains and industries. Let's delve into a collection of these abbreviations, 
highlighting their diverse applications. Starting with geography and sovereignty, the "United States (US)" is commonly abbreviated, 
representing a 3-gram. In the intricate world of computing and electronics, we encounter the "Central Processing Unit (CPU)," 
an essential 4-gram, alongside the "Random Access Memory (RAM)" and the "Global Positioning System (GPS)," each a 4-gram as well. 
The realm of science gives us the "Light Amplification by Stimulated Emission of Radiation (LASER)," a detailed 8-gram explaining a technology 
used in countless applications. Diving enthusiasts might recognize the "Self-Contained Underwater Breathing Apparatus (SCUBA)," 
a 6-gram describing essential diving gear. Navigation and detection technology are encapsulated in "Radio Detection And Ranging (RADAR)," 
a concise 5-gram. The digital marketing sphere often references "Search Engine Optimization (SEO)," a 4-gram critical for enhancing online visibility. 
Everyday financial transactions might involve an "Automated Teller Machine (ATM)," also a 4-gram, which has simplified banking operations. 
The building block of life is encoded in "Deoxyribonucleic Acid (DNA)," a 3-gram fundamental to genetics. 
Communication technologies have been revolutionized by "Voice over Internet Protocol (VoIP)," a 5-gram facilitating voice communication over the internet.
In the medical field, "Magnetic Resonance Imaging (MRI)" is a 4-gram that provides unparalleled insights into the human body's interior. 
The "Light Emitting Diode (LED)" technology, a 4-gram, is widely adopted for its efficiency in lighting.
The gaming and graphic design industries heavily rely on the "Graphics Processing Unit (GPU)," 
a 4-gram critical for rendering high-quality images. Lastly, the development of dynamic web applications is often 
powered by "Asynchronous JavaScript and XML (AJAX)," a 5-gram that has significantly enhanced web interactivity.
"""

In [7]:
# Demo corpus grouped by acronym. Each acronym / long-form pair appears in a
# different number of documents (CPU 3, RAM 2, GPS 1, LED 4, DNA 2), followed
# by the multi-acronym `sample_text` paragraph as the final "mixed" document.
docs_by_acronym = {
    "CPU": [
        "The Central Processing Unit (CPU) is the brain of any computer, tasked with executing commands from software applications.",
        "Every CPU, or Central Processing Unit, has a clock speed, measured in GHz, which influences how fast it can process information.",
        "In modern computing, the efficiency of a Central Processing Unit (CPU) , the Central Processing Unit, significantly impacts overall system performance.",
    ],
    "RAM": [
        "Random Access Memory (RAM) provides the necessary workspace for your computer's CPU to process data, affecting speed and multitasking capabilities.",
        "Upgrading the RAM, or Random Access Memory, can lead to noticeable improvements in a computer's responsiveness and ability to run multiple applications simultaneously.",
    ],
    "GPS": [
        "The Global Positioning System (GPS) allows devices to determine their exact location on Earth, crucial for navigation and mapping services.",
    ],
    "LED": [
        "Light Emitting Diodes (LEDs) are widely used in displays, signage, and general lighting due to their energy efficiency and longevity.",
        "Compared to traditional bulbs, LEDs, or Light Emitting Diodes, consume significantly less electricity, making them a cost-effective lighting solution.",
        "The versatility of LED technology, or Light Emitting Diodes, has seen its application extend from simple indicators to full-scale digital displays.",
        "Innovations in LED, Light Emitting Diode technology, continue to push the boundaries of what is possible with lighting, impacting various industries.",
    ],
    "DNA": [
        "Deoxyribonucleic Acid (DNA) holds the genetic blueprint for the development, functioning, and reproduction of living organisms.",
        "Recent advancements in DNA sequencing technology have revolutionized our understanding of genetics, enabling detailed analysis of the Deoxyribonucleic Acid in various species.",
    ],
}

# Flatten the groups in insertion order, then append the mixed paragraph last.
text_list_varying = [
    document
    for group in docs_by_acronym.values()
    for document in group
]
text_list_varying.append(sample_text)

# One document per row in a single 'Text' column.
df = pd.DataFrame({"Text": text_list_varying})

In [8]:
# Run Vulture's default cleaning pipeline over the 'Text' column.
# n_jobs=-1 presumably means "use all available workers" -- confirm against
# the Vulture documentation.
vulture = Vulture(n_jobs=-1, verbose=True)

# Keyword arguments for clean_dataframe, kept in a mapping so they can be
# inspected or tweaked before the call.
dataframe_clean_args = dict(
    df=df,
    steps=vulture.DEFAULT_PIPELINE,
    substitutions=None,
    columns=["Text"],
    append_to_original_df=True,
    concat_cleaned_cols=True,
)

# NOTE(review): `df` is rebound to the cleaned frame here, so this cell is not
# guaranteed to be idempotent when re-run without re-running the cell that
# builds the raw frame.
df = vulture.clean_dataframe(**dataframe_clean_args)
df.info()

[Vulture]: Cleaning 13 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running SimpleCleaner module
[Parallel(n_jobs=12)]: Using backend MultiprocessingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   4 out of  13 | elapsed:   24.0s remaining:   54.0s
[Parallel(n_jobs=12)]: Done  13 out of  13 | elapsed:  1.0min finished
100%|██████████| 1/1 [01:02<00:00, 62.77s/it]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Text        13 non-null     object
 1   clean_Text  13 non-null     object
dtypes: object(2)
memory usage: 340.0+ bytes





In [9]:
# Search the cleaned text for acronym definitions. The call is the last
# expression in the cell, so whatever it returns is rendered as the output.
# NOTE(review): the exact meaning of grams_to_n, weight, sort_wordcount and
# save_grams is defined by TELF's find_acronyms -- confirm in its docs.
find_acronyms(
    df,
    acronyms_savepath=".",      # path is relative to the current working directory
    exclude_terms=None,
    grams_to_n=7,
    column="clean_Text",        # column name set by this notebook's cleaning step
    custom_acronyms=None,
    weight=2,
    sort_wordcount=False,
    save_grams=True,
)

100%|██████████| 13/13 [00:00<00:00, 36109.90it/s]
100%|██████████| 13/13 [00:00<00:00, 20724.42it/s]
100%|██████████| 13/13 [00:00<00:00, 39397.36it/s]
100%|██████████| 13/13 [00:00<00:00, 38780.90it/s]
100%|██████████| 13/13 [00:00<00:00, 56445.08it/s]
100%|██████████| 13/13 [00:00<00:00, 62458.14it/s]
100%|██████████| 13/13 [00:00<00:00, 37346.54it/s]


Unnamed: 0,word,subs,tf,df,weight
0,central processing unit cpu,central_processing_unit,2.0,2.0,2
1,cpu,central_processing_unit,2.0,2.0,2
2,central processing unit,central_processing_unit,2.0,2.0,2
3,random access memory ram,random_access_memory,2.0,2.0,2
4,ram,random_access_memory,2.0,2.0,2
5,random access memory,random_access_memory,2.0,2.0,2
6,search engine optimization seo,search_engine_optimization,1.0,1.0,2
7,seo,search_engine_optimization,1.0,1.0,2
8,search engine optimization,search_engine_optimization,1.0,1.0,2
9,automated teller machine atm,automated_teller_machine,1.0,1.0,2
