# Vulture

## Introduction to Text Operations with Vulture

In [1]:
import os
import pickle
import pathlib
import pandas as pd

from TELF.pre_processing import Vulture
from TELF.pre_processing.Vulture.modules import AcronymDetector
from TELF.pre_processing.Vulture.modules import SimpleCleaner

## 0. Load Dataset

### Input

In [2]:
# Resolve the data directory relative to the notebook's working directory.
DATA_DIR = pathlib.Path('..', '..', 'data').resolve()
DATA_FILE = 'acronyms_documents.p'

# NOTE(security): pickle can execute arbitrary code while loading; only open
# files from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(DATA_DIR / DATA_FILE, 'rb') as fh:
    documents = pickle.load(fh)
len(documents)

11

In [3]:
# Display the loaded mapping of document id -> raw text.
documents

{'ID_1': "In our discussion about Global Warming (GW), we should also consider Sustainable Energy Sources (SES). While discussing SES, it's vital to mention Carbon Footprint Reduction (CFR) and its impacts on Environmental Sustainability (ES). Let's not forget the role of Innovative Technology Solutions (ITS).",
 'ID_2': "The project's success hinges on Effective Team Collaboration (ETC), robust Project Management Frameworks (PMF), and the use of Advanced Analytics Tools (AAT). Furthermore, Continuous Improvement Processes (CIP) and Strategic Planning Initiatives (SPI) are key.",
 'ID_3': "Digital Transformation (DT) involves more than just technology. It's about Customer Engagement Strategies (CES) and optimizing the User Experience (UX). Moreover, Data Privacy Regulations (DPR) and Cybersecurity Measures (CM) are foundational.",
 'ID_4': 'Financial institutions are focusing on Risk Management Practices (RMP) and Investment Diversification Strategies (IDS). The importance of Regulator

In [4]:
# ID_11: acronyms are introduced with "which stands for", "i.e.," and em-dash
# patterns instead of parentheses, and NASA is defined twice.
documents['ID_11'] = 'The latest data from NASA, which stands for National Aeronautics and Space Administration, has significantly enriched our understanding of celestial phenomena. Guidelines from the CDC, i.e., Centers for Disease Control and Prevention, have evolved in response to new insights on epidemiology provided by WHO—World Health Organization. In the realm of technology, MIT—Massachusetts Institute of Technology, has been pioneering developments in AI, which stands for artificial intelligence, propelling forward the capabilities of predictive analytics. Furthermore, technological standards that affect global research and development are rigorously maintained by IEEE, i.e., Institute of Electrical and Electronics Engineers. Refedining NASA -- NASA, which stands for National Aeronautics and Space Administration'


# ID_12: expands "(AI)" differently from ID_11 ("An Awesome Interesting" vs.
# "artificial intelligence"), giving the same acronym two competing definitions.
documents['ID_12'] = 'This is An Awesome Interesting (AI) challenge to acronym subs'

### Output

In [5]:
# Directory (absolute path) and base file name used for everything saved below.
RESULTS_DIR = pathlib.Path('results').resolve()
RESULTS_FILE = 'operated_documents'

# exist_ok=True keeps the cell re-runnable when the directory already exists,
# while still raising if 'results' exists but is not a directory.
RESULTS_DIR.mkdir(exist_ok=True)

In [6]:
# Instantiate with no arguments so the repr shows the default configuration.
AcronymDetector()

AcronymDetector(module_type='OPERATOR', gram_range=[2, 3, 4, 5, 6, 7], current_document_id=None, replace_raw=False, join_with='_')

In [7]:
# Inspect the class-level default operator pipeline exposed by Vulture.
Vulture.DEFAULT_OPERATOR_PIPELINE

[NEDetector(module_type='OPERATOR', backend=None)]

### Setup Vulture

Create a single-node multi-process Vulture object

In [8]:

from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES

In [9]:
vulture = Vulture(n_jobs  = 1, 
                  verbose = 10,  # Disable == 0, Verbose >= 1
                 )
steps = [SimpleCleaner( stop_words = STOP_WORDS,
                  stop_phrases = STOP_PHRASES,
            order = [
                'standardize_hyphens',
                'remove_stop_phrases',
                'isolate_frozen',
                'remove_copyright_statement',
                'make_lower_case',
                'remove_formulas',
                'normalize',
                'remove_next_line',
                'remove_email',
                'remove_()',
                'remove_[]',
                'remove_special_characters',
                'remove_nonASCII_boundary',
                'remove_nonASCII',
                'remove_tags',
                'remove_stop_words',
                'remove_standalone_numbers',
                'remove_extra_whitespace',
                'min_characters',
        ])]
%time vulture.clean(documents, \
                    steps=steps,\
                    save_path=os.path.join(RESULTS_DIR, "clean_documents"))         

clean_documents = pickle.load(open(os.path.join(RESULTS_DIR, "clean_documents"), 'rb'))

[Vulture]: Cleaning 12 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running SimpleCleaner module
100%|██████████| 12/12 [00:00<00:00, 2372.57it/s]
100%|██████████| 1/1 [00:00<00:00, 135.72it/s]

CPU times: user 10.9 ms, sys: 1.12 ms, total: 12 ms
Wall time: 10.1 ms





In [10]:
# Run acronym detection over the cleaned documents and save the results under
# RESULTS_DIR. replace_raw=True overrides the default (False); presumably this
# is what produces the 'replaced_text' entry in the results -- confirm in docs.
acronym_steps = [AcronymDetector(replace_raw=True)]
vulture.operate(
    clean_documents,
    steps=acronym_steps,
    save_path=RESULTS_DIR,
    file_name=RESULTS_FILE,
)

[Vulture]: Cleaning 12 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running AcronymDetector module
100%|██████████| 12/12 [00:00<00:00, 668.65it/s]
100%|██████████| 1/1 [00:00<00:00, 50.85it/s]


Each entry is a tuple: index 0 is the name of the operation, and index 1 holds the results of that operation as a dictionary.

In [11]:
# List the saved result files in pure Python (no dependency on a Unix shell);
# sorting keeps the order deterministic.
saved_file = sorted(os.listdir(RESULTS_DIR))
saved_file

['clean_documents', 'operated_documents_AcronymDetector.p']

### Look at Operated Documents

In [12]:
# Build the file name explicitly instead of indexing into a directory listing,
# whose order changes as soon as another file lands in RESULTS_DIR. The saved
# file is named '<RESULTS_FILE>_AcronymDetector.p'.
operated_path = os.path.join(RESULTS_DIR, f'{RESULTS_FILE}_AcronymDetector.p')
with open(operated_path, 'rb') as fh:
    operated_documents = pickle.load(fh)

In [13]:
# Display the per-document results: each id maps to a dict holding the detected
# 'Acronyms' and the 'replaced_text'.
operated_documents

{'ID_1': {'Acronyms': {'global warming': 'gw',
   'carbon footprint reduction': 'cfr'},
  'replaced_text': 'discussion  global_warming sustainable sources ses discussing ses vital mention  carbon_footprint_reduction impacts environmental sustainability forget role innovative technology solutions'},
 'ID_2': {'Acronyms': {'project management frameworks': 'pmf',
   'advanced analytics tools': 'aat',
   'continuous improvement processes': 'cip',
   'strategic planning initiatives': 'spi'},
  'replaced_text': 'projects success hinges team collaboration robust  project_management_frameworks  advanced_analytics_tools  continuous_improvement_processes  strategic_planning_initiatives key'},
 'ID_3': {'Acronyms': {'customer engagement strategies': 'ces'},
  'replaced_text': 'digital transformation involves technology  customer_engagement_strategies optimizing user experience ux privacy regulations dpr cybersecurity measures foundational'},
 'ID_4': {'Acronyms': {'risk management practices': 'rm

In [14]:
def to_df(documents, operated_documents):
    """Combine raw documents and their acronym results into one DataFrame.

    Parameters
    ----------
    documents : dict
        Mapping of document id -> raw text.
    operated_documents : dict
        Mapping of document id -> result dict with the keys 'Acronyms' and
        'replaced_text'.

    Returns
    -------
    pandas.DataFrame
        One row per entry in ``documents`` (in iteration order) with the columns
        'id', 'text', 'acronyms' and 'acronym_replaced_text'. Documents without
        an entry in ``operated_documents`` get ``None`` in the last two columns
        instead of raising.
    """
    ids = list(documents)
    # Fall back to an empty dict so a missing id yields None values rather than
    # an AttributeError from calling .get on None.
    results = [operated_documents.get(doc_id) or {} for doc_id in ids]

    return pd.DataFrame({
        'id': ids,
        'text': [documents[doc_id] for doc_id in ids],
        'acronyms': [result.get('Acronyms') for result in results],
        'acronym_replaced_text': [result.get('replaced_text') for result in results],
    })

In [15]:
# Combine the raw documents with their acronym-detection results into one table.
df = to_df(documents, operated_documents)
df

Unnamed: 0,id,text,acronyms,acronym_replaced_text
0,ID_1,"In our discussion about Global Warming (GW), w...","{'global warming': 'gw', 'carbon footprint red...",discussion global_warming sustainable sources...
1,ID_2,The project's success hinges on Effective Team...,"{'project management frameworks': 'pmf', 'adva...",projects success hinges team collaboration rob...
2,ID_3,Digital Transformation (DT) involves more than...,{'customer engagement strategies': 'ces'},digital transformation involves technology cu...
3,ID_4,Financial institutions are focusing on Risk Ma...,"{'risk management practices': 'rmp', 'investme...",financial institutions focusing risk_manageme...
4,ID_5,"In healthcare, Patient Care Coordination (PCC)...","{'patient care coordination': 'pcc', 'electron...",healthcare patient_care_coordination electro...
5,ID_6,Educational Technology (ET) is reshaping learn...,"{'interactive learning platforms': 'ilp', 'ada...",educational technology reshaping learning int...
6,ID_7,The entertainment industry values Creative Con...,"{'digital distribution platforms': 'ddp', 'aud...",entertainment industry creative content ccp d...
7,ID_8,Sustainable Agriculture Practices (SAP) and Pr...,"{'sustainable agriculture practices': 'sap', '...",sustainable_agriculture_practices precision_...
8,ID_9,Urban Planning (UP) incorporates Green Infrast...,"{'zoning regulations': 'zr', 'public transport...",urban planning incorporates green infrastructu...
9,ID_10,"To Be Determined (TBD), National Aeronautics a...","{'central processing unit': 'cpu', 'frequently...",determined tbd national_aeronautics_space_adm...


In [16]:
import textwrap

# Print the raw text of the document at position 10, wrapped to 90 columns.
raw_text = df['text'].iloc[10]
wrapped_text = "\n".join(textwrap.wrap(raw_text, width=90))
print(wrapped_text)

The latest data from NASA, which stands for National Aeronautics and Space Administration,
has significantly enriched our understanding of celestial phenomena. Guidelines from the
CDC, i.e., Centers for Disease Control and Prevention, have evolved in response to new
insights on epidemiology provided by WHO—World Health Organization. In the realm of
technology, MIT—Massachusetts Institute of Technology, has been pioneering developments in
AI, which stands for artificial intelligence, propelling forward the capabilities of
predictive analytics. Furthermore, technological standards that affect global research and
development are rigorously maintained by IEEE, i.e., Institute of Electrical and
Electronics Engineers. Refedining NASA -- NASA, which stands for National Aeronautics and
Space Administration


In [17]:
# Acronyms detected for the document at position 10.
df.iloc[10].acronyms

{'artificial intelligence': 'ai',
 'centers disease control': 'cdc',
 'national aeronautics space administration': 'nasa',
 'institute electrical electronics engineers': 'ieee'}

In [18]:

# Print the acronym-replaced text of the document at position 10, wrapped to 90 columns.
replaced_text = df['acronym_replaced_text'].iloc[10]
wrapped_text = textwrap.fill(replaced_text, width=90)
print(wrapped_text)

latest national_aeronautics_space_administration  enriched understanding celestial
phenomena guidelines centers_disease_control  prevention evolved response insights
epidemiology provided health organization realm technology mit-massachusetts institute
technology pioneering developments artificial_intelligence  propelling forward
capabilities predictive analytics technological standards affect global development
rigorously maintained institute_electrical_electronics_engineers  refedining
national_aeronautics_space_administration national_aeronautics_space_administration


In [19]:
# Print the raw text of the document at position 9, wrapped to 90 columns.
raw_text = df['text'].iloc[9]
wrapped_text = textwrap.fill(raw_text, width=90)
print(wrapped_text)

To Be Determined (TBD), National Aeronautics and Space Administration: NASA, Self-
Contained Underwater Breathing Apparatus- SCUBA, Light Amplification by Stimulated
Emission of Radiation—LASER, RADAR (Radio Detection And Ranging), ASAP: As Soon As
Possible, CPU - Central Processing Unit, DIY—Do It Yourself, Frequently Asked Questions
(i.e., FAQ), GIF (Graphics Interchange Format), HTTP: Hypertext Transfer Protocol, JSON -
JavaScript Object Notation, KPI—Key Performance Indicator, Light Emitting Diode, namely
LED, Magnetic Resonance Imaging, i.e., MRI, OLED, which stands for Organic Light-Emitting
Diode


In [20]:
# Acronyms detected for the document at position 9.
df.iloc[9].acronyms

{'central processing unit': 'cpu',
 'frequently asked questions': 'faq',
 'magnetic resonance imaging': 'mri',
 'national aeronautics space administration': 'nasa'}

In [21]:
# Print the acronym-replaced text of the document at position 9, wrapped to 90 columns.
replaced_text = df['acronym_replaced_text'].iloc[9]
wrapped_text = textwrap.fill(replaced_text, width=90)
print(wrapped_text)

determined tbd  national_aeronautics_space_administration underwater breathing apparatus
scuba amplification stimulated emission radiation-laser radar radio ranging asap
central_processing_unit   frequently_asked_questions gif graphics interchange hypertext
transfer protocol json javascript object notation kpi-key indicator emitting diode led
magnetic_resonance_imaging oled organic diode


# Applying Substitutions with Vulture Clean Instead of the Vulture Operator

In [22]:
# Fresh Vulture instance and cleaning pipeline for the substitution-based run;
# the configuration matches the cleaner used earlier in this notebook.
vulture = Vulture(n_jobs=1, verbose=10)  # verbose: 0 disables, >= 1 enables

clean_order = [
    'standardize_hyphens',
    'remove_stop_phrases',
    'isolate_frozen',
    'remove_copyright_statement',
    'make_lower_case',
    'remove_formulas',
    'normalize',
    'remove_next_line',
    'remove_email',
    'remove_()',
    'remove_[]',
    'remove_special_characters',
    'remove_nonASCII_boundary',
    'remove_nonASCII',
    'remove_tags',
    'remove_stop_words',
    'remove_standalone_numbers',
    'remove_extra_whitespace',
    'min_characters',
]
simple_cleaner = SimpleCleaner(
    stop_words=STOP_WORDS,
    stop_phrases=STOP_PHRASES,
    order=clean_order,
)
steps = [simple_cleaner]

In [23]:
# Map both the expanded phrase and its acronym to a single underscore-joined
# token, e.g. 'global warming' and 'gw' both become 'global_warming'.
# NOTE: keys are shared across all documents, so if two documents expand the
# same acronym differently, the definition seen last overwrites the earlier one.
substitutions = {}
for acronym_data in operated_documents.values():
    for src_txt, acronym in acronym_data['Acronyms'].items():
        sub_to = '_'.join(src_txt.split())
        substitutions[src_txt] = sub_to
        substitutions[acronym] = sub_to

for src, sub in substitutions.items():
    print(f'{src} : {sub}')


global warming : global_warming
gw : global_warming
carbon footprint reduction : carbon_footprint_reduction
cfr : carbon_footprint_reduction
project management frameworks : project_management_frameworks
pmf : project_management_frameworks
advanced analytics tools : advanced_analytics_tools
aat : advanced_analytics_tools
continuous improvement processes : continuous_improvement_processes
cip : continuous_improvement_processes
strategic planning initiatives : strategic_planning_initiatives
spi : strategic_planning_initiatives
customer engagement strategies : customer_engagement_strategies
ces : customer_engagement_strategies
risk management practices : risk_management_practices
rmp : risk_management_practices
investment diversification strategies : investment_diversification_strategies
ids : investment_diversification_strategies
financial technology innovations : financial_technology_innovations
fti : financial_technology_innovations
consumer financial protection : consumer_financial_pro

In [24]:
# Display the full phrase/acronym -> underscore-joined token mapping.
substitutions

{'global warming': 'global_warming',
 'gw': 'global_warming',
 'carbon footprint reduction': 'carbon_footprint_reduction',
 'cfr': 'carbon_footprint_reduction',
 'project management frameworks': 'project_management_frameworks',
 'pmf': 'project_management_frameworks',
 'advanced analytics tools': 'advanced_analytics_tools',
 'aat': 'advanced_analytics_tools',
 'continuous improvement processes': 'continuous_improvement_processes',
 'cip': 'continuous_improvement_processes',
 'strategic planning initiatives': 'strategic_planning_initiatives',
 'spi': 'strategic_planning_initiatives',
 'customer engagement strategies': 'customer_engagement_strategies',
 'ces': 'customer_engagement_strategies',
 'risk management practices': 'risk_management_practices',
 'rmp': 'risk_management_practices',
 'investment diversification strategies': 'investment_diversification_strategies',
 'ids': 'investment_diversification_strategies',
 'financial technology innovations': 'financial_technology_innovations'

In [27]:
# Requires `df`, `steps` and `substitutions` from the cells above. Running this
# cell on a fresh kernel before they exist raises NameError, so execute the
# notebook top to bottom.
dataframe_clean_args = dict(
    df=df,
    steps=steps,
    # Pass a copy (presumably so clean_dataframe cannot mutate the original
    # mapping -- confirm against the Vulture documentation).
    substitutions=dict(substitutions),
    columns=['text'],
    append_to_original_df=True,
    concat_cleaned_cols=True,
)

df = vulture.clean_dataframe(**dataframe_clean_args)
df.info()

NameError: name 'substitutions' is not defined

In [26]:
# Show up to the first 60 rows, now including the `clean_text` column.
df.head(60)

Unnamed: 0,id,text,acronyms,acronym_replaced_text,clean_text
0,ID_1,"In our discussion about Global Warming (GW), w...","{'global warming': 'gw', 'carbon footprint red...",discussion global_warming sustainable sources...,discussion global warming gw sustainable sourc...
1,ID_2,The project's success hinges on Effective Team...,"{'project management frameworks': 'pmf', 'adva...",projects success hinges team collaboration rob...,projects success hinges team collaboration rob...
2,ID_3,Digital Transformation (DT) involves more than...,{'customer engagement strategies': 'ces'},digital transformation involves technology cu...,digital transformation involves technology cus...
3,ID_4,Financial institutions are focusing on Risk Ma...,"{'risk management practices': 'rmp', 'investme...",financial institutions focusing risk_manageme...,financial institutions focusing risk managemen...
4,ID_5,"In healthcare, Patient Care Coordination (PCC)...","{'patient care coordination': 'pcc', 'electron...",healthcare patient_care_coordination electro...,healthcare patient care coordination pcc elect...
5,ID_6,Educational Technology (ET) is reshaping learn...,"{'interactive learning platforms': 'ilp', 'ada...",educational technology reshaping learning int...,educational technology reshaping learning inte...
6,ID_7,The entertainment industry values Creative Con...,"{'digital distribution platforms': 'ddp', 'aud...",entertainment industry creative content ccp d...,entertainment industry creative content ccp di...
7,ID_8,Sustainable Agriculture Practices (SAP) and Pr...,"{'sustainable agriculture practices': 'sap', '...",sustainable_agriculture_practices precision_...,sustainable agriculture practices sap precisio...
8,ID_9,Urban Planning (UP) incorporates Green Infrast...,"{'zoning regulations': 'zr', 'public transport...",urban planning incorporates green infrastructu...,urban planning incorporates green infrastructu...
9,ID_10,"To Be Determined (TBD), National Aeronautics a...","{'central processing unit': 'cpu', 'frequently...",determined tbd national_aeronautics_space_adm...,determined tbd national aeronautics space admi...
