# Vulture

## Introduction to Text Operations with Vulture

In [1]:
import os
import pickle
import pathlib
import pandas as pd

from TELF.pre_processing import Vulture
from TELF.pre_processing.Vulture.modules import AcronymDetector
from TELF.pre_processing.Vulture.modules import SimpleCleaner

## 0. Load Dataset

### Input

In [2]:
# Locate the example dataset two directories above the notebook and make the
# path absolute so later cells do not depend on the working directory.
DATA_DIR = pathlib.Path('..', '..', 'data').resolve()
DATA_FILE = 'acronyms_documents.p'

# NOTE(review): pickle can execute arbitrary code while loading; only open
# files from a trusted source.
# The context manager closes the file handle as soon as the load finishes.
with open(DATA_DIR / DATA_FILE, 'rb') as fh:
    documents = pickle.load(fh)
len(documents)

10

In [3]:
# Display the loaded mapping of document id -> raw text.
documents

{'ID_1': "In our discussion about Global Warming (GW), we should also consider Sustainable Energy Sources (SES). While discussing SES, it's vital to mention Carbon Footprint Reduction (CFR) and its impacts on Environmental Sustainability (ES). Let's not forget the role of Innovative Technology Solutions (ITS).",
 'ID_2': "The project's success hinges on Effective Team Collaboration (ETC), robust Project Management Frameworks (PMF), and the use of Advanced Analytics Tools (AAT). Furthermore, Continuous Improvement Processes (CIP) and Strategic Planning Initiatives (SPI) are key.",
 'ID_3': "Digital Transformation (DT) involves more than just technology. It's about Customer Engagement Strategies (CES) and optimizing the User Experience (UX). Moreover, Data Privacy Regulations (DPR) and Cybersecurity Measures (CM) are foundational.",
 'ID_4': 'Financial institutions are focusing on Risk Management Practices (RMP) and Investment Diversification Strategies (IDS). The importance of Regulator

### Output

In [5]:
# Directory (absolute) where Vulture writes its output, and the file-name
# prefix passed to vulture.operate later in the notebook.
RESULTS_DIR = pathlib.Path('results').resolve()
RESULTS_FILE = 'operated_documents'

# Create the directory if needed. Unlike a swallowed FileExistsError, this
# still raises when 'results' exists but is a regular file, so a bad path
# does not go unnoticed until a later cell fails.
RESULTS_DIR.mkdir(exist_ok=True)

In [11]:
# Instantiate the detector with no arguments so its repr shows the default settings.
AcronymDetector()

AcronymDetector(module_type='OPERATOR', gram_range=[2, 3, 4, 5, 6, 7], current_document_id=None, replace_raw=False)

In [12]:
# Display the DEFAULT_OPERATOR_PIPELINE attribute defined on the Vulture class.
Vulture.DEFAULT_OPERATOR_PIPELINE

[NEDetector(module_type='OPERATOR', backend=None)]

### Setup Vulture

Create a single-node Vulture object. Here `n_jobs = 1`, so the work runs as a single job.

In [13]:
vulture = Vulture(n_jobs  = 1, 
                  verbose = 10,  # Disable == 0, Verbose >= 1
                 )
steps = [SimpleCleaner( 
            order = [
                'standardize_hyphens',
                'isolate_frozen',
                'remove_copyright_statement',
                'remove_stop_phrases',
                'make_lower_case',
                'remove_formulas',
                'normalize',
                'remove_next_line',
                'remove_email',
                'remove_()',
                'remove_[]',
                'remove_special_characters',
                'remove_nonASCII_boundary',
                'remove_nonASCII',
                'remove_tags',
                'remove_stop_words',
                'remove_standalone_numbers',
                'remove_extra_whitespace',
                'min_characters',
        ])]
%time vulture.clean(documents, \
                    steps=steps,\
                    save_path=os.path.join(RESULTS_DIR, "clean_documents"))         

clean_documents = pickle.load(open(os.path.join(RESULTS_DIR, "clean_documents"), 'rb'))

[Vulture]: Cleaning 10 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running SimpleCleaner module
100%|██████████| 10/10 [00:00<00:00, 5407.82it/s]
100%|██████████| 1/1 [00:00<00:00, 262.77it/s]

CPU times: user 1.97 ms, sys: 6.4 ms, total: 8.37 ms
Wall time: 6.85 ms





In [15]:
# Run acronym detection on the cleaned documents, with replace_raw enabled,
# and save the result under RESULTS_DIR using RESULTS_FILE as the file name.
acronym_steps = [AcronymDetector(replace_raw=True)]
vulture.operate(
    clean_documents,
    steps=acronym_steps,
    save_path=RESULTS_DIR,
    file_name=RESULTS_FILE,
)

[Vulture]: Cleaning 10 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running AcronymDetector module
100%|██████████| 10/10 [00:00<00:00, 584.73it/s]
100%|██████████| 1/1 [00:00<00:00, 51.74it/s]


Each entry is a tuple where index 0 is the name of the operation and index 1 is the results of the operation in dictionary format.

In [17]:
# List the result files in pure Python rather than shelling out to `ls`:
# the unquoted `$RESULTS_DIR` breaks when the absolute path contains spaces,
# and `ls` is not available on every platform. Dotfiles are skipped and the
# names are sorted to match what plain `ls` shows.
saved_file = sorted(
    name for name in os.listdir(RESULTS_DIR) if not name.startswith('.')
)
saved_file

['clean_documents',
 'operated_documents_AcronymDetector.p',
 'operated_documents_NEDetector.p']

### Look at Operated Documents

In [18]:
# Select the AcronymDetector output by name, not by its position in the
# listing: other result files (e.g. from earlier runs) can live in the same
# directory and shift the indices.
acronym_file = next(name for name in saved_file if 'AcronymDetector' in name)

# The context manager closes the file handle once the pickle is loaded.
with open(os.path.join(RESULTS_DIR, acronym_file), 'rb') as fh:
    operated_documents = pickle.load(fh)

In [19]:
# Display the loaded results: per document id, an 'Acronyms' mapping and the 'replaced_text'.
operated_documents

{'ID_1': {'Acronyms': {'global warming': 'gw',
   'environmental sustainability': 'es',
   'sustainable energy sources': 'ses',
   'carbon footprint reduction': 'cfr',
   'innovative technology solutions': 'its'},
  'replaced_text': 'in our discussion about global_warming we should also consider sustainable energy sourcenvironmental_sustainability senvironmental_sustainability while discussing senvironmental_sustainability innovative_technology_solutions vital to mention carbon_footprint_reduction and innovative_technology_solutions impacts on environmental_sustainability lets not forget the role of innovative_technology_solutions'},
 'ID_2': {'Acronyms': {'effective team collaboration': 'etc',
   'project management frameworks': 'pmf',
   'advanced analytics tools': 'aat',
   'continuous improvement processes': 'cip',
   'strategic planning initiatives': 'spi'},
  'replaced_text': 'the projects success hinges on effective_team_collaboration robust project_management_frameworks and the

In [21]:
def to_df(documents, operated_documents):
    """Combine raw documents and operator results into one DataFrame.

    Parameters
    ----------
    documents : dict
        Mapping of document id -> raw text. Row order follows this mapping.
    operated_documents : dict
        Mapping of document id -> result dict. The 'Acronyms' and
        'replaced_text' entries of each result dict are read.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns 'id', 'text', 'acronyms' and
        'acronym_replaced_text'. A document with no entry in
        ``operated_documents`` (or with a missing key) gets None in the
        corresponding column instead of raising.
    """
    columns = ['id', 'text', 'acronyms', 'acronym_replaced_text']

    rows = []
    for _id, text in documents.items():
        # Fall back to an empty result so an id without operator output does
        # not fail with AttributeError on the chained .get().
        result = operated_documents.get(_id) or {}
        rows.append({
            'id': _id,
            'text': text,
            'acronyms': result.get('Acronyms'),
            'acronym_replaced_text': result.get('replaced_text'),
        })

    # Passing the columns explicitly keeps the schema for empty input too.
    return pd.DataFrame(rows, columns=columns)

In [22]:
# Build the side-by-side table of raw text, detected acronyms and replaced text, then display it.
df = to_df(documents, operated_documents)
df

Unnamed: 0,id,text,acronyms,acronym_replaced_text
0,ID_1,"In our discussion about Global Warming (GW), w...","{'global warming': 'gw', 'environmental sustai...",in our discussion about global_warming we shou...
1,ID_2,The project's success hinges on Effective Team...,"{'effective team collaboration': 'etc', 'proje...",the projects success hinges on effective_team_...
2,ID_3,Digital Transformation (DT) involves more than...,"{'digital transformation': 'dt', 'cybersecurit...",digital_transformation involves more than jus...
3,ID_4,Financial institutions are focusing on Risk Ma...,"{'regulatory compliance': 'rc', 'risk manageme...",financial institutions are focusing on risk_ma...
4,ID_5,"In healthcare, Patient Care Coordination (PCC)...","{'patient care coordination': 'pcc', 'electron...",in healthcare patient_care_coordination and el...
5,ID_6,Educational Technology (ET) is reshaping learn...,"{'educational technology': 'et', 'interactive ...",educational_technology is reshaping learning ...
6,ID_7,The entertainment industry values Creative Con...,"{'marketing strategies': 'ms', 'creative conte...",the entertainment industry values creative_con...
7,ID_8,Sustainable Agriculture Practices (SAP) and Pr...,"{'sustainable agriculture practices': 'sap', '...",sustainable_agriculture_practices and precisi...
8,ID_9,Urban Planning (UP) incorporates Green Infrast...,"{'urban planning': 'up', 'green infrastructure...",urban_planning incorporates green_infrastruct...
9,various_forms_of _acronyms,"To Be Determined (TBD), National Aeronautics a...","{'as soon': 'as', 'to be determined': 'tbd', '...",to_be_determined national aeronautics and spa...
