# Vulture

## Introduction to Text Operations with Vulture

In [None]:
import os
import pickle
import pathlib
import pandas as pd

from TELF.pre_processing import Vulture
from TELF.pre_processing.Vulture.modules import AcronymDetector
from TELF.pre_processing.Vulture.modules import SimpleCleaner

## 0. Load Dataset

### Input

In [None]:
# Resolve the dataset directory (two levels up, then `data`) to an absolute
# path based on the current working directory.
DATA_DIR = pathlib.Path(os.path.join('..', '..', 'data')).resolve()
DATA_FILE = 'acronyms_documents.p'

# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source. The context manager closes the file handle as soon as the
# documents are loaded instead of leaving it open for the whole session.
with open(os.path.join(DATA_DIR, DATA_FILE), 'rb') as data_handle:
    documents = pickle.load(data_handle)
len(documents)

In [None]:
# Show the loaded documents as the cell output.
documents

In [None]:
# Add a hand-written sample whose text places acronyms next to their spelled-out expansions.
# NOTE(review): "Refedining" in the last sentence looks like a typo for "Redefining" - confirm before editing the sample text.
documents['ID_11'] = 'The latest data from NASA, which stands for National Aeronautics and Space Administration, has significantly enriched our understanding of celestial phenomena. Guidelines from the CDC, i.e., Centers for Disease Control and Prevention, have evolved in response to new insights on epidemiology provided by WHO—World Health Organization. In the realm of technology, MIT—Massachusetts Institute of Technology, has been pioneering developments in AI, which stands for artificial intelligence, propelling forward the capabilities of predictive analytics. Furthermore, technological standards that affect global research and development are rigorously maintained by IEEE, i.e., Institute of Electrical and Electronics Engineers. Refedining NASA -- NASA, which stands for National Aeronautics and Space Administration'


# Second sample: a capitalised phrase followed by a parenthesised acronym, "(AI)".
documents['ID_12'] = 'This is An Awesome Interesting (AI) challenge to acronym subs'

### Output

In [None]:
# Absolute path of the output directory, resolved against the current
# working directory.
RESULTS_DIR = pathlib.Path('results').resolve()
RESULTS_FILE = 'operated_documents'

# Create the directory if it does not exist yet. `exist_ok=True` tolerates an
# existing directory but still raises FileExistsError when `results` is a
# regular file, so a misconfigured output location is reported here instead
# of failing later when results are written.
RESULTS_DIR.mkdir(exist_ok=True)

In [None]:
# Instantiate the acronym operator with its default arguments; the object is shown as the cell output.
AcronymDetector()

In [None]:
# Inspect the DEFAULT_OPERATOR_PIPELINE attribute that Vulture exposes.
Vulture.DEFAULT_OPERATOR_PIPELINE

### Setup Vulture

Create a single-node Vulture object (multi-process capable; this example runs with `n_jobs = 1`)

In [None]:

from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES

In [None]:
vulture = Vulture(n_jobs  = 1, 
                  verbose = 10,  # Disable == 0, Verbose >= 1
                 )
steps = [SimpleCleaner( stop_words = STOP_WORDS,
                  stop_phrases = STOP_PHRASES,
            order = [
                'standardize_hyphens',
                'remove_stop_phrases',
                'isolate_frozen',
                'remove_copyright_statement',
                'make_lower_case',
                'remove_formulas',
                'normalize',
                'remove_next_line',
                'remove_email',
                'remove_()',
                'remove_[]',
                'remove_special_characters',
                'remove_nonASCII_boundary',
                'remove_nonASCII',
                'remove_tags',
                'remove_stop_words',
                'remove_standalone_numbers',
                'remove_extra_whitespace',
                'min_characters',
        ])]
%time vulture.clean(documents, \
                    steps=steps,\
                    save_path=os.path.join(RESULTS_DIR, "clean_documents"))         

clean_documents = pickle.load(open(os.path.join(RESULTS_DIR, "clean_documents"), 'rb'))

In [None]:
# Run the acronym detector (with replace_raw=True) over the cleaned documents
# and save the output under RESULTS_DIR using the RESULTS_FILE name.
vulture.operate(
    clean_documents,
    steps=[AcronymDetector(replace_raw=True)],
    save_path=RESULTS_DIR,
    file_name=RESULTS_FILE,
)

Each entry is a tuple in which index 0 is the name of the operation and index 1 holds the results of that operation as a dictionary.

In [None]:
# List the contents of the results directory in sorted order. Using
# os.listdir instead of shelling out to `ls` works on platforms without a
# POSIX shell and with paths that contain spaces. Dot-entries are skipped,
# as `ls` does by default, so positional indexing into the listing is not
# shifted by hidden files or folders.
saved_file = sorted(
    entry for entry in os.listdir(RESULTS_DIR) if not entry.startswith('.')
)
saved_file

### Look at Cleaned Documents

In [None]:
# Load the second entry of the results-directory listing.
# NOTE(review): the positional index depends on what else is stored in
# RESULTS_DIR - confirm it points at the file written by vulture.operate.
# The context manager closes the file handle as soon as loading finishes.
with open(os.path.join(RESULTS_DIR, saved_file[1]), 'rb') as operated_handle:
    operated_documents = pickle.load(operated_handle)

In [None]:
# Show the loaded operation results as the cell output.
operated_documents

In [None]:
def to_df(documents, operated_documents) -> pd.DataFrame:
    """Combine the raw documents with their acronym-operation results.

    Args:
        documents: Mapping of document id to the original text.
        operated_documents: Mapping of document id to a result dictionary
            that is read through its ``'Acronyms'`` and ``'replaced_text'``
            keys.

    Returns:
        A DataFrame with one row per entry of ``documents`` and the columns
        ``id``, ``text``, ``acronyms`` and ``acronym_replaced_text``. When an
        id has no entry in ``operated_documents`` (or the entry is ``None``),
        the last two columns hold ``None`` for that row instead of the lookup
        raising ``AttributeError``.
    """
    doc_ids = list(documents)
    # Fall back to an empty result so ids without operator output do not fail.
    results = [operated_documents.get(doc_id) or {} for doc_id in doc_ids]

    return pd.DataFrame({
        'id': doc_ids,
        'text': [documents[doc_id] for doc_id in doc_ids],
        'acronyms': [result.get('Acronyms') for result in results],
        'acronym_replaced_text': [result.get('replaced_text') for result in results],
    })

In [None]:
# Build the comparison table from the raw and operated documents, then show it as the cell output.
df = to_df(documents, operated_documents)
df

In [None]:
import textwrap

# Original text of the row at position 10, wrapped to 90 columns.
wrapped_text = textwrap.fill(df['text'].iloc[10], width=90)
print(wrapped_text)

In [None]:
# Acronyms recorded for the row at position 10.
df['acronyms'].iloc[10]

In [None]:

# Acronym-replaced text of the row at position 10, wrapped to 90 columns.
wrapped_text = textwrap.fill(df['acronym_replaced_text'].iloc[10], width=90)
print(wrapped_text)

In [None]:
# Original text of the row at position 9, wrapped to 90 columns.
wrapped_text = textwrap.fill(df['text'].iloc[9], width=90)
print(wrapped_text)

In [None]:
# Acronyms recorded for the row at position 9.
df['acronyms'].iloc[9]

In [None]:
# Acronym-replaced text of the row at position 9, wrapped to 90 columns.
wrapped_text = textwrap.fill(df['acronym_replaced_text'].iloc[9], width=90)
print(wrapped_text)

# Using Vulture Clean substitutions instead of Vulture Operator substitutions

In [None]:
# Fresh Vulture instance (one job, verbose output) for the substitution run.
vulture = Vulture(
    n_jobs=1,
    verbose=10,  # Disable == 0, Verbose >= 1
)

# One SimpleCleaner step built from the default stop words and stop phrases,
# with the cleaning operations named in `order`.
steps = [
    SimpleCleaner(
        stop_words=STOP_WORDS,
        stop_phrases=STOP_PHRASES,
        order=[
            'standardize_hyphens',
            'remove_stop_phrases',
            'isolate_frozen',
            'remove_copyright_statement',
            'make_lower_case',
            'remove_formulas',
            'normalize',
            'remove_next_line',
            'remove_email',
            'remove_()',
            'remove_[]',
            'remove_special_characters',
            'remove_nonASCII_boundary',
            'remove_nonASCII',
            'remove_tags',
            'remove_stop_words',
            'remove_standalone_numbers',
            'remove_extra_whitespace',
            'min_characters',
        ],
    )
]

In [None]:
# Build the substitution table: each key of a document's 'Acronyms' mapping
# (treated here as the source phrase) and its value (the acronym) are both
# mapped to the source phrase with whitespace runs replaced by underscores.
# Only the result dictionaries are needed, so iterate over the values; this
# also avoids binding a loop variable named `id`, which would shadow the
# builtin for the rest of the notebook session.
substitutions = {}
for acronym_data in operated_documents.values():
    for src_txt, acronym in acronym_data['Acronyms'].items():
        sub_to = '_'.join(src_txt.split())
        substitutions[src_txt] = sub_to
        substitutions[acronym] = sub_to

# Print the resulting table for inspection.
for src, sub in substitutions.items():
    print(f'{src} : {sub}')


In [None]:
# Keyword arguments for Vulture.clean_dataframe. A shallow copy of the
# substitution table is passed so the call receives its own dictionary.
dataframe_clean_args = dict(
    df=df,
    steps=steps,
    substitutions=dict(substitutions),
    columns=['text'],
    append_to_original_df=True,
    concat_cleaned_cols=True,
)

# Rebind df to the result of the cleaning run and print its summary.
df = vulture.clean_dataframe(**dataframe_clean_args)
df.info()

In [None]:
# Show up to the first 60 rows of the cleaned DataFrame.
df.head(60)