# Vulture

## Introduction to Text Pre-Processing with Vulture

In [1]:
from TELF.pre_processing.Vulture.modules import VultureModuleBase

# Instantiate the base class that Vulture modules derive from and print its
# string representation.
vmb = VultureModuleBase()
print(vmb)

VultureModuleBase()


In [2]:
import os

In [3]:
import os
import pickle
import pathlib
import pandas as pd

from TELF.pre_processing import Vulture

## 0. Load Dataset

### Input

In [4]:
# Absolute path of the shared data folder, two levels above the working directory
DATA_DIR = pathlib.Path('..', '..', 'data').resolve()

In [5]:
# Name of the pickled input file located inside DATA_DIR
DATA_FILE = 'documents.p'

In [6]:
# Load the pickled corpus. The context manager guarantees the file handle is
# closed even if unpickling fails (the bare open(...) call leaked it).
# NOTE: pickle can execute arbitrary code while loading -- only open files
# that come from a trusted source.
with open(os.path.join(DATA_DIR, DATA_FILE), 'rb') as fh:
    documents = pickle.load(fh)
len(documents)

9

### Output

In [7]:
# Absolute path of the output folder, created next to this notebook
RESULTS_DIR = pathlib.Path('results').resolve()

In [8]:
# File name under RESULTS_DIR where the cleaned documents are pickled
RESULTS_FILE = 'clean_documents.p'

In [9]:
# Create the output directory (and any missing parents) if it is not there yet.
# exist_ok=True makes re-running this cell a no-op when the directory exists,
# but still raises FileExistsError when the path is occupied by a regular file,
# so a bad RESULTS_DIR is reported here instead of failing later at save time.
os.makedirs(RESULTS_DIR, exist_ok=True)

### Examine Data Format

In [10]:
# Keys serve as unique document ids
list(documents)

['ad68055e-677f-11ee-95d4-4ab2673ea3f0',
 'ad680626-677f-11ee-95d4-4ab2673ea3f0',
 'ad680658-677f-11ee-95d4-4ab2673ea3f0',
 'ad680680-677f-11ee-95d4-4ab2673ea3f0',
 'ad6806a8-677f-11ee-95d4-4ab2673ea3f0',
 'ad6806d0-677f-11ee-95d4-4ab2673ea3f0',
 'ad6806f8-677f-11ee-95d4-4ab2673ea3f0',
 'ad680716-677f-11ee-95d4-4ab2673ea3f0',
 'ad68073e-677f-11ee-95d4-4ab2673ea3f0']

In [11]:
# Values are the raw texts that need to be cleaned; show the first one
next(iter(documents.values()))

'Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected 

## 1. Clean

The Vulture library is composed of multiple cleaning modules that work together to process the text. These modules are flexible and their order can be re-arranged depending on the user's preferences. By default Vulture implements a simple cleaning pipeline so that new users can quickly get started. In this section we will examine the Vulture default pipeline and apply the default cleaning to the sample text.

The pipeline is just a list of Vulture modules that are run sequentially. The default pipeline contains a single module - the ```SimpleCleaner```.

In [12]:
# Display the default pipeline: a list of configured cleaning module instances
Vulture.DEFAULT_PIPELINE

[SimpleCleaner(effective_stop_words=['characteristics', 'acknowledgment', 'characteristic', 'predominantly', 'investigation', 'automatically', 'corresponding', 'approximately', 'significantly', 'substantially', 'unfortunately', 'demonstrated', 'applications', 'demonstrates', 'consequently', 'successfully', 'sufficiently', 'particularly', 'nevertheless', 'introduction', 'specifically', 'respectively', 'representing', 'demonstrate', 'investigate', ... (+1359 more)], patterns={'standardize_hyphens': (re.compile('[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\u2212\\u2E3A\\u2E3B]'), '-'), 'remove_copyright_statement': None, 'remove_stop_phrases': None, 'make_lower_case': None, 'normalize': None, 'remove_trailing_dash': ('(?<!\\w)-|-(?!\\w)', ''), 'make_hyphens_words': ('([a-z])\\-([a-z])', ''), 'remove_next_line': ('\\n+', ' '), 'remove_email': ('\\S*@\\S*\\s?', ''), 'remove_formulas': ('\\b\\w*[\\=\\≈\\/\\\\\\±]\\w*\\b', ''), 'remove_dash': ('-', ''), 'remove_between_[]': ('\\[.*?\\]

The most important attribute of the ```SimpleCleaner``` is ```order```. This attribute specifies which cleaning operations will be applied as well as the order that they will be applied in. These cleaning operations are all pre-defined and the full list of supported operations can be examined with ```SimpleCleaner.DEFAULT_PATTERNS```

In [13]:
# Names of the cleaning operations of the first pipeline module, in the order
# they are applied
Vulture.DEFAULT_PIPELINE[0].order

['standardize_hyphens',
 'isolate_frozen',
 'remove_copyright_statement',
 'remove_stop_phrases',
 'make_lower_case',
 'remove_formulas',
 'normalize',
 'remove_next_line',
 'remove_email',
 'remove_()',
 'remove_[]',
 'remove_special_characters',
 'remove_nonASCII_boundary',
 'remove_nonASCII',
 'remove_tags',
 'remove_stop_words',
 'remove_standalone_numbers',
 'remove_extra_whitespace',
 'min_characters']

### Setup Vulture

Create a single-node single-process Vulture object

In [14]:
# One worker process, with progress/log output switched on
vulture = Vulture(
    n_jobs=1,
    verbose=10,  # Disable == 0, Verbose >= 1
)

### Apply Default Clean

In [15]:
# Time the default cleaning run; the cleaned documents are written to save_path
# and read back from that file in the next section.
%time vulture.clean(documents, \
                    save_path=os.path.join(RESULTS_DIR, RESULTS_FILE))                   

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/1 [00:00<?, ?it/s][Vulture]: Running SimpleCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 128.07it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.06it/s]

CPU times: user 82.9 ms, sys: 33.7 ms, total: 117 ms
Wall time: 115 ms





### Look at Cleaned Documents

In [16]:
# Read back the cleaned documents that vulture.clean() saved above. The context
# manager closes the file handle deterministically (the bare open(...) leaked it).
# NOTE: only unpickle files you produced yourself or otherwise trust.
with open(os.path.join(RESULTS_DIR, RESULTS_FILE), 'rb') as fh:
    clean_documents = pickle.load(fh)

In [17]:
def to_df(documents, clean_documents):
    """Build a side-by-side table of raw and cleaned text.

    Args:
        documents: dict mapping document id -> raw text.
        clean_documents: dict mapping document id -> cleaned text. Ids that
            are absent from this mapping are looked up with ``dict.get`` and
            therefore end up as a missing value.

    Returns:
        pandas.DataFrame with the columns ``id``, ``text`` and ``clean_text``,
        one row per entry of ``documents``, in its iteration order.
    """
    doc_ids = list(documents)
    columns = {
        'id': doc_ids,
        'text': [documents[doc_id] for doc_id in doc_ids],
        'clean_text': [clean_documents.get(doc_id) for doc_id in doc_ids],
    }
    return pd.DataFrame(columns)

In [18]:
# Raw vs. cleaned text for the default pipeline, one row per document
df = to_df(documents, clean_documents)
df

Unnamed: 0,id,text,clean_text
0,ad68055e-677f-11ee-95d4-4ab2673ea3f0,Supervisory Control and Data Acquisition (SCAD...,supervisory control acquisition scada serve ne...
1,ad680626-677f-11ee-95d4-4ab2673ea3f0,Highly specific datasets of scientific literat...,highly specific scientific literature educatio...
2,ad680658-677f-11ee-95d4-4ab2673ea3f0,We propose an efficient distributed out-of-mem...,efficient distributed implementation matrix fa...
3,ad680680-677f-11ee-95d4-4ab2673ea3f0,Identification of the family to which a malwar...,identification family malware specimen belongs...
4,ad6806a8-677f-11ee-95d4-4ab2673ea3f0,Malware is one of the most dangerous and costl...,malware dangerous costly cyber threats nationa...
5,ad6806d0-677f-11ee-95d4-4ab2673ea3f0,Malware is one of the most dangerous and costl...,malware dangerous costly cyber threats organiz...
6,ad6806f8-677f-11ee-95d4-4ab2673ea3f0,Topic modeling is one of the key analytic tech...,topic modeling key analytic techniques organiz...
7,ad680716-677f-11ee-95d4-4ab2673ea3f0,Non-negative matrix factorization (NMF) with m...,matrix factorization nmf completion collaborat...
8,ad68073e-677f-11ee-95d4-4ab2673ea3f0,"We propose an efficient, distributed, out-of-m...",efficient distributed implementation truncated...


#### Look at First Document Full-Text

In [19]:
# Position of the document to inspect in full (also reused further below)
index = 0
(df['text'].iloc[index], '', df['clean_text'].iloc[index])

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected

### Add Some Substitutions

In [20]:
# Phrase -> replacement token. Each key is a phrase to look for and each value
# is the token that should appear in the cleaned text instead.
# NOTE(review): keys are written in lower case -- presumably matching is
# case-insensitive or happens on lower-cased text; confirm against the
# SubstitutionCleaner documentation.
substitutions = {
    'los alamos national laboratory': 'LANL',
    'lanl': 'LANL',
    'los alamos': 'Los_Alamos',
    'supervisory control and data acquisition': 'SCADA',
    'scada': 'SCADA',
}

In [21]:
# Clean again, this time with the substitutions applied; keep the result in memory
clean_documents_sub = vulture.clean(
    documents,
    save_path=None,  # if None, dict is returned from function
    substitutions=substitutions,
)

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/3 [00:00<?, ?it/s][Vulture]: Running SubstitutionCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 670.09it/s][A
[Vulture]: Running SimpleCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 157.37it/s][A
[Vulture]: Running SubstitutionCleaner module

100%|███████████████████████████████████████████| 9/9 [00:00<00:00, 1328.43it/s][A
100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 34.24it/s]


In [22]:
# Raw vs. cleaned text for the run that used the substitutions
df_sub = to_df(documents, clean_documents_sub)
df_sub

Unnamed: 0,id,text,clean_text
0,ad68055e-677f-11ee-95d4-4ab2673ea3f0,Supervisory Control and Data Acquisition (SCAD...,SCADA SCADA serve nervous substations power gr...
1,ad680626-677f-11ee-95d4-4ab2673ea3f0,Highly specific datasets of scientific literat...,highly specific scientific literature educatio...
2,ad680658-677f-11ee-95d4-4ab2673ea3f0,We propose an efficient distributed out-of-mem...,efficient distributed implementation matrix fa...
3,ad680680-677f-11ee-95d4-4ab2673ea3f0,Identification of the family to which a malwar...,identification family malware specimen belongs...
4,ad6806a8-677f-11ee-95d4-4ab2673ea3f0,Malware is one of the most dangerous and costl...,malware dangerous costly cyber threats nationa...
5,ad6806d0-677f-11ee-95d4-4ab2673ea3f0,Malware is one of the most dangerous and costl...,malware dangerous costly cyber threats organiz...
6,ad6806f8-677f-11ee-95d4-4ab2673ea3f0,Topic modeling is one of the key analytic tech...,topic modeling key analytic techniques organiz...
7,ad680716-677f-11ee-95d4-4ab2673ea3f0,Non-negative matrix factorization (NMF) with m...,matrix factorization nmf completion collaborat...
8,ad68073e-677f-11ee-95d4-4ab2673ea3f0,"We propose an efficient, distributed, out-of-m...",efficient distributed implementation truncated...


In [23]:
# Same document as before (position `index`), now cleaned with substitutions
(df_sub['text'].iloc[index], '', df_sub['clean_text'].iloc[index])

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected

## 2. Look at Top Words

In [24]:
from TELF.pre_processing.Vulture.tokens_analysis.top_words import get_top_words

### Before Substitution

In [25]:
# 100 most frequent single words in the corpus cleaned WITHOUT substitutions
top_words_df = get_top_words(
    clean_documents,
    top_n=100,
    n_gram=1,
    verbose=True,
    filename=None,
)
top_words_df

100%|███████████████████████████████████████████| 9/9 [00:00<00:00, 6854.68it/s]


Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,malware,41,3,0.333333,0.077947
1,matrix,21,6,0.666667,0.039924
2,scada,9,1,0.111111,0.017110
3,family,9,3,0.333333,0.017110
4,topics,8,2,0.222222,0.015209
...,...,...,...,...,...
95,events,2,1,0.111111,0.003802
96,framework,2,2,0.222222,0.003802
97,accuracy,2,2,0.222222,0.003802
98,electrical,2,1,0.111111,0.003802


In [26]:
# 10 most frequent bigrams in the corpus cleaned WITHOUT substitutions
top_2grams_df = get_top_words(
    clean_documents,
    top_n=10,
    n_gram=2,
    verbose=True,
    filename=None,
)
top_2grams_df

100%|███████████████████████████████████████████| 9/9 [00:00<00:00, 6578.73it/s]


Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,malware families,7,3,0.333333,0.008083
1,malware family,6,3,0.333333,0.006928
2,matrix factorization,5,5,0.555556,0.005774
3,factorization nmf,4,4,0.444444,0.004619
4,machine learning,4,3,0.333333,0.004619
5,topic modeling,4,2,0.222222,0.004619
6,dense matrix,4,2,0.222222,0.004619
7,tensor decomposition,3,1,0.111111,0.003464
8,sparse matrix,3,2,0.222222,0.003464
9,memory complexity,3,2,0.222222,0.003464


### After Substitution

In [27]:
# 100 most frequent single words in the corpus cleaned WITH substitutions.
# NOTE: this rebinds `top_words_df` from the "Before Substitution" section.
top_words_df = get_top_words(
    clean_documents_sub,
    top_n=100,
    n_gram=1,
    verbose=True,
    filename=None,
)
top_words_df

100%|██████████████████████████████████████████| 9/9 [00:00<00:00, 10010.27it/s]


Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,malware,41,3,0.333333,0.078244
1,matrix,21,6,0.666667,0.040076
2,SCADA,10,1,0.111111,0.019084
3,family,9,3,0.333333,0.017176
4,topics,8,2,0.222222,0.015267
...,...,...,...,...,...
95,electrical,2,1,0.111111,0.003817
96,LANL,2,1,0.111111,0.003817
97,specific,2,1,0.111111,0.003817
98,scientific,2,1,0.111111,0.003817


In [28]:
# 10 most frequent bigrams in the corpus cleaned WITH substitutions.
# NOTE: this rebinds `top_2grams_df` from the "Before Substitution" section.
top_2grams_df = get_top_words(
    clean_documents_sub,
    top_n=10,
    n_gram=2,
    verbose=True,
    filename=None,
)
top_2grams_df

100%|███████████████████████████████████████████| 9/9 [00:00<00:00, 6912.42it/s]


Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,malware families,7,3,0.333333,0.00813
1,malware family,6,3,0.333333,0.006969
2,matrix factorization,5,5,0.555556,0.005807
3,factorization nmf,4,4,0.444444,0.004646
4,machine learning,4,3,0.333333,0.004646
5,topic modeling,4,2,0.222222,0.004646
6,dense matrix,4,2,0.222222,0.004646
7,tensor decomposition,3,1,0.111111,0.003484
8,sparse matrix,3,2,0.222222,0.003484
9,memory complexity,3,2,0.222222,0.003484
