# Vulture

## Advanced Use Cases of Vulture Text Pre-Processing 

In [1]:
import os
import pickle
import pathlib
import pandas as pd

from TELF.pre_processing import Vulture

from TELF.pre_processing.Vulture.modules import SimpleCleaner
from TELF.pre_processing.Vulture.modules import LemmatizeCleaner
from TELF.pre_processing.Vulture.modules import SubstitutionCleaner
from TELF.pre_processing.Vulture.modules import RemoveNonEnglishCleaner

from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES

## 0. Load Dataset

### Input

In [2]:
# Absolute path of the `data` directory located two levels above the working directory
DATA_DIR = pathlib.Path('..', '..', 'data').resolve()

In [3]:
DATA_FILE = 'documents.p'

In [4]:
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
# The context manager guarantees the file handle is closed once loading finishes
# (the bare `pickle.load(open(...))` form leaves it open until garbage collection).
with open(os.path.join(DATA_DIR, DATA_FILE), 'rb') as fh:
    documents = pickle.load(fh)
len(documents)

9

### Output

In [5]:
# Absolute path of the `results` directory under the current working directory
RESULTS_DIR = pathlib.Path('results').resolve()

In [6]:
RESULTS_FILE = 'clean_documents.p'

In [7]:
# Create the results directory if it is missing. `exist_ok=True` makes re-running this
# cell a no-op when the directory is already there, while still raising if the path
# exists but is not a directory (the previous try/except silently ignored that case).
os.makedirs(RESULTS_DIR, exist_ok=True)

### Examine Data Format

In [8]:
# keys serve as unique document ids
list(documents.keys())

['ad68055e-677f-11ee-95d4-4ab2673ea3f0',
 'ad680626-677f-11ee-95d4-4ab2673ea3f0',
 'ad680658-677f-11ee-95d4-4ab2673ea3f0',
 'ad680680-677f-11ee-95d4-4ab2673ea3f0',
 'ad6806a8-677f-11ee-95d4-4ab2673ea3f0',
 'ad6806d0-677f-11ee-95d4-4ab2673ea3f0',
 'ad6806f8-677f-11ee-95d4-4ab2673ea3f0',
 'ad680716-677f-11ee-95d4-4ab2673ea3f0',
 'ad68073e-677f-11ee-95d4-4ab2673ea3f0']

In [9]:
# values are the text that needs to be cleaned
documents[next(iter(documents))]

'Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected 

## 1. Creating a Custom Pipeline

The cleaning modules can be arranged in a custom order specified by the user. They will be executed in the order in which they appear. The behavior of each component can be modified by changing the arguments of the associated object in the list. The currently supported modules are:

    - SimpleCleaner
    - RemoveNonEnglishCleaner
    - SubstitutionCleaner
    - LemmatizeCleaner

Here is an example of a custom pipeline. This pipeline first removes non-English text, then applies simple cleaning, and finally applies lemmatization.

In [10]:
vulture = Vulture(n_jobs  = 1, 
                  verbose = 10,  # Disable == 0, Verbose >= 1
                 )

In [11]:
steps = [
    RemoveNonEnglishCleaner(ascii_ratio=0.9, stopwords_ratio=0.25),
    SimpleCleaner(stop_words = STOP_WORDS,
                  stop_phrases = STOP_PHRASES,
                  order = [
                      'standardize_hyphens',
                      'isolate_frozen',
                      'remove_copyright_statement',
                      'remove_stop_phrases',
                      'make_lower_case',
                      'remove_formulas',
                      'normalize',
                      'remove_next_line',
                      'remove_email',
                      'remove_()',
                      'remove_[]',
                      'remove_special_characters',
                      'remove_nonASCII_boundary',
                      'remove_nonASCII',
                      'remove_tags',
                      'remove_stop_words',
                      'remove_standalone_numbers',
                      'remove_extra_whitespace',
                      'min_characters',
                  ]
                 ),
    LemmatizeCleaner('spacy'),
]

In [12]:
%time cleaned_documents = vulture.clean(documents, steps=steps)

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/3 [00:00<?, ?it/s][Vulture]: Running RemoveNonEnglishCleaner module

100%|███████████████████████████████████████████| 9/9 [00:00<00:00, 1207.57it/s][A
[Vulture]: Running SimpleCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 167.21it/s][A
[Vulture]: Running LemmatizeCleaner module

  0%|                                                     | 0/9 [00:00<?, ?it/s][A
 11%|█████                                        | 1/9 [00:01<00:11,  1.49s/it][A
100%|█████████████████████████████████████████████| 9/9 [00:01<00:00,  5.53it/s][A
100%|█████████████████████████████████████████████| 3/3 [00:01<00:00,  1.76it/s]

CPU times: user 1.25 s, sys: 412 ms, total: 1.66 s
Wall time: 1.72 s





In [13]:
key = next(iter(documents))
documents[key], cleaned_documents[key]

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected

## 2. Customizing Components

### SimpleCleaner

#### Setting Order of Operations

For the default patterns, the ```order``` parameter can be used to specify the order of operations within the SimpleCleaner component. The following example shows how five simple operations can be applied in sequential order.

In [14]:
steps = [
    SimpleCleaner(stop_words = STOP_WORDS,
                  stop_phrases = STOP_PHRASES,
                  order = [
                      'make_lower_case',
                      'remove_()',
                      'remove_stop_words',
                      'remove_extra_whitespace',
                      'min_characters',
                  ]
                 ),
]

In [15]:
%time cleaned_documents = vulture.clean(documents, steps=steps)

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/1 [00:00<?, ?it/s][Vulture]: Running SimpleCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 217.31it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 21.57it/s]

CPU times: user 48.2 ms, sys: 3.02 ms, total: 51.2 ms
Wall time: 49.1 ms





In [16]:
key = next(iter(documents))
documents[key], cleaned_documents[key]

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected

#### Setting Min Number of Characters

The minimum number of characters to keep per token is a class attribute (```min_characters```). This example shows how it can be changed through the constructor.

In [17]:
steps = [
    SimpleCleaner(min_characters = 12,
                  order = ['min_characters']
                 ),
]

In [18]:
%time cleaned_documents = vulture.clean(documents, steps=steps)

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/1 [00:00<?, ?it/s][Vulture]: Running SimpleCleaner module

100%|███████████████████████████████████████████| 9/9 [00:00<00:00, 8598.80it/s][A
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 225.27it/s]

CPU times: user 6.96 ms, sys: 2.88 ms, total: 9.84 ms
Wall time: 7.86 ms





In [19]:
key = next(iter(documents))
documents[key], cleaned_documents[key]

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected

#### Custom RegEx

The SimpleCleaner component allows the user to specify custom regular expression patterns. This enables Vulture to be applied to specific datasets while maintaining the parallel and distributed capabilities. In the following example, we will define a simple regular expression to detect and remove acronyms in the text. An acronym will be assumed to be a series of one or more uppercase letters enclosed in parentheses.

In [20]:
# Maps an operation name (referenced from SimpleCleaner's `order` list) to a
# (regular expression, replacement string) pair.
custom_patterns = dict(
    remove_acronyms=(
        # one or more uppercase letters wrapped in parentheses, together with any
        # whitespace directly before and after them
        r"\s*\([A-Z]+\)\s*",
        # replacement text; the stars only make the matches stand out in this example.
        # Because the pattern also consumes the surrounding whitespace, use a single
        # space (not an empty string) to drop acronyms without gluing words together.
        ' *** ',
    ),
)

In [21]:
steps = [
    SimpleCleaner(order = ['remove_acronyms'],  # only perform the custom operation
                  custom_patterns = custom_patterns
                 ),
]

In [22]:
%time cleaned_documents = vulture.clean(documents, steps=steps)

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/1 [00:00<?, ?it/s][Vulture]: Running SimpleCleaner module

100%|██████████████████████████████████████████| 9/9 [00:00<00:00, 12296.01it/s][A
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 209.08it/s]

CPU times: user 6.5 ms, sys: 3.07 ms, total: 9.57 ms
Wall time: 7.64 ms





In [23]:
key = next(iter(documents))
documents[key], cleaned_documents[key]

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected

### RemoveNonEnglishCleaner

This component is used to detect and remove non-English text from the dataset. The detection works through a simple heuristic with two components: the ratio of ASCII characters to all characters and the ratio of stopwords to all words. While not flawless, this heuristic (with appropriate hyperparameters) provides a quick way to filter out foreign language documents.

##### Define a placeholder Spanish document for Non-English detection

In [24]:
# Spanish placeholder text: a short story about a young girl named Ana who lives in a small
# town surrounded by mountains and finds an old book filled with stories of adventure and magic.
# Adjacent string literals inside the parentheses are concatenated into one string.
foreign_text = (
    'En un pequeño pueblo rodeado de montañas, vivía una joven llamada Ana. Todos los días, '
    'ella caminaba por los senderos del bosque, disfrutando de la tranquilidad y la belleza de '
    'la naturaleza. Un día, mientras exploraba una parte del bosque que no conocía, Ana encontró '
    'un antiguo libro cubierto de polvo y hojas. Curiosa, lo abrió y comenzó a leer. Las páginas '
    'contenían historias de aventuras, magia y lugares lejanos. Ana se sintió inspirada por estas '
    'historias y decidió que un día ella también viviría sus propias aventuras.'
)

In [25]:
foreign_documents = {0: foreign_text}

In [26]:
steps = [
    RemoveNonEnglishCleaner(ascii_ratio=0.9, stopwords_ratio=0.25),
]

In [27]:
%time cleaned_documents = vulture.clean(foreign_documents, steps=steps)

[Vulture]: Cleaning 1 documents
  0%|                                                     | 0/1 [00:00<?, ?it/s][Vulture]: Running RemoveNonEnglishCleaner module

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 2870.84it/s][A
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 250.44it/s]

CPU times: user 6.11 ms, sys: 2.99 ms, total: 9.1 ms
Wall time: 7.05 ms





In [28]:
key = next(iter(foreign_documents))
foreign_documents[key], cleaned_documents[key]

('En un pequeño pueblo rodeado de montañas, vivía una joven llamada Ana. Todos los días, ella caminaba por los senderos del bosque, disfrutando de la tranquilidad y la belleza de la naturaleza. Un día, mientras exploraba una parte del bosque que no conocía, Ana encontró un antiguo libro cubierto de polvo y hojas. Curiosa, lo abrió y comenzó a leer. Las páginas contenían historias de aventuras, magia y lugares lejanos. Ana se sintió inspirada por estas historias y decidió que un día ella también viviría sus propias aventuras.',
 '')

### SubstitutionCleaner

Substitutions allow Vulture to find specific substrings in the text and replace them with some other substring. The intent of this component is to provide a Subject Matter Expert (SME) the ability to normalize important terms in the vocabulary. Multiple terms can be used to represent the same concept. Alternatively, multiple tokens can represent a single term, the meaning of which will be destroyed through tokenization. Substitution can normalize these terms and also signal to Vulture that these terms do not require cleaning. This means that this component can also be used to "freeze" terms through a reflective substitution, leaving them unchanged throughout the cleaning process.

Substitutions can be passed as an argument to the ```clean``` function. This will perform substitution before and after cleaning the text. However, the SubstitutionCleaner can also be used like any of the other components as part of the pipeline. This finer-grained control allows the user to specify when substitution occurs and how the substitution map is handled. The SubstitutionCleaner has three attributes that control how the substitution map is processed:

In [29]:
substitution_map = {
    'Foo Bar Bat': 'foo_bar',
    'testing': 'test'
}

#### Permute

This attribute permutes the substitution map such that whitespace is replaced with hyphens and vice versa

In [30]:
sub = SubstitutionCleaner(substitution_map, permute=True)
sub.substitution_map

{'Foo Bar Bat': 'foo_bar',
 'Foo-Bar Bat': 'foo_bar',
 'Foo Bar-Bat': 'foo_bar',
 'testing': 'test',
 'Foo-Bar-Bat': 'foo_bar'}

#### Lower 

This attribute adds a lowercase version of each key to the substitution map. The original keys are kept.

In [31]:
sub = SubstitutionCleaner(substitution_map, lower=True)
sub.substitution_map

{'Foo Bar Bat': 'foo_bar', 'foo bar bat': 'foo_bar', 'testing': 'test'}

#### Lemmatize 

This attribute adds the lemmatized form of each key to the substitution map. The original keys are kept.

In [32]:
sub = SubstitutionCleaner(substitution_map, lemmatize=True)
sub.substitution_map

{'Foo Bar Bat': 'foo_bar', 'testing': 'test', 'test': 'test'}

#### All Together

In [33]:
sub = SubstitutionCleaner(substitution_map, permute=True, lower=True, lemmatize=True)
sub.substitution_map

{'Foo Bar Bat': 'foo_bar',
 'foo bar bat': 'foo_bar',
 'Foo-Bar Bat': 'foo_bar',
 'Foo Bar-Bat': 'foo_bar',
 'foo-bar bat': 'foo_bar',
 'foo bar-bat': 'foo_bar',
 'testing': 'test',
 'test': 'test',
 'Foo-Bar-Bat': 'foo_bar',
 'foo-bar-bat': 'foo_bar'}

### LemmatizeCleaner

#### Apply NLTK Lemmatization

In [34]:
steps = [
    LemmatizeCleaner('nltk')
]

In [35]:
%time cleaned_documents = vulture.clean(documents, steps=steps)

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/1 [00:00<?, ?it/s][Vulture]: Running LemmatizeCleaner module

  0%|                                                     | 0/9 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 9/9 [00:01<00:00,  6.98it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.29s/it]

CPU times: user 1.18 s, sys: 101 ms, total: 1.28 s
Wall time: 1.3 s





In [36]:
key = next(iter(documents))
documents[key], cleaned_documents[key]

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected

#### Apply Spacy Lemmatization

In [37]:
steps = [
    LemmatizeCleaner('spacy')
]

In [38]:
%time cleaned_documents = vulture.clean(documents, steps=steps)

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/1 [00:00<?, ?it/s][Vulture]: Running LemmatizeCleaner module

  0%|                                                     | 0/9 [00:00<?, ?it/s][A
 11%|█████                                        | 1/9 [00:01<00:10,  1.30s/it][A
100%|█████████████████████████████████████████████| 9/9 [00:01<00:00,  5.96it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.51s/it]

CPU times: user 1.16 s, sys: 325 ms, total: 1.48 s
Wall time: 1.52 s





In [39]:
key = next(iter(documents))
documents[key], cleaned_documents[key]

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected

## 3. Creating a New Component

For more complex text cleaning or specific datasets, it may be necessary to create a new Vulture cleaning component. This is possible by creating a new child class derived from ```VultureModuleBase```. The class needs to implement a ```__call__``` method. The following example shows an implementation of a Vulture component class for stemming text.

### **WARNING:**
Each component class contains a ```frozen``` attribute, inherited from ```VultureModuleBase```. This attribute is a set of tokens that have been substituted and need to be preserved throughout the cleaning. A new component implementation must have measures in place to leave tokens seen in ```frozen``` unchanged.

### Define StemCleaner

This example implements a new component that will take the stems of the tokens

In [40]:
import warnings
from nltk.stem.snowball import SnowballStemmer

from TELF.pre_processing.Vulture.modules import VultureModuleBase


class StemCleaner(VultureModuleBase):
    """
    Vulture component that reduces every token of a document to its stem.

    Tokens that appear in ``frozen`` are passed through unchanged.
    """

    def __init__(self, frozen=None):
        super().__init__(frozen)
        # English Snowball stemmer applied to every non-frozen token
        self.backend = SnowballStemmer("english")

    def __call__(self, document):
        # Calling the component is a thin alias for run()
        return self.run(document)

    def run(self, document):
        """
        Stem the text of a single document

        Parameters
        ----------
        document: tuple
            A (document id, document text) pair

        Returns
        -------
        tuple
            The unchanged document id paired with the stemmed text
        """
        identifier, raw_text = document
        return (identifier, self._stem(raw_text))

    def _stem(self, text):
        """
        Stem each whitespace-separated token of a string

        Parameters
        ----------
        text: str
            The string whose tokens should be stemmed

        Returns
        -------
        str
            The processed tokens joined back together with single spaces
        """
        # frozen tokens are kept verbatim, everything else goes through the stemmer
        return ' '.join(
            token if token in self.frozen else self.backend.stem(token)
            for token in text.split()
        )

### Apply StemCleaner

#### Only Use Stemming

In [41]:
steps = [
    StemCleaner(),
]

In [42]:
%time cleaned_documents = vulture.clean(documents, steps=steps)

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/1 [00:00<?, ?it/s][Vulture]: Running StemCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 335.00it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 33.06it/s]

CPU times: user 32 ms, sys: 2.88 ms, total: 34.9 ms
Wall time: 33.2 ms





In [43]:
key = next(iter(documents))
documents[key], cleaned_documents[key]

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected

#### Stemming & Substitution

In [44]:
substitutions = {
    'los alamos national laboratory': 'LANL',
    'lanl': 'LANL',
    'los alamos': 'Los_Alamos',
    'supervisory control and data acquisition': 'SCADA',
    'scada': 'SCADA',
}

In [45]:
%time cleaned_documents = vulture.clean(documents, steps=steps, substitutions=substitutions)

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/3 [00:00<?, ?it/s][Vulture]: Running SubstitutionCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 625.43it/s][A
[Vulture]: Running StemCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 340.91it/s][A
[Vulture]: Running SubstitutionCleaner module

100%|███████████████████████████████████████████| 9/9 [00:00<00:00, 5278.81it/s][A
100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 54.09it/s]

CPU times: user 1.88 s, sys: 682 ms, total: 2.56 s
Wall time: 2.76 s





In [46]:
key = next(iter(documents))
documents[key], cleaned_documents[key]

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected

#### Stemming & Substitution & Cleaning

In [47]:
steps = [
    SimpleCleaner(stop_words = STOP_WORDS,
                  stop_phrases = STOP_PHRASES,
                  order = [
                      'standardize_hyphens',
                      'isolate_frozen',
                      'remove_copyright_statement',
                      'remove_stop_phrases',
                      'make_lower_case',
                      'remove_formulas',
                      'normalize',
                      'remove_next_line',
                      'remove_email',
                      'remove_()',
                      'remove_[]',
                      'remove_special_characters',
                      'remove_nonASCII_boundary',
                      'remove_nonASCII',
                      'remove_tags',
                      'remove_stop_words',
                      'remove_standalone_numbers',
                      'remove_extra_whitespace',
                      'min_characters',
                  ]
                 ),
    StemCleaner(),
]

In [48]:
%time cleaned_documents = vulture.clean(documents, steps=steps, substitutions=substitutions)

[Vulture]: Cleaning 9 documents
  0%|                                                     | 0/4 [00:00<?, ?it/s][Vulture]: Running SubstitutionCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 833.71it/s][A
[Vulture]: Running SimpleCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 121.81it/s][A
[Vulture]: Running StemCleaner module

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 663.12it/s][A
 75%|█████████████████████████████████▊           | 3/4 [00:00<00:00, 27.09it/s][Vulture]: Running SubstitutionCleaner module

100%|███████████████████████████████████████████| 9/9 [00:00<00:00, 8100.59it/s][A
100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 34.34it/s]

CPU times: user 1.99 s, sys: 714 ms, total: 2.7 s
Wall time: 2.94 s





In [49]:
key = next(iter(documents))
documents[key], cleaned_documents[key]

('Supervisory Control and Data Acquisition (SCADA) systems often serve as the nervous system for substations within power grids. These systems facilitate real-time monitoring, data acquisition, control of equipment, and ensure smooth and efficient operation of the substation and its connected devices. As the dependence on these SCADA systems grows, so does the risk of potential malicious intrusions that could lead to significant outages or even permanent damage to the grid. Previous work has shown that dimensionality reduction-based approaches, such as Principal Component Analysis (PCA), can be used for accurate identification of anomalies in SCADA systems. While not specifically applied to SCADA, non-negative matrix factorization (NMF) has shown strong results at detecting anomalies in wireless sensor networks. These unsupervised approaches model the normal or expected behavior and detect the unseen types of attacks or anomalies by identifying the events that deviate from the expected