In [1]:
import os
import pickle
import pathlib
import pandas as pd
from TELF.pre_processing import Vulture 
from TELF.pre_processing.Vulture.modules import SimpleCleaner
from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES
from TELF.pre_processing.Vulture.tokens_analysis.vocab_consolidator import VocabularyConsolidator

In [2]:
# Locate the data directory two levels above the working directory and
# turn it into an absolute path.
DATA_DIR = pathlib.Path('..', '..', 'data').resolve()
DATA_FILE = 'sample.csv'

# Keep the CSV location as a plain string.
df_path = str(DATA_DIR / DATA_FILE)
df = pd.read_csv(df_path)

# Map each row's index label to the text in its `abstract` column.
documents = df['abstract'].to_dict()

In [3]:
# Build the Vulture pre-processing driver.
vulture = Vulture(
    n_jobs=1,
    verbose=10,  # Disable == 0, Verbose >= 1
)

# A single SimpleCleaner step. `order` names the cleaning operations in the
# sequence they are handed to the cleaner.
steps = [
    SimpleCleaner(
        stop_words=STOP_WORDS,
        stop_phrases=STOP_PHRASES,
        order=[
            'standardize_hyphens',
            'remove_stop_phrases',
            'isolate_frozen',
            'remove_copyright_statement',
            'make_lower_case',
            'remove_formulas',
            'normalize',
            'remove_next_line',
            'remove_email',
            'remove_()',
            'remove_[]',
            'remove_special_characters',
            'remove_nonASCII_boundary',
            'remove_nonASCII',
            'remove_tags',
            'remove_stop_words',
            'remove_standalone_numbers',
            'remove_extra_whitespace',
            'min_characters',
        ],
    )
]

# Compute the output location once so the save below and the read-back
# after it always refer to the same file.
clean_documents_path = os.path.join(DATA_DIR, "clean_documents")

vulture.clean(
    documents,
    steps=steps,
    save_path=clean_documents_path,
)

# Read the saved result back in. The context manager closes the file handle
# as soon as loading finishes instead of leaving it open for the life of the
# kernel.
# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# acceptable only because the file was written by the cell itself just above.
with open(clean_documents_path, 'rb') as clean_documents_file:
    clean_documents = pickle.load(clean_documents_file)

[Vulture]: Cleaning 940 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running SimpleCleaner module
100%|██████████| 940/940 [00:01<00:00, 508.09it/s]
100%|██████████| 1/1 [00:01<00:00,  1.85s/it]


In [7]:
# Output locations inside DATA_DIR: a CSV for the change log and a second
# path for the operated text.
changes_made_file = 'VOCAB_CONSOLIDATOR_changes.csv'
changes_made_path = os.path.join(DATA_DIR, changes_made_file)
operated_text_path = os.path.join(DATA_DIR, 'VOCAB_CONSOLIDATOR')

# Run term consolidation over the cleaned documents.
# NOTE(review): vocabulary=None presumably lets the consolidator derive the
# vocabulary from `texts` -- confirm against the TELF documentation.
consolidator = VocabularyConsolidator()
o = consolidator.consolidate_terms(
    vocabulary=None,
    texts=clean_documents,
    changes_made_save_path=changes_made_path,
    operated_text_save_path=operated_text_path,
)

Processing Chunks: 100%|██████████| 192/192 [00:00<00:00, 2275.80it/s]
[Vulture]: Cleaning 940 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running SubstitutionOperator module
[Parallel(n_jobs=192)]: Using backend MultiprocessingBackend with 192 concurrent workers.
[Parallel(n_jobs=192)]: Done  88 out of 235 | elapsed:    1.4s remaining:    2.3s
[Parallel(n_jobs=192)]: Done 235 out of 235 | elapsed:    1.7s finished
100%|██████████| 1/1 [00:01<00:00,  1.88s/it]


In [5]:
# Load the change log CSV from DATA_DIR for inspection.
changes_csv_path = os.path.join(DATA_DIR, changes_made_file)
df_changed = pd.read_csv(changes_csv_path)

In [6]:
# Preview up to the first 60 recorded changes.
df_changed.head(n=60)

Unnamed: 0,Previous Key,New Key,Similarity Score
0,grid,grids,0.8
1,networks,network,0.875
2,grid,grids,0.8
3,networks,network,0.875
4,builds,build,0.833333
5,computer,computed,0.875
6,label,labels,0.833333
7,mutations,mutational,0.8
8,reduces,reduced,0.857143
9,specimens,specimen,0.888889
