In [1]:
import os 
import ast
import pickle
import pathlib
import numpy as np
import pandas as pd

from TELF.applications import Cheetah
from TELF.pre_processing import Vulture

## Load Data

In [2]:
# Data directory, two levels above the current working directory.
DATA_PATH = os.path.join('..', '..', 'data')
# Name of the CSV file (inside DATA_PATH) that holds the sample papers.
DATA_FILE = 'sample.csv'

In [3]:
# Load the sample papers and summarise columns, dtypes and non-null counts.
df = pd.read_csv(os.path.join(DATA_PATH, DATA_FILE))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eid             940 non-null    object
 1   title           940 non-null    object
 2   year            940 non-null    int64 
 3   abstract        940 non-null    object
 4   authors         940 non-null    object
 5   author_ids      940 non-null    object
 6   references      843 non-null    object
 7   clean_abstract  940 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.9+ KB


In [4]:
# Preview the first five rows of the raw data.
df.head(5)

Unnamed: 0,eid,title,year,abstract,authors,author_ids,references,clean_abstract
0,3cbdf82a-6781-11ee-b983-4ab2673ea3f0,Paper Title,2016,Supervisory Control and Data Acquisition (SCAD...,Name;Name;Name;Name;Name,3df61b32-6781-11ee-b983-4ab2673ea3f0;3df61c18-...,3cbe2bec-6781-11ee-b983-4ab2673ea3f0;3cbe6d64-...,supervisory control acquisition system often s...
1,3cbdf960-6781-11ee-b983-4ab2673ea3f0,Paper Title,2002,Supervisory Control and Data Acquisition (SCAD...,Name;Name,3df61cc2-6781-11ee-b983-4ab2673ea3f0;3df61cea-...,3cbe45d2-6781-11ee-b983-4ab2673ea3f0;3cbe0338-...,supervisory control acquisition system often s...
2,3cbdf992-6781-11ee-b983-4ab2673ea3f0,Paper Title,1995,Malware is one of the most dangerous and costl...,Name;Name;Name;Name,3df61d1c-6781-11ee-b983-4ab2673ea3f0;3df61d3a-...,3cbe5c8e-6781-11ee-b983-4ab2673ea3f0;3cbe1bac-...,malware dangerous costly cyber threat organiza...
3,3cbdf9ba-6781-11ee-b983-4ab2673ea3f0,Paper Title,2003,Malware is one of the most dangerous and costl...,Name;Name;Name;Name;Name;Name;Name;Name;Name;Name,3df61dbc-6781-11ee-b983-4ab2673ea3f0;3df61dda-...,3cbe0040-6781-11ee-b983-4ab2673ea3f0;3cbe08e2-...,malware dangerous costly cyber threat national...
4,3cbdf9e2-6781-11ee-b983-4ab2673ea3f0,Paper Title,1997,"We propose an efficient, distributed, out-of-m...",Name;Name;Name;Name;Name;Name;Name;Name;Name;Name,3df61f24-6781-11ee-b983-4ab2673ea3f0;3df61f56-...,,propose efficient distribute memory implementa...


## Create Clean Text Columns for Title & Abstract prior to Using Cheetah

In [5]:
# Vulture (TELF pre-processing) is used in the next cell to clean the text columns.
# n_jobs=1 presumably keeps the cleaning to a single worker — confirm against the Vulture docs.
vulture = Vulture(n_jobs  = 1, 
                  verbose = 0,  # Disable == 0, Verbose >= 1
                 )

In [6]:
# Clean the raw 'title' and 'abstract' columns and attach the results to the original frame.
# With concat_cleaned_cols=False each source column gets its own 'clean_<name>' column.
# Note: the CSV already ships a 'clean_abstract' column; the summary below shows 9 columns
# (only 'clean_title' is new), so the existing 'clean_abstract' appears to be replaced by
# Vulture's output rather than duplicated.
df = vulture.clean_dataframe(df, concat_cleaned_cols=False, append_to_original_df=True, columns=["title", "abstract"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eid             940 non-null    object
 1   title           940 non-null    object
 2   year            940 non-null    int64 
 3   abstract        940 non-null    object
 4   authors         940 non-null    object
 5   author_ids      940 non-null    object
 6   references      843 non-null    object
 7   clean_abstract  940 non-null    object
 8   clean_title     940 non-null    object
dtypes: int64(1), object(8)
memory usage: 66.2+ KB


In [7]:
# Drop every row whose cleaned title or cleaned abstract is missing (the frame is modified in place).
# Original note: if remove_english=True, this will remove non-english documents.
# NOTE(review): the flag name above is kept from the original comment — confirm it against
# the Vulture API. For this sample both cleaned columns are fully populated, so no rows are dropped.
df.dropna(subset=['clean_title', 'clean_abstract'], inplace=True)

## Use Cheetah

### Examine the Data

In [8]:
# Set up the Cheetah columns: the keys are Cheetah categories and the values are the
# corresponding DataFrame column names.
# Not all categories need to be present; missing ones will not be indexed.
# Title and abstract point at the cleaned text columns, not the raw ones.
cheetah_columns = {
    'title': 'clean_title', 
    'abstract': 'clean_abstract',
    'year': 'year',
    'author_ids': 'author_ids',
}

**Note:**
- The key is the name by which Cheetah knows the column; the value is the corresponding column in the DataFrame. For example, for a DataFrame column called clean_abstract, the entry in the columns dictionary would look like **'abstract': 'clean_abstract'**.
- If any of the above columns are missing, Cheetah will not attempt to index them. If the user then attempts to search on information from a missing column, an error will be raised.
- Although any of the above columns could be missing, they must follow the correct format if they do exist. *See below for the correct formats.*

**Note that in the author_ids column, the IDs are separated by the ";" symbol**:

In [9]:
# Show the author-ID string of the first paper, looked up through the Cheetah column mapping.
df.iloc[0][cheetah_columns["author_ids"]]

'3df61b32-6781-11ee-b983-4ab2673ea3f0;3df61c18-6781-11ee-b983-4ab2673ea3f0;3df61c4a-6781-11ee-b983-4ab2673ea3f0;3df61c72-6781-11ee-b983-4ab2673ea3f0;3df61c9a-6781-11ee-b983-4ab2673ea3f0'

### Add some NaNs

Cheetah should be able to handle missing data. If some papers have NaN entries for certain columns, the associated documents should not be indexed for those values.

In [10]:
def insert_random_nans(df, cols, perc=5, seed=42):
    """
    Introduce NaN values randomly into specified columns of a DataFrame.

    The DataFrame is modified in place and the same object is returned. Rows are
    drawn separately for every column (without replacement within a column) from
    a single generator seeded with ``seed``, so different columns generally lose
    different rows.

    Parameters:
    -----------
    df: pd.DataFrame
        Target DataFrame (mutated in place)
    cols: List or str
        A list of columns in which NaNs will be introduced. A single column
        name given as a string is treated as a one-element list.
    perc: int or float
        Percentage (0-100) of the total dataframe length to be replaced with
        NaNs in each column. The resulting row count is truncated to an integer.
    seed: int
        Random seed for reproducibility

    Returns:
    --------
    pd.DataFrame
        The same dataframe object, now containing random NaNs. Columns that
        cannot hold NaN (e.g. int64) are upcast (e.g. to float64).

    Raises:
    -------
    KeyError
        If any entry of ``cols`` is not a column of ``df``.
    ValueError
        If ``perc`` is outside the range [0, 100].
    """
    # A bare string would otherwise be iterated character by character.
    cols = [cols] if isinstance(cols, str) else list(cols)

    # Validate before touching the frame: assigning to an unknown column would
    # silently create a brand-new all-NaN column instead of failing.
    missing = [col for col in cols if col not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")
    if not 0 <= perc <= 100:
        raise ValueError(f"perc must be between 0 and 100, got {perc!r}")

    rng = np.random.default_rng(seed)
    num_nans = int(len(df) * perc / 100)
    for col in cols:
        nan_indices = rng.choice(df.index, size=num_nans, replace=False)
        # Series.mask upcasts where needed (int64 -> float64) without relying on
        # the implicit upcast-on-assignment that pandas has deprecated for
        # `df.loc[rows, col] = np.nan` on integer columns.
        df[col] = df[col].mask(df.index.isin(nan_indices))
    return df

In [11]:
# Blank out 5% of the rows (47 of 940) in each listed column; rows are drawn separately per column.
# NOTE(review): the NaNs go into the raw 'title' and 'abstract' columns, but cheetah_columns
# maps Cheetah's title/abstract to 'clean_title'/'clean_abstract', which stay fully populated
# (see the summary below). Only 'year' and 'author_ids' therefore exercise missing-data
# handling in the index — confirm this is intended.
# Side effect visible below: 'year' changes from int64 to float64 once it contains NaN.
df = insert_random_nans(df, cols=['author_ids', 'year', 'abstract', 'title'], perc=5, seed=42)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   eid             940 non-null    object 
 1   title           893 non-null    object 
 2   year            893 non-null    float64
 3   abstract        893 non-null    object 
 4   authors         940 non-null    object 
 5   author_ids      893 non-null    object 
 6   references      843 non-null    object 
 7   clean_abstract  940 non-null    object 
 8   clean_title     940 non-null    object 
dtypes: float64(1), object(8)
memory usage: 66.2+ KB


### Searching

#### Create or load indices

In [12]:
# verbose=True prints the per-column indexing progress shown below.
cheetah = Cheetah(verbose=True)
# Location of the index file inside the data directory.
index_file = os.path.join(DATA_PATH, 'sample_index.p')

# Build the index over the columns named in cheetah_columns.
# reindex=True rebuilds it even when index_file already exists (the run below reports
# "Overwriting existing index."); presumably reindex=False would load the existing file
# instead — confirm against the Cheetah docs.
cheetah.index(df, 
              columns=cheetah_columns, 
              index_file=index_file,
              reindex=True)

Overwriting existing index.
Indexing abstract


100%|██████████| 940/940 [00:00<00:00, 66605.50it/s]
100%|██████████| 517/517 [00:00<00:00, 446000.65it/s]


Indexing title


100%|██████████| 940/940 [00:00<00:00, 2349610.11it/s]
100%|██████████| 1/1 [00:00<00:00, 33288.13it/s]


Indexing years


100%|██████████| 940/940 [00:00<00:00, 2790265.93it/s]


Indexing author IDs


100%|██████████| 940/940 [00:00<00:00, 216035.38it/s]
100%|██████████| 6730/6730 [00:00<00:00, 3235633.42it/s]


#### Set criteria and search

##### Search for single word

In [13]:
# Single-word query, looked up in both the title and the abstract index.
search_parameters = {
    "query": "scada",
    "in_title":True,         # if true, searches for query in title
    "in_abstract":True,      # if true, searches for query in abstract
    "do_results_table": True # if true, generate DataFrame for explainability
}

# With do_results_table=True, search() returns the matching rows plus a table of the applied filters.
search_results, search_criteria = cheetah.search(**search_parameters)
search_results.head(5)

Found 113 papers in 0.001 seconds


Unnamed: 0,eid,title,year,abstract,authors,author_ids,references,clean_abstract,clean_title
0,3cbdf82a-6781-11ee-b983-4ab2673ea3f0,Paper Title,2016.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name;Name;Name;Name,3df61b32-6781-11ee-b983-4ab2673ea3f0;3df61c18-...,3cbe2bec-6781-11ee-b983-4ab2673ea3f0;3cbe6d64-...,supervisory control acquisition scada serve ne...,title
1,3cbdf960-6781-11ee-b983-4ab2673ea3f0,Paper Title,2002.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name,3df61cc2-6781-11ee-b983-4ab2673ea3f0;3df61cea-...,3cbe45d2-6781-11ee-b983-4ab2673ea3f0;3cbe0338-...,supervisory control acquisition scada serve ne...,title
770,3cbe6666-6781-11ee-b983-4ab2673ea3f0,Paper Title,2018.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name,,3cbe5bb2-6781-11ee-b983-4ab2673ea3f0;3cbe7232-...,supervisory control acquisition scada serve ne...,title
518,3cbe42bc-6781-11ee-b983-4ab2673ea3f0,Paper Title,1997.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name;Name;Name;Name;Name,3df891dc-6781-11ee-b983-4ab2673ea3f0;3df89204-...,3cbe504a-6781-11ee-b983-4ab2673ea3f0;3cbe7ac0-...,supervisory control acquisition scada serve ne...,title
774,3cbe66f2-6781-11ee-b983-4ab2673ea3f0,,2009.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name,3df7a10a-6781-11ee-b983-4ab2673ea3f0;3df98cc2-...,,supervisory control acquisition scada serve ne...,title


In [14]:
# Explainability table: one row per query with the number of matching papers and their row ids.
search_criteria

Unnamed: 0,filter_type,filter_value,num_papers,included_ids
0,query,scada,113,0;1;770;518;774;9;521;11;779;780;526;273;786;2...


##### Search for single phrase

In [15]:
# search in a small window (0 words between the words in the query):
# a window of 2 tokens is exactly the length of the two-word phrase
search_parameters = {
    "query": "cyber threats",
    "in_title": True,        # if true, searches for query in title
    "in_abstract": True,     # if true, searches for query in abstract
    "ngram_window_size": 2,  # search substrings of this size
    "ngram_ordered": False,  # if true, preserves order of tokens in query
    "do_results_table": True # if true, generate DataFrame for explainability
}

# With do_results_table=True, search() returns the matching rows plus a table of the applied filters.
search_results, search_criteria = cheetah.search(**search_parameters)
search_results.head(5)

Found 208 papers in 0.0021 seconds


Unnamed: 0,eid,title,year,abstract,authors,author_ids,references,clean_abstract,clean_title
2,3cbdf992-6781-11ee-b983-4ab2673ea3f0,Paper Title,1995.0,Malware is one of the most dangerous and costl...,Name;Name;Name;Name,3df61d1c-6781-11ee-b983-4ab2673ea3f0;3df61d3a-...,3cbe5c8e-6781-11ee-b983-4ab2673ea3f0;3cbe1bac-...,malware dangerous costly cyber threats organiz...,title
515,3cbe424e-6781-11ee-b983-4ab2673ea3f0,,,Malware is one of the most dangerous and costl...,Name;Name,3df89074-6781-11ee-b983-4ab2673ea3f0;3df8909c-...,3cbe0612-6781-11ee-b983-4ab2673ea3f0,malware dangerous costly cyber threats nationa...,title
3,3cbdf9ba-6781-11ee-b983-4ab2673ea3f0,Paper Title,2003.0,Malware is one of the most dangerous and costl...,Name;Name;Name;Name;Name;Name;Name;Name;Name;Name,3df61dbc-6781-11ee-b983-4ab2673ea3f0;3df61dda-...,3cbe0040-6781-11ee-b983-4ab2673ea3f0;3cbe08e2-...,malware dangerous costly cyber threats nationa...,title
6,3cbdfa32-6781-11ee-b983-4ab2673ea3f0,Paper Title,,Malware is one of the most dangerous and costl...,Name;Name;Name;Name;Name,3df62348-6781-11ee-b983-4ab2673ea3f0;3df62370-...,3cbe37ea-6781-11ee-b983-4ab2673ea3f0;3cbe789a-...,malware dangerous costly cyber threats nationa...,title
520,3cbe4302-6781-11ee-b983-4ab2673ea3f0,Paper Title,,Malware is one of the most dangerous and costl...,Name;Name,3df833a4-6781-11ee-b983-4ab2673ea3f0;3df89326-...,3cbe4bcc-6781-11ee-b983-4ab2673ea3f0;3cbe7b74-...,malware dangerous costly cyber threats nationa...,title


In [16]:
# Explainability table for the phrase query: number of matching papers and their row ids.
search_criteria

Unnamed: 0,filter_type,filter_value,num_papers,included_ids
0,query,cyber threats,208,2;515;3;6;520;12;13;530;20;532;543;33;548;38;5...


##### Intersect search for multiple words/phrases

In [17]:
# search in a wider window: with a window of 5 tokens, a two-word phrase can have
# at most 3 other words between its tokens
# NOTE(review): the original comment said "max 8 words", which does not match
# ngram_window_size=5 — confirm the intended window size against the Cheetah docs
search_parameters = {
    "query": ["cyber threats", "malware"],
    "and_search": True,      # if true, intersect search results for multiple queries
    "in_title": True,        # if true, searches for query in title
    "in_abstract": True,     # if true, searches for query in abstract
    "ngram_window_size": 5,  # search substrings of this size
    "ngram_ordered": True,   # if true, preserves order of tokens in query
    "do_results_table": True # if true, generate DataFrame for explainability
}

# With do_results_table=True, search() returns the matching rows plus a table of the applied filters.
search_results, search_criteria = cheetah.search(**search_parameters)
search_results.head(5)

Found 208 papers in 0.0018 seconds


Unnamed: 0,eid,title,year,abstract,authors,author_ids,references,clean_abstract,clean_title
2,3cbdf992-6781-11ee-b983-4ab2673ea3f0,Paper Title,1995.0,Malware is one of the most dangerous and costl...,Name;Name;Name;Name,3df61d1c-6781-11ee-b983-4ab2673ea3f0;3df61d3a-...,3cbe5c8e-6781-11ee-b983-4ab2673ea3f0;3cbe1bac-...,malware dangerous costly cyber threats organiz...,title
3,3cbdf9ba-6781-11ee-b983-4ab2673ea3f0,Paper Title,2003.0,Malware is one of the most dangerous and costl...,Name;Name;Name;Name;Name;Name;Name;Name;Name;Name,3df61dbc-6781-11ee-b983-4ab2673ea3f0;3df61dda-...,3cbe0040-6781-11ee-b983-4ab2673ea3f0;3cbe08e2-...,malware dangerous costly cyber threats nationa...,title
515,3cbe424e-6781-11ee-b983-4ab2673ea3f0,,,Malware is one of the most dangerous and costl...,Name;Name,3df89074-6781-11ee-b983-4ab2673ea3f0;3df8909c-...,3cbe0612-6781-11ee-b983-4ab2673ea3f0,malware dangerous costly cyber threats nationa...,title
6,3cbdfa32-6781-11ee-b983-4ab2673ea3f0,Paper Title,,Malware is one of the most dangerous and costl...,Name;Name;Name;Name;Name,3df62348-6781-11ee-b983-4ab2673ea3f0;3df62370-...,3cbe37ea-6781-11ee-b983-4ab2673ea3f0;3cbe789a-...,malware dangerous costly cyber threats nationa...,title
520,3cbe4302-6781-11ee-b983-4ab2673ea3f0,Paper Title,,Malware is one of the most dangerous and costl...,Name;Name,3df833a4-6781-11ee-b983-4ab2673ea3f0;3df89326-...,3cbe4bcc-6781-11ee-b983-4ab2673ea3f0;3cbe7b74-...,malware dangerous costly cyber threats nationa...,title


In [18]:
# One row per query in the list; num_papers is the count for each query on its own,
# before the intersection is taken.
search_criteria

Unnamed: 0,filter_type,filter_value,num_papers,included_ids
0,query,cyber threats,208,2;515;3;6;520;12;13;530;20;532;543;33;548;38;5...
1,query,malware,306,2;3;6;12;13;17;20;30;33;34;38;41;43;47;51;53;5...


##### Union search for multiple words/phrases

In [19]:
# search in a wider window: with a window of 5 tokens, a two-word phrase can have
# at most 3 other words between its tokens
# NOTE(review): the original comment said "max 8 words", which does not match
# ngram_window_size=5 — confirm the intended window size against the Cheetah docs
search_parameters = {
    "query": ["cyber threats", "malware"],
    "and_search": False,     # if true, intersect search results for multiple queries; if false, take their union
    "in_title": True,        # if true, searches for query in title
    "in_abstract": True,     # if true, searches for query in abstract
    "ngram_window_size": 5,  # search substrings of this size
    "ngram_ordered": True,   # if true, preserves order of tokens in query
    "do_results_table": True # if true, generate DataFrame for explainability
}

# With do_results_table=True, search() returns the matching rows plus a table of the applied filters.
search_results, search_criteria = cheetah.search(**search_parameters)
search_results.head(5)

Found 306 papers in 0.0019 seconds


Unnamed: 0,eid,title,year,abstract,authors,author_ids,references,clean_abstract,clean_title
2,3cbdf992-6781-11ee-b983-4ab2673ea3f0,Paper Title,1995.0,Malware is one of the most dangerous and costl...,Name;Name;Name;Name,3df61d1c-6781-11ee-b983-4ab2673ea3f0;3df61d3a-...,3cbe5c8e-6781-11ee-b983-4ab2673ea3f0;3cbe1bac-...,malware dangerous costly cyber threats organiz...,title
3,3cbdf9ba-6781-11ee-b983-4ab2673ea3f0,Paper Title,2003.0,Malware is one of the most dangerous and costl...,Name;Name;Name;Name;Name;Name;Name;Name;Name;Name,3df61dbc-6781-11ee-b983-4ab2673ea3f0;3df61dda-...,3cbe0040-6781-11ee-b983-4ab2673ea3f0;3cbe08e2-...,malware dangerous costly cyber threats nationa...,title
6,3cbdfa32-6781-11ee-b983-4ab2673ea3f0,Paper Title,,Malware is one of the most dangerous and costl...,Name;Name;Name;Name;Name,3df62348-6781-11ee-b983-4ab2673ea3f0;3df62370-...,3cbe37ea-6781-11ee-b983-4ab2673ea3f0;3cbe789a-...,malware dangerous costly cyber threats nationa...,title
12,3cbdfb18-6781-11ee-b983-4ab2673ea3f0,Paper Title,2000.0,Malware is one of the most dangerous and costl...,Name,3df62f82-6781-11ee-b983-4ab2673ea3f0,3cbe0a68-6781-11ee-b983-4ab2673ea3f0;3cbe2138-...,malware dangerous costly cyber threats organiz...,title
13,3cbdfb36-6781-11ee-b983-4ab2673ea3f0,Paper Title,2003.0,Malware is one of the most dangerous and costl...,Name;Name;Name;Name;Name;Name,3df62faa-6781-11ee-b983-4ab2673ea3f0;3df62fd2-...,3cbe07a2-6781-11ee-b983-4ab2673ea3f0;3cbe3cae-...,malware dangerous costly cyber threats nationa...,title


In [20]:
# One row per query in the list; num_papers is the count for each query on its own,
# before the union is taken.
search_criteria

Unnamed: 0,filter_type,filter_value,num_papers,included_ids
0,query,cyber threats,208,2;515;3;6;520;12;13;530;20;532;543;33;548;38;5...
1,query,malware,306,2;3;6;12;13;17;20;30;33;34;38;41;43;47;51;53;5...


##### Use Negative Prompts in Search

Dictionaries in the search query serve as negative prompts. The dictionary key is the term that will be looked up, and the value holds the word(s) that the results must not contain (the negative prompt). The negative prompt should only contain unigrams.

In [21]:
# Negative prompt with a single excluded word: search for "scada", excluding "malware".
search_parameters = {
    "query": {"scada": "malware"},
    "in_title":True,         # if true, searches for query in title
    "in_abstract":True,     # if true, searches for query in abstract
    "do_results_table": True # if true, generate DataFrame for explainability
}

# With do_results_table=True, search() returns the matching rows plus a table of the applied filters.
search_results, search_criteria = cheetah.search(**search_parameters)
search_results.head(5)

Found 113 papers in 0.0006 seconds


Unnamed: 0,eid,title,year,abstract,authors,author_ids,references,clean_abstract,clean_title
0,3cbdf82a-6781-11ee-b983-4ab2673ea3f0,Paper Title,2016.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name;Name;Name;Name,3df61b32-6781-11ee-b983-4ab2673ea3f0;3df61c18-...,3cbe2bec-6781-11ee-b983-4ab2673ea3f0;3cbe6d64-...,supervisory control acquisition scada serve ne...,title
1,3cbdf960-6781-11ee-b983-4ab2673ea3f0,Paper Title,2002.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name,3df61cc2-6781-11ee-b983-4ab2673ea3f0;3df61cea-...,3cbe45d2-6781-11ee-b983-4ab2673ea3f0;3cbe0338-...,supervisory control acquisition scada serve ne...,title
770,3cbe6666-6781-11ee-b983-4ab2673ea3f0,Paper Title,2018.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name,,3cbe5bb2-6781-11ee-b983-4ab2673ea3f0;3cbe7232-...,supervisory control acquisition scada serve ne...,title
518,3cbe42bc-6781-11ee-b983-4ab2673ea3f0,Paper Title,1997.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name;Name;Name;Name;Name,3df891dc-6781-11ee-b983-4ab2673ea3f0;3df89204-...,3cbe504a-6781-11ee-b983-4ab2673ea3f0;3cbe7ac0-...,supervisory control acquisition scada serve ne...,title
774,3cbe66f2-6781-11ee-b983-4ab2673ea3f0,,2009.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name,3df7a10a-6781-11ee-b983-4ab2673ea3f0;3df98cc2-...,,supervisory control acquisition scada serve ne...,title


In [22]:
# Explainability table: only the positive term ("scada") is listed as a query filter.
search_criteria

Unnamed: 0,filter_type,filter_value,num_papers,included_ids
0,query,scada,113,0;1;770;518;774;9;521;11;779;780;526;273;786;2...


In [23]:
# Negative prompt with several excluded words: search for "scada", excluding "malware" and "threat".
search_parameters = {
    "query": {"scada": ["malware", "threat"]},
    "in_title":True,         # if true, searches for query in title
    "in_abstract":True,      # if true, searches for query in abstract
    "do_results_table": True # if true, generate DataFrame for explainability
}

# With do_results_table=True, search() returns the matching rows plus a table of the applied filters.
search_results, search_criteria = cheetah.search(**search_parameters)
search_results.head(5)

Found 113 papers in 0.0006 seconds


Unnamed: 0,eid,title,year,abstract,authors,author_ids,references,clean_abstract,clean_title
0,3cbdf82a-6781-11ee-b983-4ab2673ea3f0,Paper Title,2016.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name;Name;Name;Name,3df61b32-6781-11ee-b983-4ab2673ea3f0;3df61c18-...,3cbe2bec-6781-11ee-b983-4ab2673ea3f0;3cbe6d64-...,supervisory control acquisition scada serve ne...,title
1,3cbdf960-6781-11ee-b983-4ab2673ea3f0,Paper Title,2002.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name,3df61cc2-6781-11ee-b983-4ab2673ea3f0;3df61cea-...,3cbe45d2-6781-11ee-b983-4ab2673ea3f0;3cbe0338-...,supervisory control acquisition scada serve ne...,title
770,3cbe6666-6781-11ee-b983-4ab2673ea3f0,Paper Title,2018.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name,,3cbe5bb2-6781-11ee-b983-4ab2673ea3f0;3cbe7232-...,supervisory control acquisition scada serve ne...,title
518,3cbe42bc-6781-11ee-b983-4ab2673ea3f0,Paper Title,1997.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name;Name;Name;Name;Name,3df891dc-6781-11ee-b983-4ab2673ea3f0;3df89204-...,3cbe504a-6781-11ee-b983-4ab2673ea3f0;3cbe7ac0-...,supervisory control acquisition scada serve ne...,title
774,3cbe66f2-6781-11ee-b983-4ab2673ea3f0,,2009.0,Supervisory Control and Data Acquisition (SCAD...,Name;Name,3df7a10a-6781-11ee-b983-4ab2673ea3f0;3df98cc2-...,,supervisory control acquisition scada serve ne...,title


In [24]:
# Explainability table: only the positive term ("scada") is listed as a query filter.
search_criteria

Unnamed: 0,filter_type,filter_value,num_papers,included_ids
0,query,scada,113,0;1;770;518;774;9;521;11;779;780;526;273;786;2...
