In [1]:
import re
import pickle
import string
import copy
from pathlib import Path

import pandas as pd
import numpy as np
import spacy
import requests
import feedparser
from bs4 import BeautifulSoup
import py_stringmatching as sm 

5. Match "PRODUCT" type entities to CPE list

In [2]:
data_path = Path("../data")

In [3]:
df_cpe = pd.read_feather('../data/cpe_node_data.feather')
df_cpe.head()

Unnamed: 0,cpe,cve_id
0,zephyrproject_zephyr,CVE-2020-10019
1,,CVE-2020-10019
2,zephyrproject_zephyr,CVE-2020-10021
3,,CVE-2020-10021
4,zephyrproject_zephyr,CVE-2020-10022


In [4]:
df = pd.read_pickle("../alerts/alerts_df.pkl")

In [5]:
# alert to entity edges
df_ner = df[["alert_id", "ner2"]].explode("ner2")

# Convert "NA" values
df_ner["ner2"] = (
    df_ner["ner2"]
    .fillna("")
    .apply(lambda x: [str(x[0]), x[1]] if isinstance(x, list) else [str(x), "UNK"])
)

# Unnest the list of labels and types
df_ner[["label", "type"]] = pd.DataFrame(df_ner.ner2.to_list(), index=df_ner.index)
df_ner = df_ner.reset_index(level=0)
df_ner = df_ner.drop(columns="ner2").drop_duplicates()

# Dedupe separate NER types - ENTITY MATCHING TIME
# RENAMING NORP TO GPE (ie American/Iranian ~= America/Iran)
type_col = "mod_type"
label_col = "label"
df_ner[type_col] = df_ner["type"]
df_ner.loc[df_ner[type_col] == "NORP", "mod_type"] = "GPE"


# Remove stop words to help dedupe
with open(data_path / "stopwords.txt", "r") as stop_file:
    stop_words = pd.Series(stop_file.readlines()).str.strip()
    # stop_words = stop_words[
    #     stop_words.str.len() > 1
    # ]  # Don't want to remove acronyms and such. Using this as a blunt way to do that.

for stop_word in stop_words:
    df_ner[label_col] = (
        df_ner[label_col]
        .str.replace(f"\s{stop_word}\s", " ", regex=True)
        .str.replace("  +", " ", regex=True)
    )

# Clean up company names for eas(ier) removal later
company_types = ["inc", "llc", "ac", "corp", "co", "ltd", "corporation"]
df_ner[label_col] = df_ner[label_col].str.replace("\s\(.*\)*\s*", "", regex=True)
for co_type in company_types:
    df_ner[label_col] = df_ner[label_col].str.replace(f"\s?{co_type}$", "", regex=True)


# Remove open ended parens and brackets in names
open_group = re.compile("(\(|\[)[^\)\]]*$")  # This only works for single line text...
df_ner[label_col] = (
    df_ner[label_col]
    .apply(lambda x: open_group.sub("", x) if isinstance(x, str) else x)
    .str.strip()
)

# Clean out labels that are meaningless
punctuation = re.compile("[%s]" % re.escape(string.punctuation))
df_ner[label_col] = df_ner[label_col].str.replace(punctuation, "")
df_ner = df_ner[df_ner[label_col].str.len() > 1]

In [6]:
df_product = df_ner[df_ner['type'] == 'PRODUCT']
df_product = df_product[['label']].drop_duplicates().dropna()
df_product

Unnamed: 0,label
17,powershell
19,windows defender
20,rclone
21,meganz
25,esxi
...,...
13719,jaxb
13720,java runtime environment
13728,javadisable java
13729,firefox thunderbird


In [7]:
df_cpe = df_cpe.drop(columns='cve_id').drop_duplicates().dropna()
df_cpe['cpe'] = df_cpe['cpe'].str.replace('_',' ')
df_cpe

Unnamed: 0,cpe
0,zephyrproject zephyr
1,
9,metalgenix genixcms
34,bluetooth bluetooth core
36,opensuse leap
...,...
45542,eyunjing yjcms
45548,caehealthcare learningspace enterprise
45549,drachtio drachtio-server
45554,qpress project qpress


# string match

In [8]:
# create a qgram tokenizer using q=3
qg3_tok = sm.QgramTokenizer(qval=3)

# create a whitespace tokenizer
ws_tok = sm.WhitespaceTokenizer()

#create alphabetic tokenizer
al_tok = sm.AlphabeticTokenizer()

In [9]:
# create a Jaccard similarity measure object
jac = sm.Jaccard()

# create a Levenshtein similarity measure object
lev = sm.Levenshtein()

In [10]:
# get tokens of all cpe and product
toki = ws_tok
cpes = df_cpe['cpe'].to_list()
products = df_product['label'].to_list()
iid = list(range(len(cpes)))

products_token = {}
products_len = {}
cpes_token = {}
cpes_len = {}
for i in range(len(cpes)):
    cpes_token[i] = toki.tokenize(cpes[i])
    cpes_len[i] = len(cpes_token[i])
for i in range(len(products)):
    products_token[i] = toki.tokenize(products[i])
    products_len[i] = len(products_token[i])

In [11]:
#get some matches
matches=[]
t=0.5
for i in iid:
    this_cpe_matches = []
    cpe_tok = cpes_token[i]
    
    #calculate boundaries for size
    cpes_len_h = cpes_len[i]*(1/t)
    cpes_len_l = cpes_len[i]*t
    
    #prune down the products to check based on t
    oids = [x for x in products_len if (products_len[x] >= cpes_len_l) & (products_len[x] <= cpes_len_h)]
    
    for j in oids:
        product_tok = products_token[j]
        jsim = jac.get_sim_score(cpe_tok, product_tok)
        #lsim = lev.get_sim_score(cpe_tok, product_tok)
        
        if jsim >= t:
            #append the potential match
            if products[j] not in this_cpe_matches:
                this_cpe_matches.append(products[j]) 
    
    #append the matches found for this cpe
    matches.append(this_cpe_matches)
    
df_cpe['product_match'] = matches

In [12]:
df_cpe = df_cpe[["cpe", "product_match"]].explode("product_match")
df_cpe=df_cpe.drop_duplicates().dropna()
df_cpe = (
    df_cpe.drop_duplicates()
    .dropna()
    .reset_index(drop=True)
    .rename(columns={'product_match': 'ner'})
)

In [13]:
df_cpe.to_csv('../data/cpe_product_edge.csv')