In [272]:
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                         pipeline)
import pandas as pd
import re
from itertools import product
import numpy as np
import pylcs
import psycopg2 as p2
from tqdm import tqdm
from Levenshtein import ratio
from typing import Dict, Optional, List, Tuple

pd.set_option('display.width', 20000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

dbname = "vulns_scanner"
user = 'postgres'
password = 'postgres'
host = 'localhost'
port = '5432'

# Алгоритм поиска и генерации конфигураций

## Шаг 1. Извлечение именованных сущностей

In [238]:
def get_df_from_bd(q: str,
                   dbname: str = "vulns_scanner",
                   user: str = 'postgres',
                   password: str = 'postgres',
                   host: str = 'localhost',
                   port: str = '5432') -> pd.DataFrame:
    """Выполняет результат переданного запроса к БД.
    """
    conn = p2.connect(dbname=dbname, user=user, password=password, host=host, port=port)
    cur = conn.cursor()
    cur.execute(q)
    colnames = [desc[0] for desc in cur.description]
    tuples = cur.fetchall()
    cur.close()
    df = pd.DataFrame(tuples, columns=colnames)
    return df

In [239]:
path_to_model = "/home/mikhail/Documents/pandan_study/vkr/vulns_scanner/mikhail_code/models/nuner_180525_full_dataset"
final_tokenizer = AutoTokenizer.from_pretrained(path_to_model, use_fast=True, add_prefix_space=True, local_files_only=True)
final_model = AutoModelForTokenClassification.from_pretrained(path_to_model, local_files_only=True)

In [240]:
# тестовый датасет для проверки алгоритма
df_test = pd.read_csv('df_200_not_in_stucco_v3_180525.csv')

In [241]:
def extract_ners(cve_text: str, 
                 tokenizer=final_tokenizer, 
                 model=final_model):
    """Применяет NER модель на тексте и возвращает найденные сущности и их вероятности. 
    """
    token_classifier = pipeline(
        "token-classification", model=final_model, aggregation_strategy="first", tokenizer=final_tokenizer
    )
    result = token_classifier(cve_text)
    vendor = []
    product = []
    version = []
    vendor_probs = []
    product_probs = []
    version_probs = []

    for ner_item in result:
        if ner_item['entity_group'] == 'vendor':
            vendor.append(str.lower(ner_item['word'].strip()))
            vendor_probs.append(str.lower(str(ner_item['score'])))
        elif ner_item['entity_group'] == 'product':
            product.append(str.lower(str(ner_item['word'].strip())))
            product_probs.append(str.lower(str(ner_item['score'])))
        elif ner_item['entity_group'] == 'version':
            version.append(str.lower(str(ner_item['word'].strip())))
            version_probs.append(str.lower(str(ner_item['score'])))
    return {'ners': [vendor, product, version], 'scores': [vendor_probs, product_probs, version_probs]}

In [None]:
# Добавляем найденные сущности и вероятности как колонки в датасет
df_test[['ners_list', 'scores_list']] = df_test['descr'].apply(lambda x: extract_ners(x)).apply(pd.Series)
df_test['vendor_ner'], df_test['product_ner'], df_test['version_ner'] = zip(*df_test['ners_list'])
df_test['vendor_score_ner'], df_test['product_score_ner'], df_test['version_score_ner'] = zip(*df_test['scores_list'])

## Шаг 2. Дедубликация

In [263]:
def deduplicate_using_probs(entities, scores):
    """Выбирает лучшую сущность на основании вероятности
    """
    if not entities or len(entities) == 1:
        return entities, scores
        
    max_idx = scores.index(max(scores))
    return [entities[max_idx]], [scores[max_idx]]

In [267]:
df_test[['dedup_vendor_ner', 'dedup_vendor_score']] = (
                df_test[['vendor_ner', 'vendor_score_ner']]
                .apply(
                    lambda row: deduplicate_using_probs(row['vendor_ner'], row['vendor_score_ner']),
                    axis=1,
                    result_type='expand'
                )
            )

In [268]:
df_test[['dedup_product_ner', 'dedup_product_score']] = (
                df_test.apply(
                    lambda row: deduplicate_using_probs(row['product_ner'], row['product_score_ner']),
                    axis=1,
                    result_type='expand'
                )
            )

## Шаг 3. Обработка сущности "версия"

In [273]:
def extract_version(matched):
        """Возвращает нормализованную версию
        """
        if matched:
            version = matched.group('version')
            # Normalize separators (replace '-' with '.' if needed)
            version = version.replace('-', '.')
            return version
        return None

In [274]:
def classify_version_string(version_str: str) -> Tuple[List[str], str]:
    """Классификация именованной сущности 'версия'. 'Версия' может принадлежать
    одной из 5 групп, в зависимости от слов, которые туда входят.
    """
    version_str = str.lower(version_str)

    # through, including
    group_name = 'through'
    group_words = ['through', 'earlier', '<=', 'prior', 'up to', 'up to, and including', 'up to and including', 'older']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '2.1 through 3.17'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:through|earlier|prior|\<\=|up to)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            pattern = (
                r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
            )
            matched = re.search(pattern, version_str, re.IGNORECASE)
            return [extract_version(matched)], f'{group_name} group'

    # before, not including
    group_name = 'before'
    group_words = ['before', '<']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '4.2.x before 4.2.8'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:before)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            # if simple logic
            pattern = (
                r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
            )
            matched = re.search(pattern, version_str, re.IGNORECASE)
            return [extract_version(matched)], f'{group_name} group'

    # after, including
    group_name = 'after'
    group_words = ['after', '>=']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '4.2.x before 4.2.8'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:older|after|\>\=)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            # if simple logic
            pattern = (
                r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
            )
            matched = re.search(pattern, version_str, re.IGNORECASE)
            return [extract_version(matched)], f'{group_name} group'

    # between
    group_name = 'between'
    group_words = ['between', 'to', ' - ']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '4.2.x before 4.2.8'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:between|to)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            # # if simple logic
            # pattern = (
            #     r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)\s*)'  # Version with digits/x and separators
            # )
            # matched = re.search(pattern, version_str, re.IGNORECASE)
            # return extract_version(matched), f'{group_name} group'



    pattern = (
        r'(?:v|version)?\s*'  # Optional 'v' or 'version'
        r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
    )
    matched = re.search(pattern, version_str, re.IGNORECASE)
    return [extract_version(matched)], 'other'

In [275]:
def parse_version(version_str: str) -> List[str | int]:
    """Извлекаем 3 элемента версии
    """
    components = re.findall(r'\d+|x', version_str, re.IGNORECASE)
    parsed = []
    for c in components:
        if c.lower() == 'x':
            parsed.append('x')
        else:
            parsed.append(int(c))
    return parsed

In [276]:
def generate_versions(versions: list, 
                      group_name: str, 
                      debug: bool = False) -> List[str]:
    """Генерирует версии в сответствие с указанной группой версий.
    """
    if group_name == 'other':
        result = [versions[0]]
        version_other = parse_version(versions[0])
        while len(version_other) != 3:
            if len(version_other) > 3:
                result.append('.'.join([str(x) for x in version_other]))
                version_other.pop()
            elif len(version_other) < 3:
                result.append('.'.join([str(x) for x in version_other]))
                version_other.append(0)
        else:
            result.append('.'.join([str(x) for x in version_other]))
        # print(f'result: {result}')
        # print(f'other versions: {other_versions}')
        # result_merged = result + other_versions
        # print(f'joined: {result_merged}')
        return result
        # return versions
    group_type = group_name.split()[0].lower()

    if len(versions) == 1:
        if group_type == 'before':
            return generate_versions(['0.0.0', versions[0]], 'before multi-match', debug=debug)
        elif group_type == 'through':
            return generate_versions(['0.0.0', versions[0]], 'through multi-match', debug=debug)
        # here access DB and query max version?
        elif group_type == 'after':
            return generate_versions([versions[0], '20.0.0'], 'after multi-match', debug=debug)
        else:
            return []
    elif len(versions) >= 1:
        # for ['3.x', '3.1.1']
        # 3.x
        start = parse_version(versions[0])
        len_original_start = len(start)
        # 3.1.1
        end = parse_version(versions[1])
        len_original_end = len(end)
        # normalize versions
        while len(start) != 3:
            if len(start) > 3:
                start.pop()
            elif len(start) < 3:
                start.append(0)

        while len(end) != 3:
            if len(end) > 3:
                end.pop()
            elif len(end) < 3:
                end.append(0)

        possible_values = []
        if debug:
            print(f'start version: {start}, end version: {end}')
            print(f'len_original_end: {len_original_end}')
        for i in range(3):
            # print(f'possible values: {possible_values}')
            # 3
            start_comp = start[i]
            # 3
            end_comp = end[i]
            if debug:
                print(f'Start component: {start_comp}, End component: {end_comp}')

            if start_comp == 'x':
                # Надо как-то проверять, нужно ли генерировать такик большие числа версий
                if 'before' in group_type:
                    max_val = end_comp - 1 if isinstance(end_comp, int) else 99
                else:
                    max_val = end_comp if isinstance(end_comp, int) else 99
                possible_values.append(list(range(0, max_val + 1)))

                continue
            if isinstance(start_comp, int):
                if isinstance(end_comp, str) and end_comp.lower() == 'x':
                    end_comp = 99  # High maximum for 'x' in end
                if start_comp > end_comp:
                    return []
                if start_comp < end_comp:
                    if 'before' in group_type:
                        current_max = end_comp - 1
                    else:
                        current_max = end_comp
                    # possible_values.append(list(range(start_comp, current_max + 1)))
                    possible_values.append(list(range(start_comp, 10)))

                    # Allow any values for remaining components
                    for j in range(i + 1, 3):
                        possible_values.append(list(range(0, 100)))  # Arbitrary high limit
                    break
                else:
                    possible_values.append([start_comp])
            else:
                # print(f'possible values: {possible_values}')
                possible_values.append([0])


        if debug:
            print(f'possible values: {possible_values}')
        if 'x' not in end and 'x' not in start:
            generated_components = list(product(*possible_values))
            if debug:
                print(f'generated components: {generated_components[:10]}')
            generated_components_to_use = []
            for val in generated_components:
                if not (((val[0] == end[0]
                        and val[1] > end[1]) or
                        (val[0] == end[0]
                        and val[1] == end[1]
                        and val[2] > end[2]) or
                        val[0] > end[0])
                    or ((val[0] == start[0]
                         and val[1] < start[1]) or
                        (val[0] == start[0]
                         and val[1] == start[1]
                         and val[2] < start[2])) or
                        val[0] < start[0]):
                    generated_components_to_use.append(val)
            if debug:
                print(f'generated components to use: {generated_components_to_use[:10], generated_components_to_use[-10:]}')
            versions_list = ['.'.join(map(str, v)) for v in generated_components_to_use]

            # return versions_list
        else:
            generated_components = list(product(*possible_values))
            versions_list = ['.'.join(map(str, v)) for v in generated_components]
        if len_original_end == 2 or len_original_start == 2:
            versions_set = []
            for x in versions_list:
                versions_set.append(x.split('.')[:2])
            versions_set = set(['.'.join(y) for y in versions_set])
            # for x in versions_list:
            versions_list.extend(list(versions_set))
        if debug:
            print(versions_list[:10])
        return versions_list
    else:
        # print('last else')
        return []

сгенерируем версии по вышеуказанным правилам

In [283]:
mask_true_version_in_generated = []
d_vers = {}
cve_2_true_version = {}
cve_2_all_versions = {}
for i, row in df_test.iterrows():
    possible_versions = []
    for version_ner in row['version_ner']:
        preprocessed_ner = classify_version_string(version_ner)
        if preprocessed_ner[0][0] is None:
            continue
        generated_versions = generate_versions(*preprocessed_ner)
        possible_versions.extend(generated_versions)
    mask_true_version_in_generated.append(1 if row['version'] in possible_versions else 0)
    cve_2_true_version[row['cve_id']] = row['version']
    cve_2_all_versions[row['cve_id']] = possible_versions

In [None]:
df_test['true_version_in_predicted'] = mask_true_version_in_generated

## Промежуточная проверка качества

In [284]:
# доля правильно сгенерированных версий
sum(mask_true_version_in_generated) / 200

0.565

In [286]:
df_test

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor_ner,dedup_vendor_score,dedup_product_ner,dedup_product_score,true_version_in_predicted
0,CVE-2021-34085,628902,glensawyer,mp3gain,1.3.4,Read access violation in the III_dequantize_sample function in mpglibDBL/layer3.c in mp3gain through 1.5.2-r2 allows remote attackers to cause a denial of service (application crash) or possibly h...,cpe:2.3:a:glensawyer:mp3gain:1.3.4:beta:*:*:*:*:*:*,0,1,"[[], [mp3gain], [through 1.5.2-r2]]","[[], [0.99991965], [0.99996096]]",[],[mp3gain],[through 1.5.2-r2],[],[0.99991965],[0.99996096],[],[],[mp3gain],[0.99991965],1
1,CVE-2014-7221,722762,teamspeak,teamspeak3,3.0.7.1,TeamSpeak Client 3.0.14 and earlier allows remote authenticated users to cause a denial of service (buffer overflow and application crash) by connecting to a channel with a different client instan...,cpe:2.3:a:teamspeak:teamspeak3:3.0.7.1:*:*:*:client:*:*:*,1,0,"[[], [teamspeak], [3.0.14 and earlier]]","[[], [0.9999192], [0.9999383]]",[],[teamspeak],[3.0.14 and earlier],[],[0.9999192],[0.9999383],[],[],[teamspeak],[0.9999192],0
2,CVE-2018-7279,541558,alienvault,open_source_security_information_management,5.3,A remote code execution issue was discovered in AlienVault USM and OSSIM before 5.5.1.,cpe:2.3:a:alienvault:open_source_security_information_management:5.3:*:*:*:*:*:*:*,1,0,"[[alienvault], [usm], [before 5.5.1.]]","[[0.9999037], [0.9814162], [0.99995714]]",[alienvault],[usm],[before 5.5.1.],[0.9999037],[0.9814162],[0.99995714],[alienvault],[0.9999037],[usm],[0.9814162],0
3,CVE-2020-24743,472694,zohocorp,manageengine_applications_manager,14.5,"An issue was found in /showReports.do Zoho ManageEngine Applications Manager up to 14550, allows attackers to gain escalated privileges via the resourceid parameter.",cpe:2.3:a:zohocorp:manageengine_applications_manager:14.5:build14540:*:*:*:*:*:*,0,0,"[[manageengine], [applications manager], [up to 14550,]]","[[0.95109195], [0.888083], [0.9999476]]",[manageengine],[applications manager],"[up to 14550,]",[0.95109195],[0.888083],[0.9999476],[manageengine],[0.95109195],[applications manager],[0.888083],0
4,CVE-2020-24786,472744,zohocorp,manageengine_o365_manager_plus,4.3,"An issue was discovered in Zoho ManageEngine Exchange Reporter Plus before build number 5510, AD360 before build number 4228, ADSelfService Plus before build number 5817, DataSecurity Plus before ...",cpe:2.3:a:zohocorp:manageengine_o365_manager_plus:4.3:4304:*:*:*:*:*:*,0,0,"[[zoho], [manageengine exchange reporter plus, adselfservice plus, datasecurity plus, recovermanager plus, eventlog analyzer, adaudit plus, o365 manager plus, cloud security plus, admanager plus, ...","[[0.98405576], [0.9567048, 0.88903725, 0.9884461, 0.9938674, 0.98616344, 0.9817395, 0.962745, 0.9839506, 0.94795024, 0.59385043, 0.99802125], []]",[zoho],"[manageengine exchange reporter plus, adselfservice plus, datasecurity plus, recovermanager plus, eventlog analyzer, adaudit plus, o365 manager plus, cloud security plus, admanager plus, log360, j...",[],[0.98405576],"[0.9567048, 0.88903725, 0.9884461, 0.9938674, 0.98616344, 0.9817395, 0.962745, 0.9839506, 0.94795024, 0.59385043, 0.99802125]",[],[zoho],[0.98405576],[java servlet],[0.99802125],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,CVE-2022-3550,169910,x.org,x_server,1.13.1,A vulnerability classified as critical was found in X.org Server. Affected by this vulnerability is the function _GetCountedString of the file xkb/xkb.c. The manipulation leads to buffer overflow....,cpe:2.3:a:x.org:x_server:1.13.1:*:*:*:*:*:*:*,1,0,"[[], [x.org server.], []]","[[], [0.9496948], []]",[],[x.org server.],[],[],[0.9496948],[],[],[],[x.org server.],[0.9496948],0
196,CVE-2020-11673,451620,total-soft,responsive_poll,1.2.2,"An issue was discovered in the Responsive Poll through 1.3.4 for Wordpress. It allows an unauthenticated user to manipulate polls, e.g., delete, clone, or view a hidden poll. This is due to the us...",cpe:2.3:a:total-soft:responsive_poll:1.2.2:*:*:*:*:wordpress:*:*,1,0,"[[], [responsive poll], [through 1.3.4]]","[[], [0.9997986], [0.9999593]]",[],[responsive poll],[through 1.3.4],[],[0.9997986],[0.9999593],[],[],[responsive poll],[0.9997986],1
197,CVE-2022-1253,80875,struktur,libde265,1.0.3,Heap-based Buffer Overflow in GitHub repository strukturag/libde265 prior to and including 1.0.8. The fix is established in commit 8e89fe0e175d2870c39486fdd09250b230ec10b8 but does not yet belong ...,cpe:2.3:a:struktur:libde265:1.0.3:*:*:*:*:*:*:*,1,1,"[[], [], [prior to and including 1.0.8.]]","[[], [], [0.99994123]]",[],[],[prior to and including 1.0.8.],[],[],[0.99994123],[],[],[],[],1
198,CVE-2019-20903,705053,atlassian,editor-core,98.2.2,The hyperlinks functionality in atlaskit/editor-core in before version 113.1.5 allows remote attackers to inject arbitrary HTML or JavaScript via a Cross-Site Scripting (XSS) vulnerability in link...,cpe:2.3:a:atlassian:editor-core:98.2.2:*:*:*:*:node.js:*:*,0,1,"[[], [], [before version 113.1.5]]","[[], [], [0.99994594]]",[],[],[before version 113.1.5],[],[],[0.99994594],[],[],[],[],0


In [292]:
count_vendor = 0
count_product = 0
for i, row in df_test.iterrows():

    if row['dedup_vendor_ner']:
        if (row['vendor'] == row['dedup_vendor_ner'][0]
            or row['vendor'] == '_'.join(row['dedup_vendor_ner'][0].split())):
            count_vendor += 1
    if row['dedup_product_ner']:
        if (row['product'] == row['dedup_product_ner'][0]
            or row['product'] == '_'.join(row['dedup_product_ner'][0].split())):
            count_product += 1    
        else:
            print(f"Предсказанный продукт: {row['dedup_product_ner']}.\nИстинный продукт: {row['product']}", end='\n\n')
            print(f"Предсказанный вендор: {row['dedup_vendor_ner']}.\nИстинный вендор: {row['vendor']}", end='\n\n****************\n\n')

Предсказанный продукт: ['teamspeak'].
Истинный продукт: teamspeak3

Предсказанный вендор: [].
Истинный вендор: teamspeak

****************

Предсказанный продукт: ['usm'].
Истинный продукт: open_source_security_information_management

Предсказанный вендор: ['alienvault'].
Истинный вендор: alienvault

****************

Предсказанный продукт: ['applications manager'].
Истинный продукт: manageengine_applications_manager

Предсказанный вендор: ['manageengine'].
Истинный вендор: zohocorp

****************

Предсказанный продукт: ['java servlet'].
Истинный продукт: manageengine_o365_manager_plus

Предсказанный вендор: ['zoho'].
Истинный вендор: zohocorp

****************

Предсказанный продукт: ['oauth'].
Истинный продукт: aspnet

Предсказанный вендор: ['auth0'].
Истинный вендор: auth0

****************

Предсказанный продукт: ['tagboard'].
Истинный продукт: tagmin_control_center

Предсказанный вендор: [].
Истинный вендор: paul_schudar

****************

Предсказанный продукт: ['wordpress'].

Число правильно извлеченных названий для вендора и продукта.

In [293]:
print(count_vendor, count_product)

42 93


In [None]:
product_ner = df_test['dedup_product_ner'].astype(str).apply(lambda x: x.lstrip('[\'').rstrip(']\'')).values
vendor_ner = df_test['dedup_vendor_ner'].astype(str).apply(lambda x: x.lstrip('[\'').rstrip(']\'')).values

## Шаг 4. Поиск кандидатов для извлеченной сущности "продукт" в БД

С использование Левенштейна, НОП и кастомного алгоритма.

In [356]:
unique_products = get_df_from_bd('select distinct product from cpes limit 10000000;')
unique_products = unique_products['product'].unique()

In [325]:
def get_ratio(ner_name: str, 
              unique_entities: np.ndarray) -> Tuple[str, float]:
    """Поиск в списке наиболее близкого слова к извлеченной сущности
    на основе расстояния Левенштейна. 
    """
    if ner_name in unique_entities:
        return ner_name, 1
    ratio_scores = []
    for ent in unique_entities:
        ratio_scores.append(ratio(ner_name, ent))
    ratio_scores = np.array(ratio_scores)
    
    if len(np.argwhere(ratio_scores == np.max(ratio_scores))) <= 1:
        return unique_entities[np.argmax(ratio_scores)], np.max(ratio_scores)
    else:
        candidates = unique_entities[np.argwhere(ratio_scores == np.max(ratio_scores))]
        len_of_query = len(ner_name)
        d = -1
        fit_cand = ''
        for cand in candidates:
            cand = cand[0]
            diff = abs(len_of_query - len(cand))
            # print(cand, d, diff)
            if d == -1:
                d = diff
                fit_cand = cand
            elif d > diff:
                d = diff
                fit_cand = cand
            else:
                continue
        return fit_cand, np.max(ratio_scores)

In [324]:
def get_lcs(ner_name: str, 
            unique_entities: np.ndarray) -> Tuple[str, float]:
    """Поиск в списке наиболее близкого слова к извлеченной сущности
    на основе метода наибольшей общей подпоследовательности. 
    """
    if ner_name in unique_entities:
        return ner_name, 1
    lcs_scores = np.array(pylcs.lcs_of_list(ner_name, unique_entities))
    if len(np.argwhere(lcs_scores == np.max(lcs_scores))) <= 1:
        return unique_entities[np.argmax(lcs_scores)], np.max(lcs_scores)
    else:
        candidates = unique_entities[np.argwhere(lcs_scores == np.max(lcs_scores))]
        len_of_query = len(ner_name)
        d = -1
        fit_cand = ''
        for cand in candidates:
            cand = cand[0]
            diff = abs(len_of_query - len(cand))
            # print(cand, d, diff)
            if d == -1:
                d = diff
                fit_cand = cand
            elif d > diff:
                d = diff
                fit_cand = cand
            else:
                continue
        return fit_cand, np.max(lcs_scores)


In [305]:
# # Проверка работы
# ratio_scores = []
# for ent in unique_products:
#     ratio_scores.append(ratio('ox app suite', ent))
# ratio_scores = np.array(ratio_scores)
# print(sorted(ratio_scores, reverse=True)[:5])
# n = list(np.argwhere(ratio_scores >= 0.7).reshape(1, -1))
# unique_products[n]

In [326]:
def retrieve_top_k(ner_name: str, 
                   unique_entities: np.ndarray, 
                   top_k: int = 5):
    """Находит топ k слов из предложенного списка
    на основе метода наибольшей общей подпоследовательности. 
    """
    lcs_scores = np.array(pylcs.lcs_of_list(ner_name, unique_entities))
    l = [(name, round(score/len(name), 4)) for name, score in zip(unique_entities, lcs_scores)]
    ll = sorted(l, key=lambda x: x[1], reverse=True)[:top_k]
    return list(zip(*ll))[0]

Сравнение качества данных алгоритмов

1. Левенштейн

In [357]:
matched_db_product_lev = []
matched_db_vendor_lev = []
score_lev = []
for pr in tqdm(product_ner):
    if pr:
        # print(f'Product NER: {pr}')
        (prod, score)= get_ratio(pr, unique_products)
        # print(f'Found product in DB: {prod}')
        # print(f'score: {score}', end='\n\n')
        df_found_prod = get_df_from_bd(f"select * from cpes where product = '{prod}' limit 1;")
        matched_db_product_lev.append(df_found_prod['product'].values[0])
        matched_db_vendor_lev.append(df_found_prod['vendor'].values[0])
        score_lev.append(score)
        
    else:
        matched_db_product_lev.append('')
        matched_db_vendor_lev.append('')
        score_lev.append(0)

100%|██████████| 200/200 [00:08<00:00, 23.51it/s]


2. НОП

In [360]:
matched_db_product_lcs = []
matched_db_vendor_lcs = []
for pr in tqdm(product_ner):
    if pr:
        # print(f'Product NER: {pr}')
        (prod, score)= get_lcs(pr, unique_products)
        # print(f'Found product in DB: {prod}')
        # print(f'score: {score}', end='\n\n')
        df_found_prod = get_df_from_bd(f"select * from cpes where product = '{prod}' limit 1;")
        matched_db_vendor_lcs.append(df_found_prod['vendor'].values[0])
        matched_db_product_lcs.append(df_found_prod['product'].values[0])
    else:
        matched_db_product_lcs.append('')
        matched_db_vendor_lcs.append('')

100%|██████████| 200/200 [00:15<00:00, 12.73it/s]


3. Кастомный

In [361]:
matched_db_product_adv = []
matched_db_vendor_adv = []
score_lev = []
for pr in tqdm(product_ner):
    if pr:
        # print(f'Product NER: {pr}')
        (prod, score)= get_lcs(pr, unique_products)
        # print(f'Found product in DB: {prod}')
        df_all = get_df_from_bd(f"select distinct vendor, product from cpes where vendor in (select vendor from cpes where product = '{prod}')")
        found_candidates = retrieve_top_k(prod, df_all['product'].tolist(), top_k=3)
        matched_db_product_adv.append(found_candidates)
        matched_db_vendor_adv.append(df_all['vendor'].values[0])
        score_lev.append(score)
        
    else:
        matched_db_product_adv.append([''])
        matched_db_vendor_adv.append('')
        score_lev.append(0)

100%|██████████| 200/200 [00:24<00:00,  8.18it/s]


In [364]:
df_test['matched_db_product_lev'] = matched_db_product_lev
df_test['matched_db_vendor_lev'] = matched_db_vendor_lev
df_test['score_lev'] = score_lev

df_test['matched_db_product_lcs'] = matched_db_product_lcs
df_test['matched_db_vendor_lcs'] = matched_db_vendor_lcs

df_test['matched_db_vendor_adv'] = matched_db_vendor_adv
df_test['matched_db_vendor_adv'] = df_test['matched_db_vendor_adv'].astype(str)
df_test['matched_db_product_adv'] = matched_db_product_adv
df_test['matched_db_product_adv'] = df_test['matched_db_product_adv'].astype(str)

Оценка точности

In [367]:
print(df_test[(df_test['matched_db_product_lev'] == df_test['product'])].shape[0] / 200)
print(df_test[(df_test['matched_db_product_lcs'] == df_test['product'])].shape[0] / 200)
print(df_test.apply(lambda x: x['product'] in x['matched_db_product_adv'], axis=1).sum() / 200)

0.545
0.52
0.58


## Итоговая оценка пайплайна

In [144]:
df_test[(df_test['matched_db_vendor'] == df_test['vendor']) &
        (df_test['matched_db_product_lev'] != df_test['product'])].sort_values('cve_id')[['cve_id', 'vendor', 'matched_db_vendor_lev', 'matched_db_vendor', 'product', 'product_ner', 'matched_db_product_lev', 'matched_db_product', 'version', 'score_lev']]

Unnamed: 0,cve_id,vendor,matched_db_vendor_lev,matched_db_vendor,product,product_ner,matched_db_product_lev,matched_db_product,version,score_lev
1,CVE-2014-7221,teamspeak,teamspeak,teamspeak,teamspeak3,[teamspeak],teamspeak,teamspeak,3.0.7.1,1.0
159,CVE-2017-6753,cisco,cisco,cisco,webex_meetings_server_2.6_mr2_patch,"[webex, webex meetings server,, webex centers, center,, webex meetings, webex, webex, webex]",webex_meetings,webex_meetings,1,0.928571
162,CVE-2018-1000417,jenkins,jenkins,jenkins,email_extension_template,"[jenkins email, template plugin]",jenkins,jenkins-cloudformation-plugin,0.2,0.7
124,CVE-2018-1000831,k9mail,kde,k9mail,k-9_mail,[k9mail],kmail,k-9_mail,2.506,0.909091
161,CVE-2020-15003,open-xchange,open-xchange,open-xchange,open-xchange_appsuite,[ox app suite],ox_app_suite,ox_app_suite,7.10.3,0.833333
123,CVE-2020-16116,kde,entropymine,kde,ark,[kde ark],deark,kde_frameworks,4.7.0,0.833333
82,CVE-2020-2090,jenkins,jenkins,jenkins,amazon_ec2,[ec2],ec2,ec2,1.9,1.0
189,CVE-2020-4038,prisma,prisma,prisma,graphql-playground-html,"[graphql-playground-html, graphql-playground-middleware-express, graphql-playground-middleware-koa, graphql-playground-middleware-lambda, graphql-playground-middleware-hapi]",graphql-playground-middleware-hapi,graphql-playground-middleware-hapi,1.4.3,1.0
194,CVE-2021-24472,qantumthemes,qantumthemes,qantumthemes,onair2,"[wordpress, kentharadio, wordpress]",kentharadio,kentharadio,3.9.3,1.0
95,CVE-2021-24484,ays-pro,content_rating_project,ays-pro,secure_copy_content_protection_and_content_locking,"[content locking, wordpress]",content_rating,secure_copy_content_protection_and_content_locking,2.4.6,0.689655


* Для CVE-2007-6487 продукта webgui есть вендор plainblack и plain_black в БД

* Для CVE-2004-0095 есть продукт epolicy_orchestrator_agent и epolicy_orchestrator в БД

* Для CVE-2007-3381 продукт gdm -- абревиатура, не найти такой продукт в БД

* CVE-2013-6440 есть продукт opensaml и opensaml_java в БД

* CVE-2014-7221 есть продукт teamspeak и teamspeak3 в БД

* CVE-2020-15003 дубли продукта в БД open-xchange_appsuite и ox_app_suite, оба версии 7.10.5

* CVE-2020-2090 дубли продукта в БД ec и amazon_ec2, оба версии 1.8

Много ошибок связано с wordpress