In [77]:
from collections import Counter
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                         pipeline)
import pandas as pd
import re
from itertools import product
import numpy as np
import pylcs
import psycopg2 as p2
from tqdm import tqdm
from Levenshtein import ratio
from typing import Dict, Optional, List, Tuple
import torch

pd.set_option('display.width', 20000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

dbname = "vulns_scanner"
user = 'postgres'
password = 'postgres'
host = 'localhost'
port = '5432'

# Алгоритм поиска и генерации конфигураций

## Шаг 1. Извлечение именованных сущностей

In [2]:
def get_df_from_bd(q: str,
                   dbname: str = "vulns_scanner",
                   user: str = 'postgres',
                   password: str = 'postgres',
                   host: str = 'localhost',
                   port: str = '5432') -> pd.DataFrame:
    """Выполняет результат переданного запроса к БД.
    """
    conn = p2.connect(dbname=dbname, user=user, password=password, host=host, port=port)
    cur = conn.cursor()
    cur.execute(q)
    colnames = [desc[0] for desc in cur.description]
    tuples = cur.fetchall()
    cur.close()
    df = pd.DataFrame(tuples, columns=colnames)
    return df

In [3]:
path_to_model = "/home/mikhail/Documents/pandan_study/vkr/vulns_scanner/mikhail_code/models/nuner_180525_full_dataset"
final_tokenizer = AutoTokenizer.from_pretrained(path_to_model, use_fast=True, add_prefix_space=True, local_files_only=True)
final_model = AutoModelForTokenClassification.from_pretrained(path_to_model, local_files_only=True)

In [4]:
# тестовый датасет для проверки алгоритма
df_test = pd.read_csv('df_200_not_in_stucco_v3_180525.csv')

In [5]:
def extract_ners(cve_text: str, 
                 tokenizer=final_tokenizer, 
                 model=final_model):
    """Применяет NER модель на тексте и возвращает найденные сущности и их вероятности. 
    """
    token_classifier = pipeline(
        "token-classification", model=final_model, aggregation_strategy="first", tokenizer=final_tokenizer
    )
    result = token_classifier(cve_text)
    vendor = []
    product = []
    version = []
    vendor_probs = []
    product_probs = []
    version_probs = []

    for ner_item in result:
        if ner_item['entity_group'] == 'vendor':
            vendor.append(str.lower(ner_item['word'].strip()))
            vendor_probs.append(str.lower(str(ner_item['score'])))
        elif ner_item['entity_group'] == 'product':
            product.append(str.lower(str(ner_item['word'].strip())))
            product_probs.append(str.lower(str(ner_item['score'])))
        elif ner_item['entity_group'] == 'version':
            version.append(str.lower(str(ner_item['word'].strip())))
            version_probs.append(str.lower(str(ner_item['score'])))
    return {'ners': [vendor, product, version], 'scores': [vendor_probs, product_probs, version_probs]}

In [6]:
# Добавляем найденные сущности и вероятности как колонки в датасет
df_test[['ners_list', 'scores_list']] = df_test['descr'].apply(lambda x: extract_ners(x)).apply(pd.Series)
df_test['vendor_ner'], df_test['product_ner'], df_test['version_ner'] = zip(*df_test['ners_list'])
df_test['vendor_score_ner'], df_test['product_score_ner'], df_test['version_score_ner'] = zip(*df_test['scores_list'])

Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set

## Шаг 2. Дедубликация

In [7]:
def deduplicate_using_probs(entities, scores):
    """Выбирает лучшую сущность на основании вероятности
    """
    if not entities or len(entities) == 1:
        return entities, scores
        
    max_idx = scores.index(max(scores))
    return [entities[max_idx]], [scores[max_idx]]

In [8]:
df_test[['dedup_vendor_ner', 'dedup_vendor_score']] = (
                df_test[['vendor_ner', 'vendor_score_ner']]
                .apply(
                    lambda row: deduplicate_using_probs(row['vendor_ner'], row['vendor_score_ner']),
                    axis=1,
                    result_type='expand'
                )
            )

In [9]:
df_test[['dedup_product_ner', 'dedup_product_score']] = (
                df_test.apply(
                    lambda row: deduplicate_using_probs(row['product_ner'], row['product_score_ner']),
                    axis=1,
                    result_type='expand'
                )
            )

## Шаг 3. Обработка сущности "версия"

In [10]:
def extract_version(matched):
        """Возвращает нормализованную версию
        """
        if matched:
            version = matched.group('version')
            # Normalize separators (replace '-' with '.' if needed)
            version = version.replace('-', '.')
            return version
        return None

In [11]:
def classify_version_string(version_str: str) -> Tuple[List[str], str]:
    """Классификация именованной сущности 'версия'. 'Версия' может принадлежать
    одной из 5 групп, в зависимости от слов, которые туда входят.
    """
    version_str = str.lower(version_str)

    # through, including
    group_name = 'through'
    group_words = ['through', 'earlier', '<=', 'prior', 'up to', 'up to, and including', 'up to and including', 'older']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '2.1 through 3.17'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:through|earlier|prior|\<\=|up to)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            pattern = (
                r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
            )
            matched = re.search(pattern, version_str, re.IGNORECASE)
            return [extract_version(matched)], f'{group_name} group'

    # before, not including
    group_name = 'before'
    group_words = ['before', '<']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '4.2.x before 4.2.8'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:before)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            # if simple logic
            pattern = (
                r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
            )
            matched = re.search(pattern, version_str, re.IGNORECASE)
            return [extract_version(matched)], f'{group_name} group'

    # after, including
    group_name = 'after'
    group_words = ['after', '>=']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '4.2.x before 4.2.8'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:older|after|\>\=)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            # if simple logic
            pattern = (
                r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
            )
            matched = re.search(pattern, version_str, re.IGNORECASE)
            return [extract_version(matched)], f'{group_name} group'

    # between
    group_name = 'between'
    group_words = ['between', 'to', ' - ']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '4.2.x before 4.2.8'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:between|to)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            # # if simple logic
            # pattern = (
            #     r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)\s*)'  # Version with digits/x and separators
            # )
            # matched = re.search(pattern, version_str, re.IGNORECASE)
            # return extract_version(matched), f'{group_name} group'



    pattern = (
        r'(?:v|version)?\s*'  # Optional 'v' or 'version'
        r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
    )
    matched = re.search(pattern, version_str, re.IGNORECASE)
    return [extract_version(matched)], 'other'

In [12]:
def parse_version(version_str: str) -> List[str | int]:
    """Извлекаем 3 элемента версии
    """
    components = re.findall(r'\d+|x', version_str, re.IGNORECASE)
    parsed = []
    for c in components:
        if c.lower() == 'x':
            parsed.append('x')
        else:
            parsed.append(int(c))
    return parsed

In [13]:
def generate_versions(versions: list, 
                      group_name: str, 
                      debug: bool = False) -> List[str]:
    """Генерирует версии в сответствие с указанной группой версий.
    """
    if group_name == 'other':
        result = [versions[0]]
        version_other = parse_version(versions[0])
        while len(version_other) != 3:
            if len(version_other) > 3:
                result.append('.'.join([str(x) for x in version_other]))
                version_other.pop()
            elif len(version_other) < 3:
                result.append('.'.join([str(x) for x in version_other]))
                version_other.append(0)
        else:
            result.append('.'.join([str(x) for x in version_other]))
        # print(f'result: {result}')
        # print(f'other versions: {other_versions}')
        # result_merged = result + other_versions
        # print(f'joined: {result_merged}')
        return result
        # return versions
    group_type = group_name.split()[0].lower()

    if len(versions) == 1:
        if group_type == 'before':
            return generate_versions(['0.0.0', versions[0]], 'before multi-match', debug=debug)
        elif group_type == 'through':
            return generate_versions(['0.0.0', versions[0]], 'through multi-match', debug=debug)
        # here access DB and query max version?
        elif group_type == 'after':
            return generate_versions([versions[0], '20.0.0'], 'after multi-match', debug=debug)
        else:
            return []
    elif len(versions) >= 1:
        # for ['3.x', '3.1.1']
        # 3.x
        start = parse_version(versions[0])
        len_original_start = len(start)
        # 3.1.1
        end = parse_version(versions[1])
        len_original_end = len(end)
        # normalize versions
        while len(start) != 3:
            if len(start) > 3:
                start.pop()
            elif len(start) < 3:
                start.append(0)

        while len(end) != 3:
            if len(end) > 3:
                end.pop()
            elif len(end) < 3:
                end.append(0)

        possible_values = []
        if debug:
            print(f'start version: {start}, end version: {end}')
            print(f'len_original_end: {len_original_end}')
        for i in range(3):
            # print(f'possible values: {possible_values}')
            # 3
            start_comp = start[i]
            # 3
            end_comp = end[i]
            if debug:
                print(f'Start component: {start_comp}, End component: {end_comp}')

            if start_comp == 'x':
                # Надо как-то проверять, нужно ли генерировать такик большие числа версий
                if 'before' in group_type:
                    max_val = end_comp - 1 if isinstance(end_comp, int) else 99
                else:
                    max_val = end_comp if isinstance(end_comp, int) else 99
                possible_values.append(list(range(0, max_val + 1)))

                continue
            if isinstance(start_comp, int):
                if isinstance(end_comp, str) and end_comp.lower() == 'x':
                    end_comp = 99  # High maximum for 'x' in end
                if start_comp > end_comp:
                    return []
                if start_comp < end_comp:
                    if 'before' in group_type:
                        current_max = end_comp - 1
                    else:
                        current_max = end_comp
                    # possible_values.append(list(range(start_comp, current_max + 1)))
                    possible_values.append(list(range(start_comp, 10)))

                    # Allow any values for remaining components
                    for j in range(i + 1, 3):
                        possible_values.append(list(range(0, 100)))  # Arbitrary high limit
                    break
                else:
                    possible_values.append([start_comp])
            else:
                # print(f'possible values: {possible_values}')
                possible_values.append([0])


        if debug:
            print(f'possible values: {possible_values}')
        if 'x' not in end and 'x' not in start:
            generated_components = list(product(*possible_values))
            if debug:
                print(f'generated components: {generated_components[:10]}')
            generated_components_to_use = []
            for val in generated_components:
                if not (((val[0] == end[0]
                        and val[1] > end[1]) or
                        (val[0] == end[0]
                        and val[1] == end[1]
                        and val[2] > end[2]) or
                        val[0] > end[0])
                    or ((val[0] == start[0]
                         and val[1] < start[1]) or
                        (val[0] == start[0]
                         and val[1] == start[1]
                         and val[2] < start[2])) or
                        val[0] < start[0]):
                    generated_components_to_use.append(val)
            if debug:
                print(f'generated components to use: {generated_components_to_use[:10], generated_components_to_use[-10:]}')
            versions_list = ['.'.join(map(str, v)) for v in generated_components_to_use]

            # return versions_list
        else:
            generated_components = list(product(*possible_values))
            versions_list = ['.'.join(map(str, v)) for v in generated_components]
        if len_original_end == 2 or len_original_start == 2:
            versions_set = []
            for x in versions_list:
                versions_set.append(x.split('.')[:2])
            versions_set = set(['.'.join(y) for y in versions_set])
            # for x in versions_list:
            versions_list.extend(list(versions_set))
        if debug:
            print(versions_list[:10])
        return versions_list
    else:
        # print('last else')
        return []

сгенерируем версии по вышеуказанным правилам

In [14]:
mask_true_version_in_generated = []
d_vers = {}
cve_2_true_version = {}
cve_2_all_versions = {}
for i, row in df_test.iterrows():
    possible_versions = []
    for version_ner in row['version_ner']:
        preprocessed_ner = classify_version_string(version_ner)
        if preprocessed_ner[0][0] is None:
            continue
        generated_versions = generate_versions(*preprocessed_ner)
        possible_versions.extend(generated_versions)
    mask_true_version_in_generated.append(1 if row['version'] in possible_versions else 0)
    cve_2_true_version[row['cve_id']] = row['version']
    cve_2_all_versions[row['cve_id']] = possible_versions

In [15]:
df_test['true_version_in_predicted'] = mask_true_version_in_generated

## Промежуточная проверка качества

In [16]:
# доля правильно сгенерированных версий
sum(mask_true_version_in_generated) / 200

0.565

In [17]:
df_test

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor_ner,dedup_vendor_score,dedup_product_ner,dedup_product_score,true_version_in_predicted
0,CVE-2021-34085,628902,glensawyer,mp3gain,1.3.4,Read access violation in the III_dequantize_sample function in mpglibDBL/layer3.c in mp3gain through 1.5.2-r2 allows remote attackers to cause a denial of service (application crash) or possibly h...,cpe:2.3:a:glensawyer:mp3gain:1.3.4:beta:*:*:*:*:*:*,0,1,"[[], [mp3gain], [through 1.5.2-r2]]","[[], [0.99991965], [0.99996096]]",[],[mp3gain],[through 1.5.2-r2],[],[0.99991965],[0.99996096],[],[],[mp3gain],[0.99991965],1
1,CVE-2014-7221,722762,teamspeak,teamspeak3,3.0.7.1,TeamSpeak Client 3.0.14 and earlier allows remote authenticated users to cause a denial of service (buffer overflow and application crash) by connecting to a channel with a different client instan...,cpe:2.3:a:teamspeak:teamspeak3:3.0.7.1:*:*:*:client:*:*:*,1,0,"[[], [teamspeak], [3.0.14 and earlier]]","[[], [0.9999192], [0.9999383]]",[],[teamspeak],[3.0.14 and earlier],[],[0.9999192],[0.9999383],[],[],[teamspeak],[0.9999192],0
2,CVE-2018-7279,541558,alienvault,open_source_security_information_management,5.3,A remote code execution issue was discovered in AlienVault USM and OSSIM before 5.5.1.,cpe:2.3:a:alienvault:open_source_security_information_management:5.3:*:*:*:*:*:*:*,1,0,"[[alienvault], [usm], [before 5.5.1.]]","[[0.9999037], [0.9814162], [0.99995714]]",[alienvault],[usm],[before 5.5.1.],[0.9999037],[0.9814162],[0.99995714],[alienvault],[0.9999037],[usm],[0.9814162],0
3,CVE-2020-24743,472694,zohocorp,manageengine_applications_manager,14.5,"An issue was found in /showReports.do Zoho ManageEngine Applications Manager up to 14550, allows attackers to gain escalated privileges via the resourceid parameter.",cpe:2.3:a:zohocorp:manageengine_applications_manager:14.5:build14540:*:*:*:*:*:*,0,0,"[[manageengine], [applications manager], [up to 14550,]]","[[0.95109195], [0.888083], [0.9999476]]",[manageengine],[applications manager],"[up to 14550,]",[0.95109195],[0.888083],[0.9999476],[manageengine],[0.95109195],[applications manager],[0.888083],0
4,CVE-2020-24786,472744,zohocorp,manageengine_o365_manager_plus,4.3,"An issue was discovered in Zoho ManageEngine Exchange Reporter Plus before build number 5510, AD360 before build number 4228, ADSelfService Plus before build number 5817, DataSecurity Plus before ...",cpe:2.3:a:zohocorp:manageengine_o365_manager_plus:4.3:4304:*:*:*:*:*:*,0,0,"[[zoho], [manageengine exchange reporter plus, adselfservice plus, datasecurity plus, recovermanager plus, eventlog analyzer, adaudit plus, o365 manager plus, cloud security plus, admanager plus, ...","[[0.98405576], [0.9567048, 0.88903725, 0.9884461, 0.9938674, 0.98616344, 0.9817395, 0.962745, 0.9839506, 0.94795024, 0.59385043, 0.99802125], []]",[zoho],"[manageengine exchange reporter plus, adselfservice plus, datasecurity plus, recovermanager plus, eventlog analyzer, adaudit plus, o365 manager plus, cloud security plus, admanager plus, log360, j...",[],[0.98405576],"[0.9567048, 0.88903725, 0.9884461, 0.9938674, 0.98616344, 0.9817395, 0.962745, 0.9839506, 0.94795024, 0.59385043, 0.99802125]",[],[zoho],[0.98405576],[java servlet],[0.99802125],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,CVE-2022-3550,169910,x.org,x_server,1.13.1,A vulnerability classified as critical was found in X.org Server. Affected by this vulnerability is the function _GetCountedString of the file xkb/xkb.c. The manipulation leads to buffer overflow....,cpe:2.3:a:x.org:x_server:1.13.1:*:*:*:*:*:*:*,1,0,"[[], [x.org server.], []]","[[], [0.9496948], []]",[],[x.org server.],[],[],[0.9496948],[],[],[],[x.org server.],[0.9496948],0
196,CVE-2020-11673,451620,total-soft,responsive_poll,1.2.2,"An issue was discovered in the Responsive Poll through 1.3.4 for Wordpress. It allows an unauthenticated user to manipulate polls, e.g., delete, clone, or view a hidden poll. This is due to the us...",cpe:2.3:a:total-soft:responsive_poll:1.2.2:*:*:*:*:wordpress:*:*,1,0,"[[], [responsive poll], [through 1.3.4]]","[[], [0.9997986], [0.9999593]]",[],[responsive poll],[through 1.3.4],[],[0.9997986],[0.9999593],[],[],[responsive poll],[0.9997986],1
197,CVE-2022-1253,80875,struktur,libde265,1.0.3,Heap-based Buffer Overflow in GitHub repository strukturag/libde265 prior to and including 1.0.8. The fix is established in commit 8e89fe0e175d2870c39486fdd09250b230ec10b8 but does not yet belong ...,cpe:2.3:a:struktur:libde265:1.0.3:*:*:*:*:*:*:*,1,1,"[[], [], [prior to and including 1.0.8.]]","[[], [], [0.99994123]]",[],[],[prior to and including 1.0.8.],[],[],[0.99994123],[],[],[],[],1
198,CVE-2019-20903,705053,atlassian,editor-core,98.2.2,The hyperlinks functionality in atlaskit/editor-core in before version 113.1.5 allows remote attackers to inject arbitrary HTML or JavaScript via a Cross-Site Scripting (XSS) vulnerability in link...,cpe:2.3:a:atlassian:editor-core:98.2.2:*:*:*:*:node.js:*:*,0,1,"[[], [], [before version 113.1.5]]","[[], [], [0.99994594]]",[],[],[before version 113.1.5],[],[],[0.99994594],[],[],[],[],0


In [18]:
count_vendor = 0
count_product = 0
for i, row in df_test.iterrows():

    if row['dedup_vendor_ner']:
        if (row['vendor'] == row['dedup_vendor_ner'][0]
            or row['vendor'] == '_'.join(row['dedup_vendor_ner'][0].split())):
            count_vendor += 1
    if row['dedup_product_ner']:
        if (row['product'] == row['dedup_product_ner'][0]
            or row['product'] == '_'.join(row['dedup_product_ner'][0].split())):
            count_product += 1    
        else:
            print(f"Предсказанный продукт: {row['dedup_product_ner']}.\nИстинный продукт: {row['product']}", end='\n\n')
            print(f"Предсказанный вендор: {row['dedup_vendor_ner']}.\nИстинный вендор: {row['vendor']}", end='\n\n****************\n\n')

Предсказанный продукт: ['teamspeak'].
Истинный продукт: teamspeak3

Предсказанный вендор: [].
Истинный вендор: teamspeak

****************

Предсказанный продукт: ['usm'].
Истинный продукт: open_source_security_information_management

Предсказанный вендор: ['alienvault'].
Истинный вендор: alienvault

****************

Предсказанный продукт: ['applications manager'].
Истинный продукт: manageengine_applications_manager

Предсказанный вендор: ['manageengine'].
Истинный вендор: zohocorp

****************

Предсказанный продукт: ['java servlet'].
Истинный продукт: manageengine_o365_manager_plus

Предсказанный вендор: ['zoho'].
Истинный вендор: zohocorp

****************

Предсказанный продукт: ['oauth'].
Истинный продукт: aspnet

Предсказанный вендор: ['auth0'].
Истинный вендор: auth0

****************

Предсказанный продукт: ['tagboard'].
Истинный продукт: tagmin_control_center

Предсказанный вендор: [].
Истинный вендор: paul_schudar

****************

Предсказанный продукт: ['wordpress'].

Число правильно извлеченных названий для вендора и продукта.

In [55]:
print(count_vendor/200, count_product/200)

0.21 0.465


In [20]:
product_ner = df_test['dedup_product_ner'].astype(str).apply(lambda x: x.lstrip('[\'').rstrip(']\'')).values
vendor_ner = df_test['dedup_vendor_ner'].astype(str).apply(lambda x: x.lstrip('[\'').rstrip(']\'')).values

## Шаг 4. Поиск кандидатов для извлеченной сущности "продукт" в БД

С использование Левенштейна, НОП и кастомного алгоритма.

In [21]:
unique_products = get_df_from_bd('select distinct product from cpes limit 10000000;')
unique_products = unique_products['product'].unique()

In [22]:
def get_ratio(ner_name: str, 
              unique_entities: np.ndarray) -> Tuple[str, float]:
    """Поиск в списке наиболее близкого слова к извлеченной сущности
    на основе расстояния Левенштейна. 
    """
    if ner_name in unique_entities:
        return ner_name, 1
    ratio_scores = []
    for ent in unique_entities:
        ratio_scores.append(ratio(ner_name, ent))
    ratio_scores = np.array(ratio_scores)
    
    if len(np.argwhere(ratio_scores == np.max(ratio_scores))) <= 1:
        return unique_entities[np.argmax(ratio_scores)], np.max(ratio_scores)
    else:
        candidates = unique_entities[np.argwhere(ratio_scores == np.max(ratio_scores))]
        len_of_query = len(ner_name)
        d = -1
        fit_cand = ''
        for cand in candidates:
            cand = cand[0]
            diff = abs(len_of_query - len(cand))
            # print(cand, d, diff)
            if d == -1:
                d = diff
                fit_cand = cand
            elif d > diff:
                d = diff
                fit_cand = cand
            else:
                continue
        return fit_cand, np.max(ratio_scores)

In [None]:
def get_lcs(ner_name: str, 
            unique_entities: np.ndarray) -> Tuple[str, float]:
    """Поиск в списке наиболее близкого слова к извлеченной сущности
    на основе метода наибольшей общей подпоследовательности. 
    """
    if ner_name in unique_entities:
        return ner_name, 1
    lcs_scores = np.array(pylcs.lcs_of_list(ner_name, unique_entities))
    if len(np.argwhere(lcs_scores == np.max(lcs_scores))) <= 1:
        return unique_entities[np.argmax(lcs_scores)], np.max(lcs_scores)
    else:
        candidates = unique_entities[np.argwhere(lcs_scores == np.max(lcs_scores))]
        len_of_query = len(ner_name)
        d = -1
        fit_cand = ''
        for cand in candidates:
            cand = cand[0]
            diff = abs(len_of_query - len(cand))
            # print(cand, d, diff)
            if d == -1:
                d = diff
                fit_cand = cand
            elif d > diff:
                d = diff
                fit_cand = cand
            else:
                continue
        return fit_cand, np.max(lcs_scores)

In [24]:
# # Проверка работы
# ratio_scores = []
# for ent in unique_products:
#     ratio_scores.append(ratio('ox app suite', ent))
# ratio_scores = np.array(ratio_scores)
# print(sorted(ratio_scores, reverse=True)[:5])
# n = list(np.argwhere(ratio_scores >= 0.7).reshape(1, -1))
# unique_products[n]

In [25]:
def retrieve_top_k(ner_name: str, 
                   unique_entities: np.ndarray, 
                   top_k: int = 5):
    """Находит топ k слов из предложенного списка
    на основе метода наибольшей общей подпоследовательности. 
    """
    lcs_scores = np.array(pylcs.lcs_of_list(ner_name, unique_entities))
    l = [(name, round(score/len(name), 4)) for name, score in zip(unique_entities, lcs_scores)]
    ll = sorted(l, key=lambda x: x[1], reverse=True)[:top_k]
    return list(zip(*ll))[0]

Сравнение качества данных алгоритмов

1. Левенштейн

In [26]:
matched_db_product_lev = []
matched_db_vendor_lev = []
score_lev = []
for pr in tqdm(product_ner):
    if pr:
        # print(f'Product NER: {pr}')
        (prod, score)= get_ratio(pr, unique_products)
        # print(f'Found product in DB: {prod}')
        # print(f'score: {score}', end='\n\n')
        df_found_prod = get_df_from_bd(f"select * from cpes where product = '{prod}' limit 1;")
        matched_db_product_lev.append(df_found_prod['product'].values[0])
        matched_db_vendor_lev.append(df_found_prod['vendor'].values[0])
        score_lev.append(score)
        
    else:
        matched_db_product_lev.append('')
        matched_db_vendor_lev.append('')
        score_lev.append(0)

100%|██████████| 200/200 [00:07<00:00, 25.86it/s]


2. НОП

In [27]:
matched_db_product_lcs = []
matched_db_vendor_lcs = []
for pr in tqdm(product_ner):
    if pr:
        # print(f'Product NER: {pr}')
        (prod, score)= get_lcs(pr, unique_products)
        # print(f'Found product in DB: {prod}')
        # print(f'score: {score}', end='\n\n')
        df_found_prod = get_df_from_bd(f"select * from cpes where product = '{prod}' limit 1;")
        matched_db_vendor_lcs.append(df_found_prod['vendor'].values[0])
        matched_db_product_lcs.append(df_found_prod['product'].values[0])
    else:
        matched_db_product_lcs.append('')
        matched_db_vendor_lcs.append('')

100%|██████████| 200/200 [00:14<00:00, 13.67it/s]


3. Кастомный

In [28]:
matched_db_product_adv = []
matched_db_vendor_adv = []
score_lev = []
for pr in tqdm(product_ner):
    if pr:
        # print(f'Product NER: {pr}')
        (prod, score)= get_lcs(pr, unique_products)
        # print(f'Found product in DB: {prod}')
        df_all = get_df_from_bd(f"select distinct vendor, product from cpes where vendor in (select vendor from cpes where product = '{prod}')")
        found_candidates = retrieve_top_k(prod, df_all['product'].tolist(), top_k=3)
        matched_db_product_adv.append(found_candidates)
        matched_db_vendor_adv.append(df_all['vendor'].values[0])
        score_lev.append(score)
        
    else:
        matched_db_product_adv.append([''])
        matched_db_vendor_adv.append('')
        score_lev.append(0)

100%|██████████| 200/200 [00:21<00:00,  9.16it/s]


In [29]:
df_test['matched_db_product_lev'] = matched_db_product_lev
df_test['matched_db_vendor_lev'] = matched_db_vendor_lev
df_test['score_lev'] = score_lev

df_test['matched_db_product_lcs'] = matched_db_product_lcs
df_test['matched_db_vendor_lcs'] = matched_db_vendor_lcs

df_test['matched_db_vendor_adv'] = matched_db_vendor_adv
df_test['matched_db_vendor_adv'] = df_test['matched_db_vendor_adv'].astype(str)
df_test['matched_db_product_adv'] = matched_db_product_adv
df_test['matched_db_product_adv'] = df_test['matched_db_product_adv'].astype(str)

Оценка точности

In [30]:
print(df_test[(df_test['matched_db_product_lev'] == df_test['product'])].shape[0] / 200)
print(df_test[(df_test['matched_db_product_lcs'] == df_test['product'])].shape[0] / 200)
print(df_test.apply(lambda x: x['product'] in x['matched_db_product_adv'], axis=1).sum() / 200)

0.545
0.52
0.58


## Итоговая оценка пайплайна

In [58]:
df_test[(df_test.apply(lambda x: x['vendor'] in x['matched_db_vendor_adv'], axis=1))]

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor_ner,dedup_vendor_score,dedup_product_ner,dedup_product_score,true_version_in_predicted,matched_db_product_lev,matched_db_vendor_lev,score_lev,matched_db_product_lcs,matched_db_vendor_lcs,matched_db_vendor_adv,matched_db_product_adv
0,CVE-2021-34085,628902,glensawyer,mp3gain,1.3.4,Read access violation in the III_dequantize_sample function in mpglibDBL/layer3.c in mp3gain through 1.5.2-r2 allows remote attackers to cause a denial of service (application crash) or possibly h...,cpe:2.3:a:glensawyer:mp3gain:1.3.4:beta:*:*:*:*:*:*,0,1,"[[], [mp3gain], [through 1.5.2-r2]]","[[], [0.99991965], [0.99996096]]",[],[mp3gain],[through 1.5.2-r2],[],[0.99991965],[0.99996096],[],[],[mp3gain],[0.99991965],1,mp3gain,glensawyer,1,mp3gain,glensawyer,glensawyer,"('mp3gain', 'mp3gain')"
1,CVE-2014-7221,722762,teamspeak,teamspeak3,3.0.7.1,TeamSpeak Client 3.0.14 and earlier allows remote authenticated users to cause a denial of service (buffer overflow and application crash) by connecting to a channel with a different client instan...,cpe:2.3:a:teamspeak:teamspeak3:3.0.7.1:*:*:*:client:*:*:*,1,0,"[[], [teamspeak], [3.0.14 and earlier]]","[[], [0.9999192], [0.9999383]]",[],[teamspeak],[3.0.14 and earlier],[],[0.9999192],[0.9999383],[],[],[teamspeak],[0.9999192],0,teamspeak,teamspeak,1,teamspeak,teamspeak,teamspeak,"('teamspeak', 'teamspeak3', 'teamspeak_client')"
6,CVE-2019-13183,689916,flarum,flarum,0.1.0,"Flarum before 0.1.0-beta.9 allows CSRF against all POST endpoints, as demonstrated by changing admin settings.",cpe:2.3:a:flarum:flarum:0.1.0:beta8.1:*:*:*:*:*:*,1,1,"[[], [flarum], [before 0.1.0-beta.9]]","[[], [0.9974808], [0.99994767]]",[],[flarum],[before 0.1.0-beta.9],[],[0.9974808],[0.99994767],[],[],[flarum],[0.9974808],1,flarum,flarum,1,flarum,flarum,flarum,"('flarum', 'sticky')"
8,CVE-2013-2175,549900,haproxy,haproxy,1.4.17,"HAProxy 1.4 before 1.4.24 and 1.5 before 1.5-dev19, when configured to use hdr_ip or other ""hdr_*"" functions with a negative occurrence count, allows remote attackers to cause a denial of service ...",cpe:2.3:a:haproxy:haproxy:1.4.17:*:*:*:*:*:*:*,1,1,"[[], [haproxy], [1.4 before 1.4.24, 1.5 before 1.5-dev19,]]","[[], [0.9999064], [0.9999666, 0.9999676]]",[],[haproxy],"[1.4 before 1.4.24, 1.5 before 1.5-dev19,]",[],[0.9999064],"[0.9999666, 0.9999676]",[],[],[haproxy],[0.9999064],0,haproxy,netgate,1,haproxy,netgate,haproxy,"('haproxy', 'haproxy', 'proxyprotocol')"
9,CVE-2016-10714,422757,zsh,zsh,4.2.2,"In zsh before 5.3, an off-by-one error resulted in undersized buffers that were intended to support PATH_MAX characters.",cpe:2.3:a:zsh:zsh:4.2.2:*:*:*:*:*:*:*,1,1,"[[], [zsh], [before 5.3,]]","[[], [0.9995414], [0.99995625]]",[],[zsh],"[before 5.3,]",[],[0.9995414],[0.99995625],[],[],[zsh],[0.9995414],1,zsh,zsh,1,zsh,zsh,zsh,"('zsh', 'zsh')"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,CVE-2023-35163,354128,gobalsky,vega,0.33.1,"Vega is a decentralized trading platform that allows pseudo-anonymous trading of derivatives on a blockchain. Prior to version 0.71.6, a vulnerability exists that allows a malicious validator to t...",cpe:2.3:a:gobalsky:vega:0.33.1:*:*:*:*:*:*:*,0,1,"[[], [vega, vega, vega,, vega], [prior to version 0.71.6,]]","[[], [0.99370885, 0.8913145, 0.85312486, 0.9591649], [0.99977887]]",[],"[vega, vega, vega,, vega]","[prior to version 0.71.6,]",[],"[0.99370885, 0.8913145, 0.85312486, 0.9591649]",[0.99977887],[],[],[vega],[0.99370885],0,vega,vega_project,1,vega,vega_project,gobalsky,"('vega', 'vega')"
193,CVE-2016-5876,437832,owncloud,owncloud,9.0.0,"ownCloud server before 8.2.6 and 9.x before 9.0.3, when the gallery app is enabled, allows remote attackers to download arbitrary images via a direct request.",cpe:2.3:a:owncloud:owncloud:9.0.0:*:*:*:*:*:*:*,1,1,"[[], [owncloud], [before 8.2.6, 9.x before 9.0.3,]]","[[], [0.99991], [0.99996156, 0.9999661]]",[],[owncloud],"[before 8.2.6, 9.x before 9.0.3,]",[],[0.99991],"[0.99996156, 0.9999661]",[],[],[owncloud],[0.99991],0,owncloud,owncloud,1,owncloud,owncloud,owncloud,"('owncloud', 'owncloud_desktop_client', 'oauth2')"
194,CVE-2021-24472,607645,qantumthemes,onair2,3.9.3,"The OnAir2 WordPress theme before 3.9.9.2 and QT KenthaRadio WordPress plugin before 2.0.2 have exposed proxy functionality to unauthenticated users, sending requests to this proxy functionality w...",cpe:2.3:a:qantumthemes:onair2:3.9.3:*:*:*:*:wordpress:*:*,0,1,"[[onair2, qt], [wordpress, kentharadio, wordpress], [before 3.9.9.2, before 2.0.2]]","[[0.73382086, 0.8601528], [0.9012168, 0.99422354, 0.9187399], [0.9999491, 0.9999509]]","[onair2, qt]","[wordpress, kentharadio, wordpress]","[before 3.9.9.2, before 2.0.2]","[0.73382086, 0.8601528]","[0.9012168, 0.99422354, 0.9187399]","[0.9999491, 0.9999509]",[qt],[0.8601528],[kentharadio],[0.99422354],1,kentharadio,qantumthemes,1,kentharadio,qantumthemes,qantumthemes,"('kentharadio', 'onair2')"
196,CVE-2020-11673,451620,total-soft,responsive_poll,1.2.2,"An issue was discovered in the Responsive Poll through 1.3.4 for Wordpress. It allows an unauthenticated user to manipulate polls, e.g., delete, clone, or view a hidden poll. This is due to the us...",cpe:2.3:a:total-soft:responsive_poll:1.2.2:*:*:*:*:wordpress:*:*,1,0,"[[], [responsive poll], [through 1.3.4]]","[[], [0.9997986], [0.9999593]]",[],[responsive poll],[through 1.3.4],[],[0.9997986],[0.9999593],[],[],[responsive poll],[0.9997986],1,responsive_poll,total-soft,14,responsive_poll,total-soft,total-soft,"('responsive_poll', 'video_gallery', 'event_calendar')"


In [66]:
61/200

0.305

In [59]:
df_test[(df_test.apply(lambda x: x['product'] in x['matched_db_product_adv'], axis=1))]

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor_ner,dedup_vendor_score,dedup_product_ner,dedup_product_score,true_version_in_predicted,matched_db_product_lev,matched_db_vendor_lev,score_lev,matched_db_product_lcs,matched_db_vendor_lcs,matched_db_vendor_adv,matched_db_product_adv
0,CVE-2021-34085,628902,glensawyer,mp3gain,1.3.4,Read access violation in the III_dequantize_sample function in mpglibDBL/layer3.c in mp3gain through 1.5.2-r2 allows remote attackers to cause a denial of service (application crash) or possibly h...,cpe:2.3:a:glensawyer:mp3gain:1.3.4:beta:*:*:*:*:*:*,0,1,"[[], [mp3gain], [through 1.5.2-r2]]","[[], [0.99991965], [0.99996096]]",[],[mp3gain],[through 1.5.2-r2],[],[0.99991965],[0.99996096],[],[],[mp3gain],[0.99991965],1,mp3gain,glensawyer,1,mp3gain,glensawyer,glensawyer,"('mp3gain', 'mp3gain')"
1,CVE-2014-7221,722762,teamspeak,teamspeak3,3.0.7.1,TeamSpeak Client 3.0.14 and earlier allows remote authenticated users to cause a denial of service (buffer overflow and application crash) by connecting to a channel with a different client instan...,cpe:2.3:a:teamspeak:teamspeak3:3.0.7.1:*:*:*:client:*:*:*,1,0,"[[], [teamspeak], [3.0.14 and earlier]]","[[], [0.9999192], [0.9999383]]",[],[teamspeak],[3.0.14 and earlier],[],[0.9999192],[0.9999383],[],[],[teamspeak],[0.9999192],0,teamspeak,teamspeak,1,teamspeak,teamspeak,teamspeak,"('teamspeak', 'teamspeak3', 'teamspeak_client')"
6,CVE-2019-13183,689916,flarum,flarum,0.1.0,"Flarum before 0.1.0-beta.9 allows CSRF against all POST endpoints, as demonstrated by changing admin settings.",cpe:2.3:a:flarum:flarum:0.1.0:beta8.1:*:*:*:*:*:*,1,1,"[[], [flarum], [before 0.1.0-beta.9]]","[[], [0.9974808], [0.99994767]]",[],[flarum],[before 0.1.0-beta.9],[],[0.9974808],[0.99994767],[],[],[flarum],[0.9974808],1,flarum,flarum,1,flarum,flarum,flarum,"('flarum', 'sticky')"
8,CVE-2013-2175,549900,haproxy,haproxy,1.4.17,"HAProxy 1.4 before 1.4.24 and 1.5 before 1.5-dev19, when configured to use hdr_ip or other ""hdr_*"" functions with a negative occurrence count, allows remote attackers to cause a denial of service ...",cpe:2.3:a:haproxy:haproxy:1.4.17:*:*:*:*:*:*:*,1,1,"[[], [haproxy], [1.4 before 1.4.24, 1.5 before 1.5-dev19,]]","[[], [0.9999064], [0.9999666, 0.9999676]]",[],[haproxy],"[1.4 before 1.4.24, 1.5 before 1.5-dev19,]",[],[0.9999064],"[0.9999666, 0.9999676]",[],[],[haproxy],[0.9999064],0,haproxy,netgate,1,haproxy,netgate,haproxy,"('haproxy', 'haproxy', 'proxyprotocol')"
9,CVE-2016-10714,422757,zsh,zsh,4.2.2,"In zsh before 5.3, an off-by-one error resulted in undersized buffers that were intended to support PATH_MAX characters.",cpe:2.3:a:zsh:zsh:4.2.2:*:*:*:*:*:*:*,1,1,"[[], [zsh], [before 5.3,]]","[[], [0.9995414], [0.99995625]]",[],[zsh],"[before 5.3,]",[],[0.9995414],[0.99995625],[],[],[zsh],[0.9995414],1,zsh,zsh,1,zsh,zsh,zsh,"('zsh', 'zsh')"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,CVE-2023-35163,354128,gobalsky,vega,0.33.1,"Vega is a decentralized trading platform that allows pseudo-anonymous trading of derivatives on a blockchain. Prior to version 0.71.6, a vulnerability exists that allows a malicious validator to t...",cpe:2.3:a:gobalsky:vega:0.33.1:*:*:*:*:*:*:*,0,1,"[[], [vega, vega, vega,, vega], [prior to version 0.71.6,]]","[[], [0.99370885, 0.8913145, 0.85312486, 0.9591649], [0.99977887]]",[],"[vega, vega, vega,, vega]","[prior to version 0.71.6,]",[],"[0.99370885, 0.8913145, 0.85312486, 0.9591649]",[0.99977887],[],[],[vega],[0.99370885],0,vega,vega_project,1,vega,vega_project,gobalsky,"('vega', 'vega')"
193,CVE-2016-5876,437832,owncloud,owncloud,9.0.0,"ownCloud server before 8.2.6 and 9.x before 9.0.3, when the gallery app is enabled, allows remote attackers to download arbitrary images via a direct request.",cpe:2.3:a:owncloud:owncloud:9.0.0:*:*:*:*:*:*:*,1,1,"[[], [owncloud], [before 8.2.6, 9.x before 9.0.3,]]","[[], [0.99991], [0.99996156, 0.9999661]]",[],[owncloud],"[before 8.2.6, 9.x before 9.0.3,]",[],[0.99991],"[0.99996156, 0.9999661]",[],[],[owncloud],[0.99991],0,owncloud,owncloud,1,owncloud,owncloud,owncloud,"('owncloud', 'owncloud_desktop_client', 'oauth2')"
194,CVE-2021-24472,607645,qantumthemes,onair2,3.9.3,"The OnAir2 WordPress theme before 3.9.9.2 and QT KenthaRadio WordPress plugin before 2.0.2 have exposed proxy functionality to unauthenticated users, sending requests to this proxy functionality w...",cpe:2.3:a:qantumthemes:onair2:3.9.3:*:*:*:*:wordpress:*:*,0,1,"[[onair2, qt], [wordpress, kentharadio, wordpress], [before 3.9.9.2, before 2.0.2]]","[[0.73382086, 0.8601528], [0.9012168, 0.99422354, 0.9187399], [0.9999491, 0.9999509]]","[onair2, qt]","[wordpress, kentharadio, wordpress]","[before 3.9.9.2, before 2.0.2]","[0.73382086, 0.8601528]","[0.9012168, 0.99422354, 0.9187399]","[0.9999491, 0.9999509]",[qt],[0.8601528],[kentharadio],[0.99422354],1,kentharadio,qantumthemes,1,kentharadio,qantumthemes,qantumthemes,"('kentharadio', 'onair2')"
196,CVE-2020-11673,451620,total-soft,responsive_poll,1.2.2,"An issue was discovered in the Responsive Poll through 1.3.4 for Wordpress. It allows an unauthenticated user to manipulate polls, e.g., delete, clone, or view a hidden poll. This is due to the us...",cpe:2.3:a:total-soft:responsive_poll:1.2.2:*:*:*:*:wordpress:*:*,1,0,"[[], [responsive poll], [through 1.3.4]]","[[], [0.9997986], [0.9999593]]",[],[responsive poll],[through 1.3.4],[],[0.9997986],[0.9999593],[],[],[responsive poll],[0.9997986],1,responsive_poll,total-soft,14,responsive_poll,total-soft,total-soft,"('responsive_poll', 'video_gallery', 'event_calendar')"


In [60]:
df_test[df_test.true_version_in_predicted == 1]

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor_ner,dedup_vendor_score,dedup_product_ner,dedup_product_score,true_version_in_predicted,matched_db_product_lev,matched_db_vendor_lev,score_lev,matched_db_product_lcs,matched_db_vendor_lcs,matched_db_vendor_adv,matched_db_product_adv
0,CVE-2021-34085,628902,glensawyer,mp3gain,1.3.4,Read access violation in the III_dequantize_sample function in mpglibDBL/layer3.c in mp3gain through 1.5.2-r2 allows remote attackers to cause a denial of service (application crash) or possibly h...,cpe:2.3:a:glensawyer:mp3gain:1.3.4:beta:*:*:*:*:*:*,0,1,"[[], [mp3gain], [through 1.5.2-r2]]","[[], [0.99991965], [0.99996096]]",[],[mp3gain],[through 1.5.2-r2],[],[0.99991965],[0.99996096],[],[],[mp3gain],[0.99991965],1,mp3gain,glensawyer,1,mp3gain,glensawyer,glensawyer,"('mp3gain', 'mp3gain')"
6,CVE-2019-13183,689916,flarum,flarum,0.1.0,"Flarum before 0.1.0-beta.9 allows CSRF against all POST endpoints, as demonstrated by changing admin settings.",cpe:2.3:a:flarum:flarum:0.1.0:beta8.1:*:*:*:*:*:*,1,1,"[[], [flarum], [before 0.1.0-beta.9]]","[[], [0.9974808], [0.99994767]]",[],[flarum],[before 0.1.0-beta.9],[],[0.9974808],[0.99994767],[],[],[flarum],[0.9974808],1,flarum,flarum,1,flarum,flarum,flarum,"('flarum', 'sticky')"
9,CVE-2016-10714,422757,zsh,zsh,4.2.2,"In zsh before 5.3, an off-by-one error resulted in undersized buffers that were intended to support PATH_MAX characters.",cpe:2.3:a:zsh:zsh:4.2.2:*:*:*:*:*:*:*,1,1,"[[], [zsh], [before 5.3,]]","[[], [0.9995414], [0.99995625]]",[],[zsh],"[before 5.3,]",[],[0.9995414],[0.99995625],[],[],[zsh],[0.9995414],1,zsh,zsh,1,zsh,zsh,zsh,"('zsh', 'zsh')"
12,CVE-2022-3768,178195,wpsmartcontracts,wpsmartcontracts,1.2.2,"The WPSmartContracts WordPress plugin before 1.3.12 does not properly sanitise and escape a parameter before using it in a SQL statement, leading to a SQL injection exploitable by users with a rol...",cpe:2.3:a:wpsmartcontracts:wpsmartcontracts:1.2.2:*:*:*:*:wordpress:*:*,1,1,"[[], [wpsmartcontracts, wordpress], [before 1.3.12]]","[[], [0.97421944, 0.99314076], [0.9999514]]",[],"[wpsmartcontracts, wordpress]",[before 1.3.12],[],"[0.97421944, 0.99314076]",[0.9999514],[],[],[wordpress],[0.99314076],1,wordpress,wordpress,1,wordpress,wordpress,wordpress,"('wordpress', 'wordpress_mu', 'wordspew')"
14,CVE-2015-8076,582232,cyrus,imap,2.5.2,"The index_urlfetch function in index.c in Cyrus IMAP 2.3.x before 2.3.19, 2.4.x before 2.4.18, 2.5.x before 2.5.4 allows remote attackers to obtain sensitive information or possibly have unspecifi...",cpe:2.3:a:cyrus:imap:2.5.2:*:*:*:*:*:*:*,1,1,"[[], [cyrus imap], [2.3.x before 2.3.19,, 2.4.x before 2.4.18,, 2.5.x before 2.5.4]]","[[], [0.9998886], [0.9999668, 0.99996823, 0.9999669]]",[],[cyrus imap],"[2.3.x before 2.3.19,, 2.4.x before 2.4.18,, 2.5.x before 2.5.4]",[],[0.9998886],"[0.9999668, 0.99996823, 0.9999669]",[],[],[cyrus imap],[0.9998886],1,cyrus_imap,cyrusimap,9,cyrus_imap,cyrusimap,cyrusimap,"('cyrus_imap', 'cyrus_sasl', 'cyrus-sasl')"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,CVE-2021-33176,627381,octavolabs,vernemq,1.0.0,VerneMQ MQTT Broker versions prior to 1.12.0 are vulnerable to a denial of service attack as a result of excessive memory consumption due to the handling of untrusted inputs. These inputs cause th...,cpe:2.3:a:octavolabs:vernemq:1.0.0:-:*:*:*:*:*:*,0,1,"[[vernemq], [mqtt broker], [prior to 1.12.0]]","[[0.99925786], [0.99971294], [0.99991876]]",[vernemq],[mqtt broker],[prior to 1.12.0],[0.99925786],[0.99971294],[0.99991876],[vernemq],[0.99925786],[mqtt broker],[0.99971294],1,amq_broker,redhat,9,small_footprint_cim_broker,chris_buccella,chris_buccella,"('small_footprint_cim_broker', 'small_footprint_cim_broker')"
194,CVE-2021-24472,607645,qantumthemes,onair2,3.9.3,"The OnAir2 WordPress theme before 3.9.9.2 and QT KenthaRadio WordPress plugin before 2.0.2 have exposed proxy functionality to unauthenticated users, sending requests to this proxy functionality w...",cpe:2.3:a:qantumthemes:onair2:3.9.3:*:*:*:*:wordpress:*:*,0,1,"[[onair2, qt], [wordpress, kentharadio, wordpress], [before 3.9.9.2, before 2.0.2]]","[[0.73382086, 0.8601528], [0.9012168, 0.99422354, 0.9187399], [0.9999491, 0.9999509]]","[onair2, qt]","[wordpress, kentharadio, wordpress]","[before 3.9.9.2, before 2.0.2]","[0.73382086, 0.8601528]","[0.9012168, 0.99422354, 0.9187399]","[0.9999491, 0.9999509]",[qt],[0.8601528],[kentharadio],[0.99422354],1,kentharadio,qantumthemes,1,kentharadio,qantumthemes,qantumthemes,"('kentharadio', 'onair2')"
196,CVE-2020-11673,451620,total-soft,responsive_poll,1.2.2,"An issue was discovered in the Responsive Poll through 1.3.4 for Wordpress. It allows an unauthenticated user to manipulate polls, e.g., delete, clone, or view a hidden poll. This is due to the us...",cpe:2.3:a:total-soft:responsive_poll:1.2.2:*:*:*:*:wordpress:*:*,1,0,"[[], [responsive poll], [through 1.3.4]]","[[], [0.9997986], [0.9999593]]",[],[responsive poll],[through 1.3.4],[],[0.9997986],[0.9999593],[],[],[responsive poll],[0.9997986],1,responsive_poll,total-soft,14,responsive_poll,total-soft,total-soft,"('responsive_poll', 'video_gallery', 'event_calendar')"
197,CVE-2022-1253,80875,struktur,libde265,1.0.3,Heap-based Buffer Overflow in GitHub repository strukturag/libde265 prior to and including 1.0.8. The fix is established in commit 8e89fe0e175d2870c39486fdd09250b230ec10b8 but does not yet belong ...,cpe:2.3:a:struktur:libde265:1.0.3:*:*:*:*:*:*:*,1,1,"[[], [], [prior to and including 1.0.8.]]","[[], [], [0.99994123]]",[],[],[prior to and including 1.0.8.],[],[],[0.99994123],[],[],[],[],1,,,0,,,,['']


In [57]:
df_test[(df_test.apply(lambda x: x['vendor'] in x['matched_db_vendor_adv'], axis=1)) &
        (df_test.apply(lambda x: x['product'] in x['matched_db_product_adv'], axis=1))]

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor_ner,dedup_vendor_score,dedup_product_ner,dedup_product_score,true_version_in_predicted,matched_db_product_lev,matched_db_vendor_lev,score_lev,matched_db_product_lcs,matched_db_vendor_lcs,matched_db_vendor_adv,matched_db_product_adv
0,CVE-2021-34085,628902,glensawyer,mp3gain,1.3.4,Read access violation in the III_dequantize_sample function in mpglibDBL/layer3.c in mp3gain through 1.5.2-r2 allows remote attackers to cause a denial of service (application crash) or possibly h...,cpe:2.3:a:glensawyer:mp3gain:1.3.4:beta:*:*:*:*:*:*,0,1,"[[], [mp3gain], [through 1.5.2-r2]]","[[], [0.99991965], [0.99996096]]",[],[mp3gain],[through 1.5.2-r2],[],[0.99991965],[0.99996096],[],[],[mp3gain],[0.99991965],1,mp3gain,glensawyer,1,mp3gain,glensawyer,glensawyer,"('mp3gain', 'mp3gain')"
1,CVE-2014-7221,722762,teamspeak,teamspeak3,3.0.7.1,TeamSpeak Client 3.0.14 and earlier allows remote authenticated users to cause a denial of service (buffer overflow and application crash) by connecting to a channel with a different client instan...,cpe:2.3:a:teamspeak:teamspeak3:3.0.7.1:*:*:*:client:*:*:*,1,0,"[[], [teamspeak], [3.0.14 and earlier]]","[[], [0.9999192], [0.9999383]]",[],[teamspeak],[3.0.14 and earlier],[],[0.9999192],[0.9999383],[],[],[teamspeak],[0.9999192],0,teamspeak,teamspeak,1,teamspeak,teamspeak,teamspeak,"('teamspeak', 'teamspeak3', 'teamspeak_client')"
6,CVE-2019-13183,689916,flarum,flarum,0.1.0,"Flarum before 0.1.0-beta.9 allows CSRF against all POST endpoints, as demonstrated by changing admin settings.",cpe:2.3:a:flarum:flarum:0.1.0:beta8.1:*:*:*:*:*:*,1,1,"[[], [flarum], [before 0.1.0-beta.9]]","[[], [0.9974808], [0.99994767]]",[],[flarum],[before 0.1.0-beta.9],[],[0.9974808],[0.99994767],[],[],[flarum],[0.9974808],1,flarum,flarum,1,flarum,flarum,flarum,"('flarum', 'sticky')"
8,CVE-2013-2175,549900,haproxy,haproxy,1.4.17,"HAProxy 1.4 before 1.4.24 and 1.5 before 1.5-dev19, when configured to use hdr_ip or other ""hdr_*"" functions with a negative occurrence count, allows remote attackers to cause a denial of service ...",cpe:2.3:a:haproxy:haproxy:1.4.17:*:*:*:*:*:*:*,1,1,"[[], [haproxy], [1.4 before 1.4.24, 1.5 before 1.5-dev19,]]","[[], [0.9999064], [0.9999666, 0.9999676]]",[],[haproxy],"[1.4 before 1.4.24, 1.5 before 1.5-dev19,]",[],[0.9999064],"[0.9999666, 0.9999676]",[],[],[haproxy],[0.9999064],0,haproxy,netgate,1,haproxy,netgate,haproxy,"('haproxy', 'haproxy', 'proxyprotocol')"
9,CVE-2016-10714,422757,zsh,zsh,4.2.2,"In zsh before 5.3, an off-by-one error resulted in undersized buffers that were intended to support PATH_MAX characters.",cpe:2.3:a:zsh:zsh:4.2.2:*:*:*:*:*:*:*,1,1,"[[], [zsh], [before 5.3,]]","[[], [0.9995414], [0.99995625]]",[],[zsh],"[before 5.3,]",[],[0.9995414],[0.99995625],[],[],[zsh],[0.9995414],1,zsh,zsh,1,zsh,zsh,zsh,"('zsh', 'zsh')"
10,CVE-2023-4393,375964,liquidfiles,liquidfiles,1.6.23,"HTML and SMTP injections on the registration page of LiquidFiles versions 3.7.13 and below, allow an attacker to perform more advanced phishing attacks against an organization.",cpe:2.3:a:liquidfiles:liquidfiles:1.6.23:*:*:*:*:*:*:*,1,1,"[[], [liquidfiles], [3.7.13 and below,]]","[[], [0.99991], [0.83767396]]",[],[liquidfiles],"[3.7.13 and below,]",[],[0.99991],[0.83767396],[],[],[liquidfiles],[0.99991],0,liquidfiles,liquidfiles,1,liquidfiles,liquidfiles,liquidfiles,"('liquidfiles',)"
13,CVE-2014-4700,719860,citrix,xendesktop,4.0,"Citrix XenDesktop 7.x, 5.x, and 4.x, when pooled random desktop groups is enabled and ShutdownDesktopsAfterUse is disabled, allows local guest users to gain access to another user's desktop via un...",cpe:2.3:a:citrix:xendesktop:4.0:*:*:*:*:*:*:*,1,1,"[[citrix], [xendesktop], [7.x,, 5.x,, 4.x,]]","[[0.999882], [0.9998907], [0.9999678, 0.9999691, 0.9999697]]",[citrix],[xendesktop],"[7.x,, 5.x,, 4.x,]",[0.999882],[0.9998907],"[0.9999678, 0.9999691, 0.9999697]",[citrix],[0.999882],[xendesktop],[0.9998907],0,xendesktop,citrix,1,xendesktop,citrix,citrix,"('xen', 'xendesktop', 'xp')"
14,CVE-2015-8076,582232,cyrus,imap,2.5.2,"The index_urlfetch function in index.c in Cyrus IMAP 2.3.x before 2.3.19, 2.4.x before 2.4.18, 2.5.x before 2.5.4 allows remote attackers to obtain sensitive information or possibly have unspecifi...",cpe:2.3:a:cyrus:imap:2.5.2:*:*:*:*:*:*:*,1,1,"[[], [cyrus imap], [2.3.x before 2.3.19,, 2.4.x before 2.4.18,, 2.5.x before 2.5.4]]","[[], [0.9998886], [0.9999668, 0.99996823, 0.9999669]]",[],[cyrus imap],"[2.3.x before 2.3.19,, 2.4.x before 2.4.18,, 2.5.x before 2.5.4]",[],[0.9998886],"[0.9999668, 0.99996823, 0.9999669]",[],[],[cyrus imap],[0.9998886],1,cyrus_imap,cyrusimap,9,cyrus_imap,cyrusimap,cyrusimap,"('cyrus_imap', 'cyrus_sasl', 'cyrus-sasl')"
15,CVE-2019-14862,692470,knockoutjs,knockout,1.2.1,"There is a vulnerability in knockout before version 3.5.0-beta, where after escaping the context of the web application, the web application delivers data to its users along with other trusted dyn...",cpe:2.3:a:knockoutjs:knockout:1.2.1:*:*:*:*:*:*:*,0,1,"[[], [knockout], [before version 3.5.0-beta,]]","[[], [0.9442648], [0.9837834]]",[],[knockout],"[before version 3.5.0-beta,]",[],[0.9442648],[0.9837834],[],[],[knockout],[0.9442648],1,knockout,knockoutjs,1,knockout,knockoutjs,knockoutjs,"('knockout',)"
19,CVE-2023-23723,310214,winwar,wp_email_capture,3.7.1,Auth. (admin+) Stored Cross-Site Scripting (XSS) vulnerability in Winwar Media WP Email Capture plugin <= 3.9.3 versions.,cpe:2.3:a:winwar:wp_email_capture:3.7.1:*:*:*:*:wordpress:*:*,1,0,"[[], [wp email capture], [<= 3.9.3]]","[[], [0.99982667], [0.99995184]]",[],[wp email capture],[<= 3.9.3],[],[0.99982667],[0.99995184],[],[],[wp email capture],[0.99982667],1,wp_email_capture,winwar,14,wp_email_capture,winwar,winwar,"('wp_email_capture', 'wp_flipclock', 'wp_ebay_product_feeds')"


In [62]:
df_test[(df_test.apply(lambda x: x['vendor'] in x['matched_db_vendor_adv'], axis=1)) &
        (df_test.apply(lambda x: x['product'] in x['matched_db_product_adv'], axis=1)) &
        (df_test.true_version_in_predicted == 1)]

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor_ner,dedup_vendor_score,dedup_product_ner,dedup_product_score,true_version_in_predicted,matched_db_product_lev,matched_db_vendor_lev,score_lev,matched_db_product_lcs,matched_db_vendor_lcs,matched_db_vendor_adv,matched_db_product_adv
0,CVE-2021-34085,628902,glensawyer,mp3gain,1.3.4,Read access violation in the III_dequantize_sample function in mpglibDBL/layer3.c in mp3gain through 1.5.2-r2 allows remote attackers to cause a denial of service (application crash) or possibly h...,cpe:2.3:a:glensawyer:mp3gain:1.3.4:beta:*:*:*:*:*:*,0,1,"[[], [mp3gain], [through 1.5.2-r2]]","[[], [0.99991965], [0.99996096]]",[],[mp3gain],[through 1.5.2-r2],[],[0.99991965],[0.99996096],[],[],[mp3gain],[0.99991965],1,mp3gain,glensawyer,1,mp3gain,glensawyer,glensawyer,"('mp3gain', 'mp3gain')"
6,CVE-2019-13183,689916,flarum,flarum,0.1.0,"Flarum before 0.1.0-beta.9 allows CSRF against all POST endpoints, as demonstrated by changing admin settings.",cpe:2.3:a:flarum:flarum:0.1.0:beta8.1:*:*:*:*:*:*,1,1,"[[], [flarum], [before 0.1.0-beta.9]]","[[], [0.9974808], [0.99994767]]",[],[flarum],[before 0.1.0-beta.9],[],[0.9974808],[0.99994767],[],[],[flarum],[0.9974808],1,flarum,flarum,1,flarum,flarum,flarum,"('flarum', 'sticky')"
9,CVE-2016-10714,422757,zsh,zsh,4.2.2,"In zsh before 5.3, an off-by-one error resulted in undersized buffers that were intended to support PATH_MAX characters.",cpe:2.3:a:zsh:zsh:4.2.2:*:*:*:*:*:*:*,1,1,"[[], [zsh], [before 5.3,]]","[[], [0.9995414], [0.99995625]]",[],[zsh],"[before 5.3,]",[],[0.9995414],[0.99995625],[],[],[zsh],[0.9995414],1,zsh,zsh,1,zsh,zsh,zsh,"('zsh', 'zsh')"
14,CVE-2015-8076,582232,cyrus,imap,2.5.2,"The index_urlfetch function in index.c in Cyrus IMAP 2.3.x before 2.3.19, 2.4.x before 2.4.18, 2.5.x before 2.5.4 allows remote attackers to obtain sensitive information or possibly have unspecifi...",cpe:2.3:a:cyrus:imap:2.5.2:*:*:*:*:*:*:*,1,1,"[[], [cyrus imap], [2.3.x before 2.3.19,, 2.4.x before 2.4.18,, 2.5.x before 2.5.4]]","[[], [0.9998886], [0.9999668, 0.99996823, 0.9999669]]",[],[cyrus imap],"[2.3.x before 2.3.19,, 2.4.x before 2.4.18,, 2.5.x before 2.5.4]",[],[0.9998886],"[0.9999668, 0.99996823, 0.9999669]",[],[],[cyrus imap],[0.9998886],1,cyrus_imap,cyrusimap,9,cyrus_imap,cyrusimap,cyrusimap,"('cyrus_imap', 'cyrus_sasl', 'cyrus-sasl')"
15,CVE-2019-14862,692470,knockoutjs,knockout,1.2.1,"There is a vulnerability in knockout before version 3.5.0-beta, where after escaping the context of the web application, the web application delivers data to its users along with other trusted dyn...",cpe:2.3:a:knockoutjs:knockout:1.2.1:*:*:*:*:*:*:*,0,1,"[[], [knockout], [before version 3.5.0-beta,]]","[[], [0.9442648], [0.9837834]]",[],[knockout],"[before version 3.5.0-beta,]",[],[0.9442648],[0.9837834],[],[],[knockout],[0.9442648],1,knockout,knockoutjs,1,knockout,knockoutjs,knockoutjs,"('knockout',)"
19,CVE-2023-23723,310214,winwar,wp_email_capture,3.7.1,Auth. (admin+) Stored Cross-Site Scripting (XSS) vulnerability in Winwar Media WP Email Capture plugin <= 3.9.3 versions.,cpe:2.3:a:winwar:wp_email_capture:3.7.1:*:*:*:*:wordpress:*:*,1,0,"[[], [wp email capture], [<= 3.9.3]]","[[], [0.99982667], [0.99995184]]",[],[wp email capture],[<= 3.9.3],[],[0.99982667],[0.99995184],[],[],[wp email capture],[0.99982667],1,wp_email_capture,winwar,14,wp_email_capture,winwar,winwar,"('wp_email_capture', 'wp_flipclock', 'wp_ebay_product_feeds')"
20,CVE-2023-52323,406933,pycryptodome,pycryptodome,3.15.0,"PyCryptodome and pycryptodomex before 3.19.1 allow side-channel leakage for OAEP decryption, exploitable for a Manger attack.",cpe:2.3:a:pycryptodome:pycryptodome:3.15.0:*:*:*:*:python:*:*,1,1,"[[], [pycryptodome, pycryptodomex], [before 3.19.1]]","[[], [0.9999124, 0.99988425], [0.9999557]]",[],"[pycryptodome, pycryptodomex]",[before 3.19.1],[],"[0.9999124, 0.99988425]",[0.9999557],[],[],[pycryptodome],[0.9999124],1,pycryptodome,pycryptodome,1,pycryptodome,pycryptodome,pycryptodome,"('pycryptodome', 'pycryptodomex')"
23,CVE-2023-6998,414164,coolkit,ewelink,4.13.1,Improper privilege management vulnerability in CoolKit Technology eWeLink on Android and iOS allows application lockscreen bypass.This issue affects eWeLink before 5.2.0.\n\n,cpe:2.3:a:coolkit:ewelink:4.13.1:*:*:*:*:android:*:*,1,1,"[[coolkit], [ewelink, ewelink], [before 5.2.0.]]","[[0.9544912], [0.9998672, 0.9998958], [0.9999499]]",[coolkit],"[ewelink, ewelink]",[before 5.2.0.],[0.9544912],"[0.9998672, 0.9998958]",[0.9999499],[coolkit],[0.9544912],[ewelink],[0.9998958],1,ewelink,coolkit,1,ewelink,coolkit,coolkit,"('ewelink',)"
24,CVE-2015-9438,587317,display-widgets_project,display-widgets,1.23,"The display-widgets plugin before 2.04 for WordPress has XSS via the wp-admin/admin-ajax.php?action=dw_show_widget id_base, widget_number, or instance parameter.",cpe:2.3:a:display-widgets_project:display-widgets:1.23:*:*:*:*:wordpress:*:*,0,1,"[[], [display-widgets], [before 2.04]]","[[], [0.9999174], [0.9999537]]",[],[display-widgets],[before 2.04],[],[0.9999174],[0.9999537],[],[],[display-widgets],[0.9999174],1,display-widgets,display-widgets_project,1,display-widgets,display-widgets_project,display-widgets_project,"('display-widgets',)"
28,CVE-2020-7981,500979,rubygeocoder,geocoder,1.2.0,"sql.rb in Geocoder before 1.6.1 allows Boolean-based SQL injection when within_bounding_box is used in conjunction with untrusted sw_lat, sw_lng, ne_lat, or ne_lng data.",cpe:2.3:a:rubygeocoder:geocoder:1.2.0:*:*:*:*:*:*:*,0,1,"[[], [geocoder], [before 1.6.1]]","[[], [0.9998869], [0.9999563]]",[],[geocoder],[before 1.6.1],[],[0.9998869],[0.9999563],[],[],[geocoder],[0.9998869],1,geocoder,rubygeocoder,1,geocoder,rubygeocoder,rubygeocoder,"('geocoder',)"


* Для CVE-2007-6487 продукта webgui есть вендор plainblack и plain_black в БД

* Для CVE-2004-0095 есть продукт epolicy_orchestrator_agent и epolicy_orchestrator в БД

* Для CVE-2007-3381 продукт gdm -- абревиатура, не найти такой продукт в БД

* CVE-2013-6440 есть продукт opensaml и opensaml_java в БД

* CVE-2014-7221 есть продукт teamspeak и teamspeak3 в БД

* CVE-2020-15003 дубли продукта в БД open-xchange_appsuite и ox_app_suite, оба версии 7.10.5

* CVE-2020-2090 дубли продукта в БД ec и amazon_ec2, оба версии 1.8

Много ошибок связано с wordpress. Не брать wordpress как продукт?

## Анализ ошибок

### Анализ распознования версий

In [50]:
df_test[(~flag) & (df_test['descr'].str.contains('[W|w]ord[P|p]ress'))]

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor_ner,dedup_vendor_score,dedup_product_ner,dedup_product_score,true_version_in_predicted,matched_db_product_lev,matched_db_vendor_lev,score_lev,matched_db_product_lcs,matched_db_vendor_lcs,matched_db_vendor_adv,matched_db_product_adv
12,CVE-2022-3768,178195,wpsmartcontracts,wpsmartcontracts,1.2.2,"The WPSmartContracts WordPress plugin before 1.3.12 does not properly sanitise and escape a parameter before using it in a SQL statement, leading to a SQL injection exploitable by users with a rol...",cpe:2.3:a:wpsmartcontracts:wpsmartcontracts:1.2.2:*:*:*:*:wordpress:*:*,1,1,"[[], [wpsmartcontracts, wordpress], [before 1.3.12]]","[[], [0.97421944, 0.99314076], [0.9999514]]",[],"[wpsmartcontracts, wordpress]",[before 1.3.12],[],"[0.97421944, 0.99314076]",[0.9999514],[],[],[wordpress],[0.99314076],1,wordpress,wordpress,1,wordpress,wordpress,wordpress,"('wordpress', 'wordpress_mu', 'wordspew')"
22,CVE-2022-4578,205752,video_conferencing_with_zoom_project,video_conferencing_with_zoom,3.6.23,"The Video Conferencing with Zoom WordPress plugin before 4.0.10 does not validate and escape some of its shortcode attributes before outputting them back in the page, which could allow users with ...",cpe:2.3:a:video_conferencing_with_zoom_project:video_conferencing_with_zoom:3.6.23:*:*:*:*:wordpress:*:*,0,0,"[[], [zoom wordpress], [before 4.0.10]]","[[], [0.99890494], [0.99996024]]",[],[zoom wordpress],[before 4.0.10],[],[0.99890494],[0.99996024],[],[],[zoom wordpress],[0.99890494],1,wordpress,wordpress,12,zero_spam_for_wordpress,highfivery,highfivery,"('zero_spam_for_wordpress', 'zero-spam')"
26,CVE-2022-23179,107447,themehunk,contact_form_\&_lead_form_elementor_builder,1.5.0,"The Contact Form & Lead Form Elementor Builder WordPress plugin before 1.7.0 does not escape some of its form fields before outputting them in attributes, which could allow high privilege users to...",cpe:2.3:a:themehunk:contact_form_\&_lead_form_elementor_builder:1.5.0:*:*:*:*:wordpress:*:*,0,0,"[[], [elementor builder, wordpress], [before 1.7.0]]","[[], [0.7133889, 0.9590189], [0.9999594]]",[],"[elementor builder, wordpress]",[before 1.7.0],[],"[0.7133889, 0.9590189]",[0.9999594],[],[],[wordpress],[0.9590189],1,wordpress,wordpress,1,wordpress,wordpress,wordpress,"('wordpress', 'wordpress_mu', 'wordspew')"
27,CVE-2022-34868,168232,yookassa,yukassa_for_woocommerce,2.1.4,Authenticated Arbitrary Settings Update vulnerability in YooMoney ?Kassa ??? WooCommerce plugin <= 2.3.0 at WordPress.,cpe:2.3:a:yookassa:yukassa_for_woocommerce:2.1.4:*:*:*:*:wordpress:*:*,0,0,"[[yoomoney], [woocommerce], [<= 2.3.0]]","[[0.9996438], [0.97338337], [0.9999456]]",[yoomoney],[woocommerce],[<= 2.3.0],[0.9996438],[0.97338337],[0.9999456],[yoomoney],[0.9996438],[woocommerce],[0.97338337],1,woocommerce,woocommerce,1,woocommerce,woocommerce,automattic,"('woocommerce', 'woocommerce', 'woocommerce')"
48,CVE-2023-0816,294128,strategy11,formidable_form_builder,4.04.04,"The Formidable Forms WordPress plugin before 6.1 uses several potentially untrusted headers to determine the IP address of the client, leading to IP Address spoofing and bypass of anti-spam protec...",cpe:2.3:a:strategy11:formidable_form_builder:4.04.04:*:*:*:*:wordpress:*:*,0,0,"[[], [formidable forms wordpress], [before 6.1]]","[[], [0.99577445], [0.9999571]]",[],[formidable forms wordpress],[before 6.1],[],[0.99577445],[0.9999571],[],[],[formidable forms wordpress],[0.99577445],0,formidable_forms,strategy11,19,post_form_registration_form_profile_form_for_user_profiles_and_content_forms,themekraft,themekraft,"('post_form', 'post_form_registration_form_profile_form_for_user_profiles_and_content_forms', 'buddyforms')"
73,CVE-2022-0208,63809,mappresspro,mappress,2.49.2,"The MapPress Maps for WordPress plugin before 2.73.4 does not sanitise and escape the mapid parameter before outputting it back in the ""Bad mapid"" error message, leading to a Reflected Cross-Site ...",cpe:2.3:a:mappresspro:mappress:2.49.2:beta:*:*:pro:wordpress:*:*,0,1,"[[], [mappress maps, wordpress], [before 2.73.4]]","[[], [0.8558816, 0.90141535], [0.99994564]]",[],"[mappress maps, wordpress]",[before 2.73.4],[],"[0.8558816, 0.90141535]",[0.99994564],[],[],[wordpress],[0.90141535],1,wordpress,wordpress,1,wordpress,wordpress,wordpress,"('wordpress', 'wordpress_mu', 'wordspew')"
75,CVE-2022-0864,76989,updraftplus,updraftplus,1.9.64,"The UpdraftPlus WordPress Backup Plugin WordPress plugin before 1.22.9 does not sanitise and escape the updraft_interval parameter before outputting it back in an admin page, leading to a Reflecte...",cpe:2.3:a:updraftplus:updraftplus:1.9.64:*:*:*:*:wordpress:*:*,1,1,"[[updraftplus], [wordpress backup, wordpress], [before 1.22.9]]","[[0.9832593], [0.93556273, 0.70108855], [0.99995124]]",[updraftplus],"[wordpress backup, wordpress]",[before 1.22.9],[0.9832593],"[0.93556273, 0.70108855]",[0.99995124],[updraftplus],[0.9832593],[wordpress backup],[0.93556273],1,wordpress,wordpress,15,wordpress_backup_to_dropbox,wordpress_backup_to_dropbox_project,wordpress_backup_to_dropbox_project,"('wordpress_backup_to_dropbox',)"
88,CVE-2022-4623,209033,nicdark,nd_shortcodes,5.3,"The ND Shortcodes WordPress plugin before 7.0 does not validate and escape numerous of its shortcode attributes before outputting them back in a page/post where the shortcode is embed, which could...",cpe:2.3:a:nicdark:nd_shortcodes:5.3:*:*:*:*:wordpress:*:*,0,0,"[[], [nd shortcodes wordpress], [before 7.0]]","[[], [0.9164405], [0.9999592]]",[],[nd shortcodes wordpress],[before 7.0],[],[0.9164405],[0.9999592],[],[],[nd shortcodes wordpress],[0.9164405],1,nd_shortcodes,nicdark,17,password_reset_with_code_for_wordpress_rest_api,bedevious,bedevious,"('password_reset_with_code_for_wordpress_rest_api',)"
109,CVE-2021-24348,606214,wow-estore,side_menu,3.1,"The menu delete functionality of the Side Menu – add fixed side buttons WordPress plugin before 3.1.5, available to Administrator users takes the did GET parameter and uses it into an SQL statemen...",cpe:2.3:a:wow-estore:side_menu:3.1:*:*:*:*:wordpress:*:*,0,0,"[[], [wordpress], [before 3.1.5,]]","[[], [0.9997962], [0.99996156]]",[],[wordpress],"[before 3.1.5,]",[],[0.9997962],[0.99996156],[],[],[wordpress],[0.9997962],0,wordpress,wordpress,1,wordpress,wordpress,wordpress,"('wordpress', 'wordpress_mu', 'wordspew')"
111,CVE-2022-2696,138859,oracle,restaurant_menu_-_food_ordering_system_-_table_reservation,1.3.1,"The Restaurant Menu – Food Ordering System – Table Reservation plugin for WordPress is vulnerable to authorization bypass via several AJAX actions in versions up to, and including 2.3.0 due to mis...",cpe:2.3:a:oracle:restaurant_menu_-_food_ordering_system_-_table_reservation:1.3.1:*:*:*:*:wordpress:*:*,0,0,"[[], [table reservation], [up to, and including 2.3.0]]","[[], [0.99951965], [0.9989704]]",[],[table reservation],"[up to, and including 2.3.0]",[],[0.99951965],[0.9989704],[],[],[table reservation],[0.99951965],1,ereservations,enthrallweb,16,food-order-and-table-reservation-system,food-order-and-table-reservation-system_project,food-order-and-table-reservation-system_project,"('food-order-and-table-reservation-system',)"


An issue was discovered in the Responsive Poll through 1.3.4 for Wordpress. It allows an unauthenticated user to manipulate polls, e.g., delete, clone, or view a hidden poll. This is due to the usage of the callback wp_ajax_nopriv function in Includes/Total-Soft-Poll-Ajax.php for sensitive operations.

In [32]:
flag = df_test.apply(lambda x: x['product'] in x['matched_db_product_adv'], axis=1)
df_analyze = df_test[~flag][['cve_id', 'descr', 'product', 'product_ner', 'matched_db_product_adv']]
print(df_analyze.shape)
df_analyze

(84, 5)


Unnamed: 0,cve_id,descr,product,product_ner,matched_db_product_adv
2,CVE-2018-7279,A remote code execution issue was discovered in AlienVault USM and OSSIM before 5.5.1.,open_source_security_information_management,[usm],"('zeuscms',)"
3,CVE-2020-24743,"An issue was found in /showReports.do Zoho ManageEngine Applications Manager up to 14550, allows attackers to gain escalated privileges via the resourceid parameter.",manageengine_applications_manager,[applications manager],"('applications_manager', 'applications', 'applications_manager')"
4,CVE-2020-24786,"An issue was discovered in Zoho ManageEngine Exchange Reporter Plus before build number 5510, AD360 before build number 4228, ADSelfService Plus before build number 5817, DataSecurity Plus before ...",manageengine_o365_manager_plus,"[manageengine exchange reporter plus, adselfservice plus, datasecurity plus, recovermanager plus, eventlog analyzer, adaudit plus, o365 manager plus, cloud security plus, admanager plus, log360, j...","('java', 'java_communications_services_delegated_administrator', 'javamail')"
5,CVE-2013-3607,"Multiple stack-based buffer overflows in the web interface in the Intelligent Platform Management Interface (IPMI) implementation on Supermicro H8DC*, H8DG*, H8SCM-F, H8SGL-F, H8SM*, X7SP*, X8DT*,...",x9dax-if,[],['']
7,CVE-2018-15121,An issue was discovered in Auth0 auth0-aspnet and auth0-aspnet-owin. Affected packages do not use or validate the state parameter of the OAuth 2.0 and OpenID Connect protocols. This leaves applica...,aspnet,"[auth0-aspnet, oauth, openid connect]","('oauth', 'cloudtoken', 'sourcetree')"
11,CVE-2006-5093,PHP remote file inclusion vulnerability in index.php in Tagmin Control Center in TagIt! Tagboard 2.1.B Build 2 allows remote attackers to execute arbitrary PHP code via a URL in the page parameter.,tagmin_control_center,[tagboard],"('tagboard',)"
12,CVE-2022-3768,"The WPSmartContracts WordPress plugin before 1.3.12 does not properly sanitise and escape a parameter before using it in a SQL statement, leading to a SQL injection exploitable by users with a rol...",wpsmartcontracts,"[wpsmartcontracts, wordpress]","('wordpress', 'wordpress_mu', 'wordspew')"
16,CVE-2022-32169,The “Bytebase” application does not restrict low privilege user to access “admin issues“ for which an unauthorized user can view the “OPEN” and “CLOSED” issues by “Admin” and the affected endpoint...,bytebase,[],['']
17,CVE-2023-48795,"The SSH transport protocol with certain OpenSSH extensions, found in OpenSSH before 9.6 and other products, allows remote attackers to bypass integrity checks such that some packets are omitted (f...",filezilla_client,"[openssh, openssh, java ssh api, putty, asyncssh, libssh, libssh2, sftp gateway, tera term, paramiko, jsch, sftpgo, pfsense plus, hpn-ssh, proftpd, cyclonessh, xshell, crushftp, connectbot, ssh li...","('paramiko', 'paramiko', 'python')"
22,CVE-2022-4578,"The Video Conferencing with Zoom WordPress plugin before 4.0.10 does not validate and escape some of its shortcode attributes before outputting them back in the page, which could allow users with ...",video_conferencing_with_zoom,[zoom wordpress],"('zero_spam_for_wordpress', 'zero-spam')"


In [122]:
# Example text (replace with your CVE text)
cve_text = df_analyze.descr.iloc[33]
print(cve_text)

cve_text = '''
Firefly III (aka firefly-iii) before 6.1.1 allows webhooks HTML Injection.'''

xlockmore before 5.43 'dclock' security bypass vulnerability


In [123]:
token_classifier = pipeline(
    "token-classification", model=final_model, aggregation_strategy="first", tokenizer=final_tokenizer
)
token_classifier(cve_text)

Device set to use cpu


[{'entity_group': 'product',
  'score': 0.676458,
  'word': '\nFirefly',
  'start': 0,
  'end': 8},
 {'entity_group': 'product',
  'score': 0.8607605,
  'word': ' firefly-iii)',
  'start': 18,
  'end': 30},
 {'entity_group': 'version',
  'score': 0.999917,
  'word': ' before 6.1.1',
  'start': 31,
  'end': 43}]

In [128]:
extract_ners(cve_text)

Device set to use cpu


{'ners': [[], ['firefly', 'firefly-iii)'], ['before 6.1.1']],
 'scores': [[], ['0.676458', '0.8607605'], ['0.999917']]}

In [129]:
deduplicate_using_probs(extract_ners(cve_text)['ners'][1], extract_ners(cve_text)['scores'][1])

Device set to use cpu
Device set to use cpu


(['firefly-iii)'], ['0.8607605'])

In [54]:
# Tokenize input
inputs = final_tokenizer(
    cve_text,
    return_tensors="pt",
    truncation=True,
)

# Get model outputs (logits)
with torch.no_grad():
    outputs = final_model(**inputs)
logits = outputs.logits[0]  # Shape: [sequence_length, num_labels]

# Compute probabilities using softmax
probabilities = torch.softmax(logits, dim=-1).numpy()  # Convert to numpy array

# Get label names and tokens
label_names = final_model.config.id2label
tokens = final_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# Extract results for each token (excluding special tokens)
results = []
for i, token in enumerate(tokens):
    # Skip special tokens
    if token in [final_tokenizer.cls_token, final_tokenizer.sep_token, final_tokenizer.pad_token]:
        continue
    
    # Get probabilities for all labels
    token_probs = {
        label_names[label_id]: float(prob)
        for label_id, prob in enumerate(probabilities[i])
    }
    
    results.append({
        "token": token,
        "probabilities": token_probs
    })

# Now `results` contains probabilities for all labels per token
for res in results:
    print(f"Token: {res['token']}")
    for label, prob in res["probabilities"].items():
        print(f"  {label}: {prob:.4f}")
    print()

Token: Ġx
  O: 0.0039
  B-product: 0.0218
  I-product: 0.0002
  B-vendor: 0.0113
  I-vendor: 0.0000
  B-version: 0.9600
  I-version: 0.0029

Token: lock
  O: 0.0025
  B-product: 0.0175
  I-product: 0.0002
  B-vendor: 0.0083
  I-vendor: 0.0000
  B-version: 0.9671
  I-version: 0.0045

Token: more
  O: 0.0022
  B-product: 0.0107
  I-product: 0.0001
  B-vendor: 0.0056
  I-vendor: 0.0000
  B-version: 0.9734
  I-version: 0.0081

Token: Ġbefore
  O: 0.0024
  B-product: 0.0004
  I-product: 0.0001
  B-vendor: 0.0003
  I-vendor: 0.0001
  B-version: 0.2875
  I-version: 0.7092

Token: Ġ5
  O: 0.0000
  B-product: 0.0000
  I-product: 0.0000
  B-vendor: 0.0000
  I-vendor: 0.0000
  B-version: 0.0000
  I-version: 1.0000

Token: .
  O: 0.0000
  B-product: 0.0000
  I-product: 0.0000
  B-vendor: 0.0000
  I-vendor: 0.0000
  B-version: 0.0000
  I-version: 1.0000

Token: 43
  O: 0.0000
  B-product: 0.0000
  I-product: 0.0000
  B-vendor: 0.0000
  I-vendor: 0.0000
  B-version: 0.0000
  I-version: 1.0000

Token

Анализ ошибок на уровне модели:

* + CVE-2020-24743, CVE-2018-18551 -- manageengine_applications_manager разбился в тексте на вендора и продукт
* + CVE-2006-5093, CVE-2022-4578 -- не нашел нужный продукт в тексте
* + CVE-2022-3768 -- не нашел нужный продукт в тексте (подставил Wordpress)
* CVE-2022-32169 -- плохо токенизируемые символы (“)
* + CVE-2024-20803, CVE-2017-5703, CVE-2022-24436 -- продукта совсем нет в тексте описания (часто связаны с hardware или ОС)
* + CVE-2018-7279 -- продукта нет в тексте описания, только абревиатура
* 

In [70]:
s = '''
The Bytebase application does not restrict low privilege user to access “admin issues“ for which an unauthorized user can view the “OPEN” and “CLOSED” issues by “Admin” and the affected endpoint is “/issue”.
'''

In [71]:
extract_ners(s)

Device set to use cpu


{'ners': [['bytebase'], [], []], 'scores': [['0.90411174'], [], []]}

### Анализ дедубликации: шаг 2

* CVE-2018-15121, CVE-2022-32169 -- после дедубликации не тот продукт


In [72]:
df_test[df_test.cve_id.isin(['CVE-2018-15121', 'CVE-2022-32169'])]

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor_ner,dedup_vendor_score,dedup_product_ner,dedup_product_score,true_version_in_predicted,matched_db_product_lev,matched_db_vendor_lev,score_lev,matched_db_product_lcs,matched_db_vendor_lcs,matched_db_vendor_adv,matched_db_product_adv
7,CVE-2018-15121,522169,auth0,aspnet,-,An issue was discovered in Auth0 auth0-aspnet and auth0-aspnet-owin. Affected packages do not use or validate the state parameter of the OAuth 2.0 and OpenID Connect protocols. This leaves applica...,cpe:2.3:a:auth0:aspnet:-:*:*:*:*:*:*:*,1,1,"[[auth0], [auth0-aspnet, oauth, openid connect], [2.0]]","[[0.99983096], [0.92818856, 0.9774277, 0.88269305], [0.9996136]]",[auth0],"[auth0-aspnet, oauth, openid connect]",[2.0],[0.99983096],"[0.92818856, 0.9774277, 0.88269305]",[0.9996136],[auth0],[0.99983096],[oauth],[0.9774277],0,oauth,atlassian,1,oauth,atlassian,atlassian,"('oauth', 'cloudtoken', 'sourcetree')"
16,CVE-2022-32169,159810,bytebase,bytebase,0.8.1,The “Bytebase” application does not restrict low privilege user to access “admin issues“ for which an unauthorized user can view the “OPEN” and “CLOSED” issues by “Admin” and the affected endpoint...,cpe:2.3:a:bytebase:bytebase:0.8.1:*:*:*:*:*:*:*,1,1,"[[], [], []]","[[], [], []]",[],[],[],[],[],[],[],[],[],[],0,,,0,,,,['']


In [None]:
[['auth0'], ['auth0-aspnet', 'oauth', 'openid connect'], ['2.0']]

### Анализ генерации версий: шаг 3

In [73]:
df_test[df_test.true_version_in_predicted == 0]

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor_ner,dedup_vendor_score,dedup_product_ner,dedup_product_score,true_version_in_predicted,matched_db_product_lev,matched_db_vendor_lev,score_lev,matched_db_product_lcs,matched_db_vendor_lcs,matched_db_vendor_adv,matched_db_product_adv
1,CVE-2014-7221,722762,teamspeak,teamspeak3,3.0.7.1,TeamSpeak Client 3.0.14 and earlier allows remote authenticated users to cause a denial of service (buffer overflow and application crash) by connecting to a channel with a different client instan...,cpe:2.3:a:teamspeak:teamspeak3:3.0.7.1:*:*:*:client:*:*:*,1,0,"[[], [teamspeak], [3.0.14 and earlier]]","[[], [0.9999192], [0.9999383]]",[],[teamspeak],[3.0.14 and earlier],[],[0.9999192],[0.9999383],[],[],[teamspeak],[0.9999192],0,teamspeak,teamspeak,1,teamspeak,teamspeak,teamspeak,"('teamspeak', 'teamspeak3', 'teamspeak_client')"
2,CVE-2018-7279,541558,alienvault,open_source_security_information_management,5.3,A remote code execution issue was discovered in AlienVault USM and OSSIM before 5.5.1.,cpe:2.3:a:alienvault:open_source_security_information_management:5.3:*:*:*:*:*:*:*,1,0,"[[alienvault], [usm], [before 5.5.1.]]","[[0.9999037], [0.9814162], [0.99995714]]",[alienvault],[usm],[before 5.5.1.],[0.9999037],[0.9814162],[0.99995714],[alienvault],[0.9999037],[usm],[0.9814162],0,scm,scm_project,3,zeuscms,zeuscms,zeuscms,"('zeuscms',)"
3,CVE-2020-24743,472694,zohocorp,manageengine_applications_manager,14.5,"An issue was found in /showReports.do Zoho ManageEngine Applications Manager up to 14550, allows attackers to gain escalated privileges via the resourceid parameter.",cpe:2.3:a:zohocorp:manageengine_applications_manager:14.5:build14540:*:*:*:*:*:*,0,0,"[[manageengine], [applications manager], [up to 14550,]]","[[0.95109195], [0.888083], [0.9999476]]",[manageengine],[applications manager],"[up to 14550,]",[0.95109195],[0.888083],[0.9999476],[manageengine],[0.95109195],[applications manager],[0.888083],0,applications_manager,manageengine,19,applications_manager,manageengine,manageengine,"('applications_manager', 'applications', 'applications_manager')"
4,CVE-2020-24786,472744,zohocorp,manageengine_o365_manager_plus,4.3,"An issue was discovered in Zoho ManageEngine Exchange Reporter Plus before build number 5510, AD360 before build number 4228, ADSelfService Plus before build number 5817, DataSecurity Plus before ...",cpe:2.3:a:zohocorp:manageengine_o365_manager_plus:4.3:4304:*:*:*:*:*:*,0,0,"[[zoho], [manageengine exchange reporter plus, adselfservice plus, datasecurity plus, recovermanager plus, eventlog analyzer, adaudit plus, o365 manager plus, cloud security plus, admanager plus, ...","[[0.98405576], [0.9567048, 0.88903725, 0.9884461, 0.9938674, 0.98616344, 0.9817395, 0.962745, 0.9839506, 0.94795024, 0.59385043, 0.99802125], []]",[zoho],"[manageengine exchange reporter plus, adselfservice plus, datasecurity plus, recovermanager plus, eventlog analyzer, adaudit plus, o365 manager plus, cloud security plus, admanager plus, log360, j...",[],[0.98405576],"[0.9567048, 0.88903725, 0.9884461, 0.9938674, 0.98616344, 0.9817395, 0.962745, 0.9839506, 0.94795024, 0.59385043, 0.99802125]",[],[zoho],[0.98405576],[java servlet],[0.99802125],0,java_asp_server,sun,11,java_communications_services_delegated_administrator,sun,sun,"('java', 'java_communications_services_delegated_administrator', 'javamail')"
5,CVE-2013-3607,553572,supermicro,x9dax-if,-,"Multiple stack-based buffer overflows in the web interface in the Intelligent Platform Management Interface (IPMI) implementation on Supermicro H8DC*, H8DG*, H8SCM-F, H8SGL-F, H8SM*, X7SP*, X8DT*,...",cpe:2.3:h:supermicro:x9dax-if:-:*:*:*:*:*:*:*,1,0,"[[supermicro], [], []]","[[0.9989392], [], []]",[supermicro],[],[],[0.9989392],[],[],[supermicro],[0.9989392],[],[],0,,,0,,,,['']
7,CVE-2018-15121,522169,auth0,aspnet,-,An issue was discovered in Auth0 auth0-aspnet and auth0-aspnet-owin. Affected packages do not use or validate the state parameter of the OAuth 2.0 and OpenID Connect protocols. This leaves applica...,cpe:2.3:a:auth0:aspnet:-:*:*:*:*:*:*:*,1,1,"[[auth0], [auth0-aspnet, oauth, openid connect], [2.0]]","[[0.99983096], [0.92818856, 0.9774277, 0.88269305], [0.9996136]]",[auth0],"[auth0-aspnet, oauth, openid connect]",[2.0],[0.99983096],"[0.92818856, 0.9774277, 0.88269305]",[0.9996136],[auth0],[0.99983096],[oauth],[0.9774277],0,oauth,atlassian,1,oauth,atlassian,atlassian,"('oauth', 'cloudtoken', 'sourcetree')"
8,CVE-2013-2175,549900,haproxy,haproxy,1.4.17,"HAProxy 1.4 before 1.4.24 and 1.5 before 1.5-dev19, when configured to use hdr_ip or other ""hdr_*"" functions with a negative occurrence count, allows remote attackers to cause a denial of service ...",cpe:2.3:a:haproxy:haproxy:1.4.17:*:*:*:*:*:*:*,1,1,"[[], [haproxy], [1.4 before 1.4.24, 1.5 before 1.5-dev19,]]","[[], [0.9999064], [0.9999666, 0.9999676]]",[],[haproxy],"[1.4 before 1.4.24, 1.5 before 1.5-dev19,]",[],[0.9999064],"[0.9999666, 0.9999676]",[],[],[haproxy],[0.9999064],0,haproxy,netgate,1,haproxy,netgate,haproxy,"('haproxy', 'haproxy', 'proxyprotocol')"
10,CVE-2023-4393,375964,liquidfiles,liquidfiles,1.6.23,"HTML and SMTP injections on the registration page of LiquidFiles versions 3.7.13 and below, allow an attacker to perform more advanced phishing attacks against an organization.",cpe:2.3:a:liquidfiles:liquidfiles:1.6.23:*:*:*:*:*:*:*,1,1,"[[], [liquidfiles], [3.7.13 and below,]]","[[], [0.99991], [0.83767396]]",[],[liquidfiles],"[3.7.13 and below,]",[],[0.99991],[0.83767396],[],[],[liquidfiles],[0.99991],0,liquidfiles,liquidfiles,1,liquidfiles,liquidfiles,liquidfiles,"('liquidfiles',)"
11,CVE-2006-5093,593227,paul_schudar,tagmin_control_center,2.1.b_build_2,PHP remote file inclusion vulnerability in index.php in Tagmin Control Center in TagIt! Tagboard 2.1.B Build 2 allows remote attackers to execute arbitrary PHP code via a URL in the page parameter.,cpe:2.3:a:paul_schudar:tagmin_control_center:2.1.b_build_2:*:*:*:*:*:*:*,0,0,"[[], [tagboard], [2.1.b]]","[[], [0.99991417], [0.9999603]]",[],[tagboard],[2.1.b],[],[0.99991417],[0.9999603],[],[],[tagboard],[0.99991417],0,tagboard,tagit,1,tagboard,tagit,tagit,"('tagboard',)"
13,CVE-2014-4700,719860,citrix,xendesktop,4.0,"Citrix XenDesktop 7.x, 5.x, and 4.x, when pooled random desktop groups is enabled and ShutdownDesktopsAfterUse is disabled, allows local guest users to gain access to another user's desktop via un...",cpe:2.3:a:citrix:xendesktop:4.0:*:*:*:*:*:*:*,1,1,"[[citrix], [xendesktop], [7.x,, 5.x,, 4.x,]]","[[0.999882], [0.9998907], [0.9999678, 0.9999691, 0.9999697]]",[citrix],[xendesktop],"[7.x,, 5.x,, 4.x,]",[0.999882],[0.9998907],"[0.9999678, 0.9999691, 0.9999697]",[citrix],[0.999882],[xendesktop],[0.9998907],0,xendesktop,citrix,1,xendesktop,citrix,citrix,"('xen', 'xendesktop', 'xp')"


In [78]:
df_bad_versions = df_test[df_test.true_version_in_predicted == 0]

In [79]:
df_bad_versions['version_len'] = df_bad_versions.version.str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad_versions['version_len'] = df_bad_versions.version.str.len()


In [80]:
df_bad_versions['version_len'].value_counts()

5.0     20
3.0     15
6.0     12
7.0     11
1.0      9
4.0      7
12.0     3
8.0      2
13.0     1
16.0     1
Name: version_len, dtype: int64

In [100]:
df_bad_versions[['cve_id', 'version', 'version_ner']]

Unnamed: 0,cve_id,version,version_ner
1,CVE-2014-7221,3.0.7.1,[3.0.14 and earlier]
2,CVE-2018-7279,5.3,[before 5.5.1.]
3,CVE-2020-24743,14.5,"[up to 14550,]"
4,CVE-2020-24786,4.3,[]
5,CVE-2013-3607,-,[]
7,CVE-2018-15121,-,[2.0]
8,CVE-2013-2175,1.4.17,"[1.4 before 1.4.24, 1.5 before 1.5-dev19,]"
10,CVE-2023-4393,1.6.23,"[3.7.13 and below,]"
11,CVE-2006-5093,2.1.b_build_2,[2.1.b]
13,CVE-2014-4700,4.0,"[7.x,, 5.x,, 4.x,]"


* у 6 записей нет СРЕ для проверки
* ['before 3.0.367']

In [114]:
classify_version_string('3.7.13 and below')

(['3.7.13'], 'other')

In [94]:
cve_2_all_versions.keys()

dict_keys(['CVE-2021-34085', 'CVE-2014-7221', 'CVE-2018-7279', 'CVE-2020-24743', 'CVE-2020-24786', 'CVE-2013-3607', 'CVE-2019-13183', 'CVE-2018-15121', 'CVE-2013-2175', 'CVE-2016-10714', 'CVE-2023-4393', 'CVE-2006-5093', 'CVE-2022-3768', 'CVE-2014-4700', 'CVE-2015-8076', 'CVE-2019-14862', 'CVE-2022-32169', 'CVE-2023-48795', 'CVE-2021-26754', 'CVE-2023-23723', 'CVE-2023-52323', 'CVE-2005-2556', 'CVE-2022-4578', 'CVE-2023-6998', 'CVE-2015-9438', 'CVE-2005-0064', 'CVE-2022-23179', 'CVE-2022-34868', 'CVE-2020-7981', 'CVE-2021-4306', 'CVE-2024-24831', 'CVE-2022-24709', 'CVE-2023-34104', 'CVE-2016-1409', 'CVE-2024-20803', 'CVE-2013-6440', 'CVE-2023-20903', 'CVE-2018-18551', 'CVE-2016-2054', 'CVE-2024-1078', 'CVE-2014-1740', 'CVE-2023-28110', 'CVE-2021-45099', 'CVE-2017-5703', 'CVE-2024-22075', 'CVE-2018-12491', 'CVE-2019-9606', 'CVE-2007-2829', 'CVE-2023-0816', 'CVE-2023-48226', 'CVE-2023-46234', 'CVE-2023-51652', 'CVE-2007-0851', 'CVE-2020-10591', 'CVE-2022-2089', 'CVE-2020-23622', 'CVE-202

In [115]:
list(filter(lambda x: x.split('.')[0] == '3', cve_2_all_versions['CVE-2014-7221']))

['3.0.0',
 '3.0.1',
 '3.0.2',
 '3.0.3',
 '3.0.4',
 '3.0.5',
 '3.0.6',
 '3.0.7',
 '3.0.8',
 '3.0.9',
 '3.0.10',
 '3.0.11',
 '3.0.12',
 '3.0.13',
 '3.0.14']

### Анализ поиска в БД: шаг 4

CVE-2024-22075, CVE-2023-0816, CVE-2023-51652, CVE-2020-23622 -- вроде нашел правильно продукт, почему не подтянул?

In [None]:
df

In [137]:
df_test[df_test.cve_id == "CVE-2023-0816"][['vendor', 'product', 'product_ner', 'dedup_product_ner', 'matched_db_product_adv']]

Unnamed: 0,vendor,product,product_ner,dedup_product_ner,matched_db_product_adv
48,strategy11,formidable_form_builder,[formidable forms wordpress],[formidable forms wordpress],"('post_form', 'post_form_registration_form_profile_form_for_user_profiles_and_content_forms', 'buddyforms')"


In [132]:
pr = 'formidable forms wordpress'

In [133]:
# Проверка работы
ratio_scores = []
for ent in unique_products:
    ratio_scores.append(ratio(pr, ent))
ratio_scores = np.array(ratio_scores)
print(sorted(ratio_scores, reverse=True)[:5])
n = list(np.argwhere(ratio_scores >= 0.7).reshape(1, -1))
unique_products[n]

[0.7142857142857143, 0.6938775510204082, 0.653061224489796, 0.6363636363636364, 0.6122448979591837]


  unique_products[n]


array(['formidable_forms'], dtype=object)

In [139]:
pylcs.lcs(pr, 'formidable_form_builder')

16

In [140]:

(prod, score)= get_lcs(pr, unique_products)
print(f'Found product in DB: {prod} with score {score}')
df_all = get_df_from_bd(f"select distinct vendor, product from cpes where vendor in (select vendor from cpes where product = '{prod}')")
found_candidates = retrieve_top_k(prod, df_all['product'].tolist(), top_k=10)
found_candidates

Found product in DB: post_form_registration_form_profile_form_for_user_profiles_and_content_forms with score 19


('post_form',
 'post_form_registration_form_profile_form_for_user_profiles_and_content_forms',
 'buddyforms',
 'tk_google_fonts_gdpr_compliant')

In [136]:
df_all

Unnamed: 0,vendor,product
0,themekraft,buddyforms
1,themekraft,post_form
2,themekraft,post_form_registration_form_profile_form_for_user_profiles_and_content_forms
3,themekraft,tk_google_fonts_gdpr_compliant
