In [3]:
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                         pipeline)
import pandas as pd
import re
from itertools import product
from pprint import pprint
import numpy as np
import pylcs
import psycopg2 as p2
from psycopg2 import sql
from collections import Counter
from tqdm import tqdm
from rapidfuzz import fuzz
from Levenshtein import ratio

pd.set_option('display.width', 20000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

In [4]:
dbname = "vulns_scanner"
user = 'postgres'
password = 'postgres'
host = 'localhost'
port = '5432'

In [5]:
def get_df_from_bd(q):
    conn = p2.connect(dbname=dbname, user=user, password=password, host=host, port=port)
    cur = conn.cursor()
    cur.execute(q)
    colnames = [desc[0] for desc in cur.description]
    tuples = cur.fetchall()
    cur.close()
    df = pd.DataFrame(tuples, columns=colnames)
    return df

In [13]:
path_to_model = "/home/mikhail/Documents/pandan_study/vkr/vulns_scanner/mikhail_code/models/nuner_180525_full_dataset"
final_tokenizer = AutoTokenizer.from_pretrained(path_to_model, use_fast=True, add_prefix_space=True, local_files_only=True)
final_model = AutoModelForTokenClassification.from_pretrained(path_to_model, local_files_only=True)


In [14]:
df_test = pd.read_csv('df_200_not_in_stucco_v3_180525.csv')

In [15]:
def extract_ners(cve, tokenizer=final_tokenizer, model=final_model):
    token_classifier = pipeline(
        "token-classification", model=final_model, aggregation_strategy="first", tokenizer=final_tokenizer
    )
    result = token_classifier(cve)
    vendor = []
    product = []
    version = []
    vendor_probs = []
    product_probs = []
    version_probs = []

    for ner_item in result:
        if ner_item['entity_group'] == 'vendor':
            vendor.append(str.lower(ner_item['word'].strip()))
            vendor_probs.append(str.lower(str(ner_item['score'])))
        elif ner_item['entity_group'] == 'product':
            product.append(str.lower(str(ner_item['word'].strip())))
            product_probs.append(str.lower(str(ner_item['score'])))
        elif ner_item['entity_group'] == 'version':
            version.append(str.lower(str(ner_item['word'].strip())))
            version_probs.append(str.lower(str(ner_item['score'])))
    return {'ners': [vendor, product, version], 'scores': [vendor_probs, product_probs, version_probs]}

In [110]:
s = 'OX App Suite through 7.10.3 allows Information Exposure because a user can obtain the IP address and User-Agent string of a different user (via the session API during shared Drive access).'
extract_ners(s)

Device set to use cpu


{'ners': [[], ['ox app suite'], ['through 7.10.3']],
 'scores': [[], ['0.9998378'], ['0.9999568']]}

In [16]:
df_test[['ners_list', 'scores_list']] = df_test['descr'].apply(lambda x: extract_ners(x)).apply(pd.Series)
df_test['vendor_ner'], df_test['product_ner'], df_test['version_ner'] = zip(*df_test['ners_list'])
df_test['vendor_score_ner'], df_test['product_score_ner'], df_test['version_score_ner'] = zip(*df_test['scores_list'])

Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set

In [17]:
def deduplicate_using_probs(row, ner_type):
    if ner_type == 'product':
        product_ner = row['product_ner']
        product_score_ner = row['product_score_ner']
        if not product_ner or len(product_ner) == 1:
            return product_ner, product_score_ner
        else:
            max_idx = product_score_ner.index(max(product_score_ner))
            return [[product_ner[max_idx]], [product_score_ner[max_idx]]]
    elif ner_type == 'vendor':
        vendor_ner = row['vendor_ner']
        vendor_score_ner = row['vendor_score_ner']
        if not vendor_ner or len(vendor_ner) == 1:
            return vendor_ner, vendor_score_ner
        else:
            max_idx = vendor_score_ner.index(max(vendor_score_ner))
            return [[vendor_ner[max_idx]], [vendor_score_ner[max_idx]]]



In [18]:
df_test[['dedup_vendor', 'dedup_vendor_score']] = df_test.apply(lambda x: deduplicate_using_probs(x, 'vendor'), axis=1).apply(pd.Series)
df_test[['dedup_product', 'dedup_product_score']] = df_test.apply(lambda x: deduplicate_using_probs(x, 'product'), axis=1).apply(pd.Series)

In [98]:
df_test

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor,dedup_vendor_score,dedup_product,dedup_product_score,true_version_in_predicted,matched_db_product,matched_db_vendor
0,CVE-2021-34085,628902,glensawyer,mp3gain,1.3.4,Read access violation in the III_dequantize_sample function in mpglibDBL/layer3.c in mp3gain through 1.5.2-r2 allows remote attackers to cause a denial of service (application crash) or possibly h...,cpe:2.3:a:glensawyer:mp3gain:1.3.4:beta:*:*:*:*:*:*,0,1,"[[], [mp3gain], [through 1.5.2-r2]]","[[], [0.99991965], [0.99996096]]",[],[mp3gain],[through 1.5.2-r2],[],[0.99991965],[0.99996096],[],[],[mp3gain],[0.99991965],1,mp3gain,glensawyer
1,CVE-2014-7221,722762,teamspeak,teamspeak3,3.0.7.1,TeamSpeak Client 3.0.14 and earlier allows remote authenticated users to cause a denial of service (buffer overflow and application crash) by connecting to a channel with a different client instan...,cpe:2.3:a:teamspeak:teamspeak3:3.0.7.1:*:*:*:client:*:*:*,1,0,"[[], [teamspeak], [3.0.14 and earlier]]","[[], [0.9999192], [0.9999383]]",[],[teamspeak],[3.0.14 and earlier],[],[0.9999192],[0.9999383],[],[],[teamspeak],[0.9999192],0,teamspeak,teamspeak
2,CVE-2018-7279,541558,alienvault,open_source_security_information_management,5.3,A remote code execution issue was discovered in AlienVault USM and OSSIM before 5.5.1.,cpe:2.3:a:alienvault:open_source_security_information_management:5.3:*:*:*:*:*:*:*,1,0,"[[alienvault], [usm], [before 5.5.1.]]","[[0.9999037], [0.9814162], [0.99995714]]",[alienvault],[usm],[before 5.5.1.],[0.9999037],[0.9814162],[0.99995714],[alienvault],[0.9999037],[usm],[0.9814162],0,usermin,usermin
3,CVE-2020-24743,472694,zohocorp,manageengine_applications_manager,14.5,"An issue was found in /showReports.do Zoho ManageEngine Applications Manager up to 14550, allows attackers to gain escalated privileges via the resourceid parameter.",cpe:2.3:a:zohocorp:manageengine_applications_manager:14.5:build14540:*:*:*:*:*:*,0,0,"[[manageengine], [applications manager], [up to 14550,]]","[[0.95109195], [0.888083], [0.9999476]]",[manageengine],[applications manager],"[up to 14550,]",[0.95109195],[0.888083],[0.9999476],[manageengine],[0.95109195],[applications manager],[0.888083],0,applications_manager,manageengine
4,CVE-2020-24786,472744,zohocorp,manageengine_o365_manager_plus,4.3,"An issue was discovered in Zoho ManageEngine Exchange Reporter Plus before build number 5510, AD360 before build number 4228, ADSelfService Plus before build number 5817, DataSecurity Plus before ...",cpe:2.3:a:zohocorp:manageengine_o365_manager_plus:4.3:4304:*:*:*:*:*:*,0,0,"[[zoho], [manageengine exchange reporter plus, adselfservice plus, datasecurity plus, recovermanager plus, eventlog analyzer, adaudit plus, o365 manager plus, cloud security plus, admanager plus, ...","[[0.98405576], [0.9567048, 0.88903725, 0.9884461, 0.9938674, 0.98616344, 0.9817395, 0.962745, 0.9839506, 0.94795024, 0.59385043, 0.99802125], []]",[zoho],"[manageengine exchange reporter plus, adselfservice plus, datasecurity plus, recovermanager plus, eventlog analyzer, adaudit plus, o365 manager plus, cloud security plus, admanager plus, log360, j...",[],[0.98405576],"[0.9567048, 0.88903725, 0.9884461, 0.9938674, 0.98616344, 0.9817395, 0.962745, 0.9839506, 0.94795024, 0.59385043, 0.99802125]",[],[zoho],[0.98405576],[java servlet],[0.99802125],0,java_communications_services_delegated_administrator,sun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,CVE-2022-3550,169910,x.org,x_server,1.13.1,A vulnerability classified as critical was found in X.org Server. Affected by this vulnerability is the function _GetCountedString of the file xkb/xkb.c. The manipulation leads to buffer overflow....,cpe:2.3:a:x.org:x_server:1.13.1:*:*:*:*:*:*:*,1,0,"[[], [x.org server.], []]","[[], [0.9496948], []]",[],[x.org server.],[],[],[0.9496948],[],[],[],[x.org server.],[0.9496948],0,x.org-xserver,x
196,CVE-2020-11673,451620,total-soft,responsive_poll,1.2.2,"An issue was discovered in the Responsive Poll through 1.3.4 for Wordpress. It allows an unauthenticated user to manipulate polls, e.g., delete, clone, or view a hidden poll. This is due to the us...",cpe:2.3:a:total-soft:responsive_poll:1.2.2:*:*:*:*:wordpress:*:*,1,0,"[[], [responsive poll], [through 1.3.4]]","[[], [0.9997986], [0.9999593]]",[],[responsive poll],[through 1.3.4],[],[0.9997986],[0.9999593],[],[],[responsive poll],[0.9997986],1,responsive_poll,total-soft
197,CVE-2022-1253,80875,struktur,libde265,1.0.3,Heap-based Buffer Overflow in GitHub repository strukturag/libde265 prior to and including 1.0.8. The fix is established in commit 8e89fe0e175d2870c39486fdd09250b230ec10b8 but does not yet belong ...,cpe:2.3:a:struktur:libde265:1.0.3:*:*:*:*:*:*:*,1,1,"[[], [], [prior to and including 1.0.8.]]","[[], [], [0.99994123]]",[],[],[prior to and including 1.0.8.],[],[],[0.99994123],[],[],[],[],1,,
198,CVE-2019-20903,705053,atlassian,editor-core,98.2.2,The hyperlinks functionality in atlaskit/editor-core in before version 113.1.5 allows remote attackers to inject arbitrary HTML or JavaScript via a Cross-Site Scripting (XSS) vulnerability in link...,cpe:2.3:a:atlassian:editor-core:98.2.2:*:*:*:*:node.js:*:*,0,1,"[[], [], [before version 113.1.5]]","[[], [], [0.99994594]]",[],[],[before version 113.1.5],[],[],[0.99994594],[],[],[],[],0,,


In [125]:
df_test[df_test.version.str.len().isin([6,5])]

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor,dedup_vendor_score,dedup_product,dedup_product_score,true_version_in_predicted,matched_db_product,matched_db_vendor
0,CVE-2021-34085,628902,glensawyer,mp3gain,1.3.4,Read access violation in the III_dequantize_sample function in mpglibDBL/layer3.c in mp3gain through 1.5.2-r2 allows remote attackers to cause a denial of service (application crash) or possibly h...,cpe:2.3:a:glensawyer:mp3gain:1.3.4:beta:*:*:*:*:*:*,0,1,"[[], [mp3gain], [through 1.5.2-r2]]","[[], [0.99991965], [0.99996096]]",[],[mp3gain],[through 1.5.2-r2],[],[0.99991965],[0.99996096],[],[],[mp3gain],[0.99991965],1,mp3gain,glensawyer
6,CVE-2019-13183,689916,flarum,flarum,0.1.0,"Flarum before 0.1.0-beta.9 allows CSRF against all POST endpoints, as demonstrated by changing admin settings.",cpe:2.3:a:flarum:flarum:0.1.0:beta8.1:*:*:*:*:*:*,1,1,"[[], [flarum], [before 0.1.0-beta.9]]","[[], [0.9974808], [0.99994767]]",[],[flarum],[before 0.1.0-beta.9],[],[0.9974808],[0.99994767],[],[],[flarum],[0.9974808],1,flarum,flarum
8,CVE-2013-2175,549900,haproxy,haproxy,1.4.17,"HAProxy 1.4 before 1.4.24 and 1.5 before 1.5-dev19, when configured to use hdr_ip or other ""hdr_*"" functions with a negative occurrence count, allows remote attackers to cause a denial of service ...",cpe:2.3:a:haproxy:haproxy:1.4.17:*:*:*:*:*:*:*,1,1,"[[], [haproxy], [1.4 before 1.4.24, 1.5 before 1.5-dev19,]]","[[], [0.9999064], [0.9999666, 0.9999676]]",[],[haproxy],"[1.4 before 1.4.24, 1.5 before 1.5-dev19,]",[],[0.9999064],"[0.9999666, 0.9999676]",[],[],[haproxy],[0.9999064],0,haproxy,netgate
9,CVE-2016-10714,422757,zsh,zsh,4.2.2,"In zsh before 5.3, an off-by-one error resulted in undersized buffers that were intended to support PATH_MAX characters.",cpe:2.3:a:zsh:zsh:4.2.2:*:*:*:*:*:*:*,1,1,"[[], [zsh], [before 5.3,]]","[[], [0.9995414], [0.99995625]]",[],[zsh],"[before 5.3,]",[],[0.9995414],[0.99995625],[],[],[zsh],[0.9995414],1,zsh,zsh
10,CVE-2023-4393,375964,liquidfiles,liquidfiles,1.6.23,"HTML and SMTP injections on the registration page of LiquidFiles versions 3.7.13 and below, allow an attacker to perform more advanced phishing attacks against an organization.",cpe:2.3:a:liquidfiles:liquidfiles:1.6.23:*:*:*:*:*:*:*,1,1,"[[], [liquidfiles], [3.7.13 and below,]]","[[], [0.99991], [0.83767396]]",[],[liquidfiles],"[3.7.13 and below,]",[],[0.99991],[0.83767396],[],[],[liquidfiles],[0.99991],0,liquidfiles,liquidfiles
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,CVE-2022-3550,169910,x.org,x_server,1.13.1,A vulnerability classified as critical was found in X.org Server. Affected by this vulnerability is the function _GetCountedString of the file xkb/xkb.c. The manipulation leads to buffer overflow....,cpe:2.3:a:x.org:x_server:1.13.1:*:*:*:*:*:*:*,1,0,"[[], [x.org server.], []]","[[], [0.9496948], []]",[],[x.org server.],[],[],[0.9496948],[],[],[],[x.org server.],[0.9496948],0,x.org-xserver,x
196,CVE-2020-11673,451620,total-soft,responsive_poll,1.2.2,"An issue was discovered in the Responsive Poll through 1.3.4 for Wordpress. It allows an unauthenticated user to manipulate polls, e.g., delete, clone, or view a hidden poll. This is due to the us...",cpe:2.3:a:total-soft:responsive_poll:1.2.2:*:*:*:*:wordpress:*:*,1,0,"[[], [responsive poll], [through 1.3.4]]","[[], [0.9997986], [0.9999593]]",[],[responsive poll],[through 1.3.4],[],[0.9997986],[0.9999593],[],[],[responsive poll],[0.9997986],1,responsive_poll,total-soft
197,CVE-2022-1253,80875,struktur,libde265,1.0.3,Heap-based Buffer Overflow in GitHub repository strukturag/libde265 prior to and including 1.0.8. The fix is established in commit 8e89fe0e175d2870c39486fdd09250b230ec10b8 but does not yet belong ...,cpe:2.3:a:struktur:libde265:1.0.3:*:*:*:*:*:*:*,1,1,"[[], [], [prior to and including 1.0.8.]]","[[], [], [0.99994123]]",[],[],[prior to and including 1.0.8.],[],[],[0.99994123],[],[],[],[],1,,
198,CVE-2019-20903,705053,atlassian,editor-core,98.2.2,The hyperlinks functionality in atlaskit/editor-core in before version 113.1.5 allows remote attackers to inject arbitrary HTML or JavaScript via a Cross-Site Scripting (XSS) vulnerability in link...,cpe:2.3:a:atlassian:editor-core:98.2.2:*:*:*:*:node.js:*:*,0,1,"[[], [], [before version 113.1.5]]","[[], [], [0.99994594]]",[],[],[before version 113.1.5],[],[],[0.99994594],[],[],[],[],0,,


In [99]:
print(df_test['vendor_in_text'].sum())
df_test[(df_test['vendor_ner'].astype(str) != '[]') & 
        (df_test['vendor_in_text'] == 1)]

107


Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor,dedup_vendor_score,dedup_product,dedup_product_score,true_version_in_predicted,matched_db_product,matched_db_vendor
2,CVE-2018-7279,541558,alienvault,open_source_security_information_management,5.3,A remote code execution issue was discovered in AlienVault USM and OSSIM before 5.5.1.,cpe:2.3:a:alienvault:open_source_security_information_management:5.3:*:*:*:*:*:*:*,1,0,"[[alienvault], [usm], [before 5.5.1.]]","[[0.9999037], [0.9814162], [0.99995714]]",[alienvault],[usm],[before 5.5.1.],[0.9999037],[0.9814162],[0.99995714],[alienvault],[0.9999037],[usm],[0.9814162],0,usermin,usermin
5,CVE-2013-3607,553572,supermicro,x9dax-if,-,"Multiple stack-based buffer overflows in the web interface in the Intelligent Platform Management Interface (IPMI) implementation on Supermicro H8DC*, H8DG*, H8SCM-F, H8SGL-F, H8SM*, X7SP*, X8DT*,...",cpe:2.3:h:supermicro:x9dax-if:-:*:*:*:*:*:*:*,1,0,"[[supermicro], [], []]","[[0.9989392], [], []]",[supermicro],[],[],[0.9989392],[],[],[supermicro],[0.9989392],[],[],0,,
7,CVE-2018-15121,522169,auth0,aspnet,-,An issue was discovered in Auth0 auth0-aspnet and auth0-aspnet-owin. Affected packages do not use or validate the state parameter of the OAuth 2.0 and OpenID Connect protocols. This leaves applica...,cpe:2.3:a:auth0:aspnet:-:*:*:*:*:*:*:*,1,1,"[[auth0], [auth0-aspnet, oauth, openid connect], [2.0]]","[[0.99983096], [0.92818856, 0.9774277, 0.88269305], [0.9996136]]",[auth0],"[auth0-aspnet, oauth, openid connect]",[2.0],[0.99983096],"[0.92818856, 0.9774277, 0.88269305]",[0.9996136],[auth0],[0.99983096],[oauth],[0.9774277],0,oauth,atlassian
13,CVE-2014-4700,719860,citrix,xendesktop,4.0,"Citrix XenDesktop 7.x, 5.x, and 4.x, when pooled random desktop groups is enabled and ShutdownDesktopsAfterUse is disabled, allows local guest users to gain access to another user's desktop via un...",cpe:2.3:a:citrix:xendesktop:4.0:*:*:*:*:*:*:*,1,1,"[[citrix], [xendesktop], [7.x,, 5.x,, 4.x,]]","[[0.999882], [0.9998907], [0.9999678, 0.9999691, 0.9999697]]",[citrix],[xendesktop],"[7.x,, 5.x,, 4.x,]",[0.999882],[0.9998907],"[0.9999678, 0.9999691, 0.9999697]",[citrix],[0.999882],[xendesktop],[0.9998907],0,xendesktop,citrix
23,CVE-2023-6998,414164,coolkit,ewelink,4.13.1,Improper privilege management vulnerability in CoolKit Technology eWeLink on Android and iOS allows application lockscreen bypass.This issue affects eWeLink before 5.2.0.\n\n,cpe:2.3:a:coolkit:ewelink:4.13.1:*:*:*:*:android:*:*,1,1,"[[coolkit], [ewelink, ewelink], [before 5.2.0.]]","[[0.9544912], [0.9998672, 0.9998958], [0.9999499]]",[coolkit],"[ewelink, ewelink]",[before 5.2.0.],[0.9544912],"[0.9998672, 0.9998958]",[0.9999499],[coolkit],[0.9544912],[ewelink],[0.9998958],1,ewelink,coolkit
33,CVE-2016-1409,426211,cisco,ios,12.1\(1\)xd2,"The Neighbor Discovery (ND) protocol implementation in the IPv6 stack in Cisco IOS XE 2.1 through 3.17S, IOS XR 2.0.0 through 5.3.2, and NX-OS allows remote attackers to cause a denial of service ...",cpe:2.3:o:cisco:ios:12.1\(1\)xd2:*:*:*:*:*:*:*,1,1,"[[cisco], [], [2.1 through 3.17s,, 2.0.0 through 5.3.2,]]","[[0.9998908], [], [0.9999612, 0.9999633]]",[cisco],[],"[2.1 through 3.17s,, 2.0.0 through 5.3.2,]",[0.9998908],[],"[0.9999612, 0.9999633]",[cisco],[0.9998908],[],[],0,,
40,CVE-2014-1740,715763,google,chrome,34.0.1847.78,Multiple use-after-free vulnerabilities in net/websockets/websocket_job.cc in the WebSockets implementation in Google Chrome before 34.0.1847.137 allow remote attackers to cause a denial of servic...,cpe:2.3:a:google:chrome:34.0.1847.78:*:*:*:*:*:*:*,1,1,"[[google], [chrome], [before 34.0.1847.137]]","[[0.9998888], [0.9999132], [0.99996156]]",[google],[chrome],[before 34.0.1847.137],[0.9998888],[0.9999132],[0.99996156],[google],[0.9998888],[chrome],[0.9999132],0,chrome,google
53,CVE-2020-10591,448076,walmart,concord,0.72.0,"An issue was discovered in Walmart Labs Concord before 1.44.0. CORS Access-Control-Allow-Origin headers have a potentially unsafe dependency on Origin headers, and are not configurable. This allow...",cpe:2.3:a:walmart:concord:0.72.0:*:*:*:*:*:*:*,1,1,"[[walmart], [], [before 1.44.0.]]","[[0.8405717], [], [0.9999189]]",[walmart],[],[before 1.44.0.],[0.8405717],[],[0.9999189],[walmart],[0.8405717],[],[],1,,
58,CVE-2022-4876,219422,kaltura,mwembed,2.41,A vulnerability was found in Kaltura mwEmbed up to 2.96.rc1 and classified as problematic. This issue affects some unknown processing of the file includes/DefaultSettings.php. The manipulation of ...,cpe:2.3:a:kaltura:mwembed:2.41:rc2:*:*:*:*:*:*,1,1,"[[kaltura], [mwembed], [up to 2.96.rc1, to, upgrading to version 2.96.rc2, to]]","[[0.97595686], [0.9998877], [0.9999277, 0.5686604, 0.9999056, 0.6301625]]",[kaltura],[mwembed],"[up to 2.96.rc1, to, upgrading to version 2.96.rc2, to]",[0.97595686],[0.9998877],"[0.9999277, 0.5686604, 0.9999056, 0.6301625]",[kaltura],[0.97595686],[mwembed],[0.9998877],1,mwembed,kaltura
65,CVE-2023-49183,393386,nextscripts,social_networks_auto_poster,4.3.14,Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting') vulnerability in NextScripts NextScripts: Social Networks Auto-Poster allows Reflected XSS.This issue affects N...,cpe:2.3:a:nextscripts:social_networks_auto_poster:4.3.14:*:*:*:*:wordpress:*:*,1,0,"[[nextscripts, nextscripts:], [], [through 4.4.2.]]","[[0.9993755, 0.7670635], [], [0.9999251]]","[nextscripts, nextscripts:]",[],[through 4.4.2.],"[0.9993755, 0.7670635]",[],[0.9999251],[nextscripts],[0.9993755],[],[],1,,


In [100]:
print(df_test['product_in_text'].sum())
df_test[(df_test['product_ner'].astype(str) != '[]') & 
        (df_test['product_in_text'] == 1)]

114


Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,vendor_in_text,product_in_text,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor,dedup_vendor_score,dedup_product,dedup_product_score,true_version_in_predicted,matched_db_product,matched_db_vendor
0,CVE-2021-34085,628902,glensawyer,mp3gain,1.3.4,Read access violation in the III_dequantize_sample function in mpglibDBL/layer3.c in mp3gain through 1.5.2-r2 allows remote attackers to cause a denial of service (application crash) or possibly h...,cpe:2.3:a:glensawyer:mp3gain:1.3.4:beta:*:*:*:*:*:*,0,1,"[[], [mp3gain], [through 1.5.2-r2]]","[[], [0.99991965], [0.99996096]]",[],[mp3gain],[through 1.5.2-r2],[],[0.99991965],[0.99996096],[],[],[mp3gain],[0.99991965],1,mp3gain,glensawyer
6,CVE-2019-13183,689916,flarum,flarum,0.1.0,"Flarum before 0.1.0-beta.9 allows CSRF against all POST endpoints, as demonstrated by changing admin settings.",cpe:2.3:a:flarum:flarum:0.1.0:beta8.1:*:*:*:*:*:*,1,1,"[[], [flarum], [before 0.1.0-beta.9]]","[[], [0.9974808], [0.99994767]]",[],[flarum],[before 0.1.0-beta.9],[],[0.9974808],[0.99994767],[],[],[flarum],[0.9974808],1,flarum,flarum
7,CVE-2018-15121,522169,auth0,aspnet,-,An issue was discovered in Auth0 auth0-aspnet and auth0-aspnet-owin. Affected packages do not use or validate the state parameter of the OAuth 2.0 and OpenID Connect protocols. This leaves applica...,cpe:2.3:a:auth0:aspnet:-:*:*:*:*:*:*:*,1,1,"[[auth0], [auth0-aspnet, oauth, openid connect], [2.0]]","[[0.99983096], [0.92818856, 0.9774277, 0.88269305], [0.9996136]]",[auth0],"[auth0-aspnet, oauth, openid connect]",[2.0],[0.99983096],"[0.92818856, 0.9774277, 0.88269305]",[0.9996136],[auth0],[0.99983096],[oauth],[0.9774277],0,oauth,atlassian
8,CVE-2013-2175,549900,haproxy,haproxy,1.4.17,"HAProxy 1.4 before 1.4.24 and 1.5 before 1.5-dev19, when configured to use hdr_ip or other ""hdr_*"" functions with a negative occurrence count, allows remote attackers to cause a denial of service ...",cpe:2.3:a:haproxy:haproxy:1.4.17:*:*:*:*:*:*:*,1,1,"[[], [haproxy], [1.4 before 1.4.24, 1.5 before 1.5-dev19,]]","[[], [0.9999064], [0.9999666, 0.9999676]]",[],[haproxy],"[1.4 before 1.4.24, 1.5 before 1.5-dev19,]",[],[0.9999064],"[0.9999666, 0.9999676]",[],[],[haproxy],[0.9999064],0,haproxy,netgate
9,CVE-2016-10714,422757,zsh,zsh,4.2.2,"In zsh before 5.3, an off-by-one error resulted in undersized buffers that were intended to support PATH_MAX characters.",cpe:2.3:a:zsh:zsh:4.2.2:*:*:*:*:*:*:*,1,1,"[[], [zsh], [before 5.3,]]","[[], [0.9995414], [0.99995625]]",[],[zsh],"[before 5.3,]",[],[0.9995414],[0.99995625],[],[],[zsh],[0.9995414],1,zsh,zsh
10,CVE-2023-4393,375964,liquidfiles,liquidfiles,1.6.23,"HTML and SMTP injections on the registration page of LiquidFiles versions 3.7.13 and below, allow an attacker to perform more advanced phishing attacks against an organization.",cpe:2.3:a:liquidfiles:liquidfiles:1.6.23:*:*:*:*:*:*:*,1,1,"[[], [liquidfiles], [3.7.13 and below,]]","[[], [0.99991], [0.83767396]]",[],[liquidfiles],"[3.7.13 and below,]",[],[0.99991],[0.83767396],[],[],[liquidfiles],[0.99991],0,liquidfiles,liquidfiles
12,CVE-2022-3768,178195,wpsmartcontracts,wpsmartcontracts,1.2.2,"The WPSmartContracts WordPress plugin before 1.3.12 does not properly sanitise and escape a parameter before using it in a SQL statement, leading to a SQL injection exploitable by users with a rol...",cpe:2.3:a:wpsmartcontracts:wpsmartcontracts:1.2.2:*:*:*:*:wordpress:*:*,1,1,"[[], [wpsmartcontracts, wordpress], [before 1.3.12]]","[[], [0.97421944, 0.99314076], [0.9999514]]",[],"[wpsmartcontracts, wordpress]",[before 1.3.12],[],"[0.97421944, 0.99314076]",[0.9999514],[],[],[wordpress],[0.99314076],1,wordpress,wordpress
13,CVE-2014-4700,719860,citrix,xendesktop,4.0,"Citrix XenDesktop 7.x, 5.x, and 4.x, when pooled random desktop groups is enabled and ShutdownDesktopsAfterUse is disabled, allows local guest users to gain access to another user's desktop via un...",cpe:2.3:a:citrix:xendesktop:4.0:*:*:*:*:*:*:*,1,1,"[[citrix], [xendesktop], [7.x,, 5.x,, 4.x,]]","[[0.999882], [0.9998907], [0.9999678, 0.9999691, 0.9999697]]",[citrix],[xendesktop],"[7.x,, 5.x,, 4.x,]",[0.999882],[0.9998907],"[0.9999678, 0.9999691, 0.9999697]",[citrix],[0.999882],[xendesktop],[0.9998907],0,xendesktop,citrix
14,CVE-2015-8076,582232,cyrus,imap,2.5.2,"The index_urlfetch function in index.c in Cyrus IMAP 2.3.x before 2.3.19, 2.4.x before 2.4.18, 2.5.x before 2.5.4 allows remote attackers to obtain sensitive information or possibly have unspecifi...",cpe:2.3:a:cyrus:imap:2.5.2:*:*:*:*:*:*:*,1,1,"[[], [cyrus imap], [2.3.x before 2.3.19,, 2.4.x before 2.4.18,, 2.5.x before 2.5.4]]","[[], [0.9998886], [0.9999668, 0.99996823, 0.9999669]]",[],[cyrus imap],"[2.3.x before 2.3.19,, 2.4.x before 2.4.18,, 2.5.x before 2.5.4]",[],[0.9998886],"[0.9999668, 0.99996823, 0.9999669]",[],[],[cyrus imap],[0.9998886],1,cyrus_imap,cyrusimap
15,CVE-2019-14862,692470,knockoutjs,knockout,1.2.1,"There is a vulnerability in knockout before version 3.5.0-beta, where after escaping the context of the web application, the web application delivers data to its users along with other trusted dyn...",cpe:2.3:a:knockoutjs:knockout:1.2.1:*:*:*:*:*:*:*,0,1,"[[], [knockout], [before version 3.5.0-beta,]]","[[], [0.9442648], [0.9837834]]",[],[knockout],"[before version 3.5.0-beta,]",[],[0.9442648],[0.9837834],[],[],[knockout],[0.9442648],1,knockout,knockoutjs


In [19]:
def extract_version(matched):
        if matched:
            version = matched.group('version')
            # Normalize separators (replace '-' with '.' if needed)
            version = version.replace('-', '.')
            return version
        return None

def classify_version_string(version_str):
    """
    Generate all versions for expressions containing 'before'.
    Handles two cases:
    1) "before X.Y.Z" - generates all versions up to X.Y.Z
    2) "A.B.x before A.B.C" - generates all patch versions A.B.0 to A.B.(C-1)
    """
    version_str = str.lower(version_str)



    # through, including
    group_name = 'through'
    group_words = ['through', 'earlier', '<=', 'prior', 'up to', 'up to, and including', 'up to and including', 'older']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '2.1 through 3.17'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:through|earlier|prior|\<\=|up to)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            pattern = (
                r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
            )
            matched = re.search(pattern, version_str, re.IGNORECASE)
            return [extract_version(matched)], f'{group_name} group'

    # before, not including
    group_name = 'before'
    group_words = ['before', '<']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '4.2.x before 4.2.8'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:before)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            # if simple logic
            pattern = (
                r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
            )
            matched = re.search(pattern, version_str, re.IGNORECASE)
            return [extract_version(matched)], f'{group_name} group'

    # after, including
    group_name = 'after'
    group_words = ['after', '>=']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '4.2.x before 4.2.8'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:older|after|\>\=)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            # if simple logic
            pattern = (
                r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
            )
            matched = re.search(pattern, version_str, re.IGNORECASE)
            return [extract_version(matched)], f'{group_name} group'

    # between
    group_name = 'between'
    group_words = ['between', 'to', ' - ']
    for group_wrd in group_words:
        if group_wrd in version_str:
            # More complicated multi version logic
            # '4.2.x before 4.2.8'
            multi_version_pattern = r'(?P<version1>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)\s*' \
                        r'(?:between|to)\s*' \
                        r'(?P<version2>[\dxX]+(?:\s*[.-]\s*[\dxX]+)*)'
            multi_match = re.search(multi_version_pattern, version_str, re.IGNORECASE)
            if multi_match:
                version1 = multi_match.group('version1')
                version2 = multi_match.group('version2')
                return [version1, version2], f'{group_name} multi-match'

            # # if simple logic
            # pattern = (
            #     r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)\s*)'  # Version with digits/x and separators
            # )
            # matched = re.search(pattern, version_str, re.IGNORECASE)
            # return extract_version(matched), f'{group_name} group'



    pattern = (
        r'(?:v|version)?\s*'  # Optional 'v' or 'version'
        r'(?P<version>[\dxX]+(?:[.-]\s*[\dxX]+)*)'  # Version with digits/x and separators
    )
    matched = re.search(pattern, version_str, re.IGNORECASE)
    return [extract_version(matched)], 'other'

In [20]:
def parse_version(version_str):
    components = re.findall(r'\d+|x', version_str, re.IGNORECASE)
    parsed = []
    for c in components:
        if c.lower() == 'x':
            parsed.append('x')
        else:
            parsed.append(int(c))
    return parsed

In [129]:
generate_versions('2.5.1.5', 'before')

[]

In [21]:
def generate_versions(versions, group_name, debug=False):
    if group_name == 'other':
        result = [versions[0]]
        version_other = parse_version(versions[0])
        while len(version_other) != 3:
            if len(version_other) > 3:
                result.append('.'.join([str(x) for x in version_other]))
                version_other.pop()
            elif len(version_other) < 3:
                result.append('.'.join([str(x) for x in version_other]))
                version_other.append(0)
        else:
            result.append('.'.join([str(x) for x in version_other]))
        # print(f'result: {result}')
        # print(f'other versions: {other_versions}')
        # result_merged = result + other_versions
        # print(f'joined: {result_merged}')
        return result
        # return versions
    group_type = group_name.split()[0].lower()

    if len(versions) == 1:
        if group_type == 'before':
            return generate_versions(['0.0.0', versions[0]], 'before multi-match', debug=debug)
        elif group_type == 'through':
            return generate_versions(['0.0.0', versions[0]], 'through multi-match', debug=debug)
        # here access DB and query max version?
        elif group_type == 'after':
            return generate_versions([versions[0], '20.0.0'], 'after multi-match', debug=debug)
        else:
            return []
    elif len(versions) >= 1:
        # for ['3.x', '3.1.1']
        # 3.x
        start = parse_version(versions[0])
        len_original_start = len(start)
        # 3.1.1
        end = parse_version(versions[1])
        len_original_end = len(end)
        # normalize versions
        while len(start) != 3:
            if len(start) > 3:
                start.pop()
            elif len(start) < 3:
                start.append(0)

        while len(end) != 3:
            if len(end) > 3:
                end.pop()
            elif len(end) < 3:
                end.append(0)

        possible_values = []
        if debug:
            print(f'start version: {start}, end version: {end}')
            print(f'len_original_end: {len_original_end}')
        for i in range(3):
            # print(f'possible values: {possible_values}')
            # 3
            start_comp = start[i]
            # 3
            end_comp = end[i]
            if debug:
                print(f'Start component: {start_comp}, End component: {end_comp}')

            if start_comp == 'x':
                # Надо как-то проверять, нужно ли генерировать такик большие числа версий
                if 'before' in group_type:
                    max_val = end_comp - 1 if isinstance(end_comp, int) else 99
                else:
                    max_val = end_comp if isinstance(end_comp, int) else 99
                possible_values.append(list(range(0, max_val + 1)))

                continue
            if isinstance(start_comp, int):
                if isinstance(end_comp, str) and end_comp.lower() == 'x':
                    end_comp = 99  # High maximum for 'x' in end
                if start_comp > end_comp:
                    return []
                if start_comp < end_comp:
                    if 'before' in group_type:
                        current_max = end_comp - 1
                    else:
                        current_max = end_comp
                    # possible_values.append(list(range(start_comp, current_max + 1)))
                    possible_values.append(list(range(start_comp, 10)))

                    # Allow any values for remaining components
                    for j in range(i + 1, 3):
                        possible_values.append(list(range(0, 100)))  # Arbitrary high limit
                    break
                else:
                    possible_values.append([start_comp])
            else:
                # print(f'possible values: {possible_values}')
                possible_values.append([0])


        if debug:
            print(f'possible values: {possible_values}')
        if 'x' not in end and 'x' not in start:
            generated_components = list(product(*possible_values))
            if debug:
                print(f'generated components: {generated_components[:10]}')
            generated_components_to_use = []
            for val in generated_components:
                if not (((val[0] == end[0]
                        and val[1] > end[1]) or
                        (val[0] == end[0]
                        and val[1] == end[1]
                        and val[2] > end[2]) or
                        val[0] > end[0])
                    or ((val[0] == start[0]
                         and val[1] < start[1]) or
                        (val[0] == start[0]
                         and val[1] == start[1]
                         and val[2] < start[2])) or
                        val[0] < start[0]):
                    generated_components_to_use.append(val)
            if debug:
                print(f'generated components to use: {generated_components_to_use[:10], generated_components_to_use[-10:]}')
            versions_list = ['.'.join(map(str, v)) for v in generated_components_to_use]

            # return versions_list
        else:
            generated_components = list(product(*possible_values))
            versions_list = ['.'.join(map(str, v)) for v in generated_components]
        if len_original_end == 2 or len_original_start == 2:
            versions_set = []
            for x in versions_list:
                versions_set.append(x.split('.')[:2])
            versions_set = set(['.'.join(y) for y in versions_set])
            # for x in versions_list:
            versions_list.extend(list(versions_set))
        if debug:
            print(versions_list[:10])
        return versions_list
    else:
        # print('last else')
        return []

In [22]:
r = []
d_vers = {}
cve_to_vers = {}
d = {}
for i, row in df_test.iterrows():
    possible_versions = []
    # print(i, row['version_ner'])
    for version_ner in row['version_ner']:
        # print(version_ner)
        preprocessed_ner = classify_version_string(version_ner)
        if preprocessed_ner[0][0] is None:
            continue
        # print(preprocessed_ner, end='\n'+'*'*50+'\n')
        generated_versions = generate_versions(*preprocessed_ner)
        possible_versions.extend(generated_versions)
    r.append(1 if row['version'] in possible_versions else 0)
    # d_vers[(row['version'], row['cve_id'])] = 
    cve_to_vers[row['cve_id']] = row['version']
    d[row['cve_id']] = possible_versions

In [133]:
# for i, row in df_test.iterrows():
#     possible_versions = []
#     # print(i, row['version_ner'])
#     for version_ner in row['version_ner']:
#         # print(version_ner)
#         preprocessed_ner = classify_version_string(version_ner)
#         print(row['version_ner'], preprocessed_ner)

In [46]:
df_test['true_version_in_predicted'] = r

In [106]:
sum(r)

113

In [24]:
count_vendor = 0
count_product = 0
for i, row in df_test.iterrows():

    if row['dedup_vendor']:
        if (row['vendor'] == row['dedup_vendor'][0]
            or row['vendor'] == '_'.join(row['dedup_vendor'][0].split())):
            count_vendor += 1
    if row['dedup_product']:
        if (row['product'] == row['dedup_product'][0]
            or row['product'] == '_'.join(row['dedup_product'][0].split())):
            count_product += 1    
        else:
            print(row['product'], row['dedup_product'])

teamspeak3 ['teamspeak']
open_source_security_information_management ['usm']
manageengine_applications_manager ['applications manager']
manageengine_o365_manager_plus ['java servlet']
aspnet ['oauth']
tagmin_control_center ['tagboard']
wpsmartcontracts ['wordpress']
imap ['cyrus imap']
filezilla_client ['paramiko']
video_conferencing_with_zoom ['zoom wordpress']
contact_form_\&_lead_form_elementor_builder ['wordpress']
yukassa_for_woocommerce ['woocommerce']
awsui\/components-react ['react']
opensaml ['opensaml-java']
monitoring_software ['serverscheck monitoring software']
koko ['jumpserver']
ssh_\&_web_terminal ['addon-ssh']
firefly_iii ['firefly']
personal_video_collection_script ['mall personal video collection script']
formidable_form_builder ['formidable forms wordpress']
owasp_antisamy_.net ['antisamy']
cling ['4thline cling']
android ['stst']
java_system_communications_express ['personal address book']
mappress ['wordpress']
updraftplus ['wordpress backup']
dashboard.js ['stimu

Это те, что сразу совпали. При этом часть названий продуктов, похожи, но не точно. Поэтому нужно использовать поиск по строкам. 

In [25]:
print(count_vendor, count_product)

42 93


In [27]:
product_ner = df_test['dedup_product'].astype(str).apply(lambda x: x.lstrip('[\'').rstrip(']\'')).values
vendor_ner = df_test['dedup_vendor'].astype(str).apply(lambda x: x.lstrip('[\'').rstrip(']\'')).values

In [114]:
sum([1 if v in product_ner else 0 for v in df_test['product'].tolist() ])

73

In [115]:
sum([1 if v in vendor_ner else 0 for v in df_test['vendor'].tolist() ])

46

In [6]:
df_all = get_df_from_bd('select * from cpes limit 10000000;')

In [None]:
unique_products = df_all['product'].unique()
unique_vendors = df_all['vendor'].unique()

In [58]:
get_lcs('job-manager', unique_products)

('job_management_partner_1\\/it_desktop_management-manager', 11)

In [78]:
get_ratio('biteship', unique_products)

('biteship', 1)

In [92]:
def get_ratio(ner_name, unique_entities):
    ratio_scores = []
    if ner_name in unique_entities:
        return ner_name, 1
    for ent in unique_entities:
        ratio_scores.append(ratio(ner_name, ent))
    if len(np.argwhere(ratio_scores == np.max(ratio_scores))) <= 1:
        return unique_entities[np.argmax(ratio_scores)], np.max(ratio_scores)
    else:
        candidates = unique_entities[np.argwhere(ratio_scores == np.max(ratio_scores))]
        len_of_query = len(ner_name)
        d = -1
        fit_cand = ''
        for cand in candidates:
            cand = cand[0]
            diff = abs(len_of_query - len(cand))
            # print(cand, d, diff)
            if d == -1:
                d = diff
                fit_cand = cand
            elif d > diff:
                d = diff
                fit_cand = cand
            else:
                continue
        return fit_cand, np.max(ratio_scores)

In [93]:
def get_lcs(ner_name, unique_entities):
    if ner_name in unique_entities:
        return ner_name, 1
    lcs_scores = np.array(pylcs.lcs_of_list(ner_name, unique_entities))
    if len(np.argwhere(lcs_scores == np.max(lcs_scores))) <= 1:
        return unique_entities[np.argmax(lcs_scores)], np.max(lcs_scores)
    else:
        candidates = unique_entities[np.argwhere(lcs_scores == np.max(lcs_scores))]
        len_of_query = len(ner_name)
        d = -1
        fit_cand = ''
        for cand in candidates:
            cand = cand[0]
            diff = abs(len_of_query - len(cand))
            print(cand, d, diff)
            if d == -1:
                d = diff
                fit_cand = cand
            elif d > diff:
                d = diff
                fit_cand = cand
            else:
                continue
        return fit_cand, np.max(lcs_scores)


In [94]:
(prod, score)= get_lcs('serverscheck monitoring software', unique_products)
print(prod, score)
df_all = get_df_from_bd(f"select * from cpes where product = '{prod}' limit 1;")
df_all

telepresence_video_communication_server_software -1 16
zfs_storage_application_integration_engineering_software 16 24
telepresence_video_communication_servers_software 16 17
telepresence_video_communication_server_software 21


Unnamed: 0,cpe_id_pk,cpe_version,part,vendor,product,version,update,edition,sw_edition,target_sw,target_hw,language,other,initial_cpe
0,571804,2.3,a,cisco,telepresence_video_communication_server_software,x8.5,rc4,,,,,,,cpe:2.3:a:cisco:telepresence_video_communication_server_software:x8.5:rc4:*:*:*:*:*:*


In [64]:
matched_db_product = []
matched_db_vendor = []
for pr in product_ner:
    if pr:
        print(f'Product NER: {pr}')
        ########################################################
        # (prod, score)= get_lcs_nonnorm(pr, unique_products)
        # print('Old')
        # print(f'Found product in DB: {prod}')
        # print(f'score: {score}')
        # print('Normed')
        ########################################################
        (prod, score)= get_lcs(pr, unique_products)
        print(f'Found product in DB: {prod}')
        print(f'score: {score}', end='\n\n')
        df_all = get_df_from_bd(f"select * from cpes where product = '{prod}' limit 1;")
        matched_db_vendor.append(df_all['vendor'].values[0])
        matched_db_product.append(df_all['product'].values[0])
    else:
        matched_db_product.append('')
        matched_db_vendor.append('')

Product NER: mp3gain
Found product in DB: mp3gain
score: 1

Product NER: teamspeak
Found product in DB: teamspeak
score: 1

Product NER: usm
Found product in DB: usermin
score: 3

Product NER: applications manager
Found product in DB: applications_manager
score: 19

Product NER: java servlet
Found product in DB: java_communications_services_delegated_administrator
score: 11

Product NER: flarum
Found product in DB: flarum
score: 1

Product NER: oauth
Found product in DB: oauth
score: 1

Product NER: haproxy
Found product in DB: haproxy
score: 1

Product NER: zsh
Found product in DB: zsh
score: 1

Product NER: liquidfiles
Found product in DB: liquidfiles
score: 1

Product NER: tagboard
Found product in DB: tagboard
score: 1

Product NER: wordpress
Found product in DB: wordpress
score: 1

Product NER: xendesktop
Found product in DB: xendesktop
score: 1

Product NER: cyrus imap
Found product in DB: cyrus_imap
score: 9

Product NER: knockout
Found product in DB: knockout
score: 1

Product 

In [80]:
product_ner

array(['mp3gain', 'teamspeak', 'usm', 'applications manager',
       'java servlet', '', 'flarum', 'oauth', 'haproxy', 'zsh',
       'liquidfiles', 'tagboard', 'wordpress', 'xendesktop', 'cyrus imap',
       'knockout', '', 'paramiko', 'wpdatatables', 'wp email capture',
       'pycryptodome', 'mantis', 'zoom wordpress', 'ewelink',
       'display-widgets', 'xpdf', 'wordpress', 'woocommerce', 'geocoder',
       'terminal-kit', '', 'react', 'fast-xml-parser', '', '',
       'opensaml-java', '', 'serverscheck monitoring software', 'xymon',
       'quiz maker', 'chrome', 'jumpserver', 'addon-ssh', '', 'firefly',
       'phpok', 'mall personal video collection script', 'madwifi',
       'formidable forms wordpress', 'openreplay', 'browserify-sign',
       'antisamy', '', '', 'bold page builder', '4thline cling',
       'claroline', 'ntp', 'mwembed', 'getgo download manager', 'postie',
       '', 'bolt', 'stst', '', '', '', '', '', '', '',
       'personal address book', 'duckduckgo', 'word

In [None]:
matched_db_product_lev = []
matched_db_vendor_lev = []
score_lev = []
for pr in tqdm(product_ner):
    if pr:
        print(f'Product NER: {pr}')
        ########################################################
        # (prod, score)= get_lcs_nonnorm(pr, unique_products)
        # print('Old')
        # print(f'Found product in DB: {prod}')
        # print(f'score: {score}')
        # print('Normed')
        ########################################################
        (prod, score)= get_ratio(pr, unique_products)
        print(f'Found product in DB: {prod}')
        print(f'score: {score}', end='\n\n')
        df_all = get_df_from_bd(f"select * from cpes where product = '{prod}' limit 1;")
        matched_db_product_lev.append(df_all['product'].values[0])
        matched_db_vendor_lev.append(df_all['vendor'].values[0])
        score_lev.append(score)
        
    else:
        matched_db_product_lev.append('')
        matched_db_vendor_lev.append('')
        score_lev.append(0)

In [None]:
df_test[['cve_id', 'product', 'matched_db_product', 'version']].replace(columns={'product': ''})

Unnamed: 0,cve_id,product,matched_db_product,version
0,CVE-2023-0878,nuxt,cosminexus_portal_framework,1.0.0
1,CVE-2008-0444,elog,electronic_logbook,2.2.0
2,CVE-2020-14518,dreammapper,dreammapper,2.17.1
3,CVE-2022-27858,activity_log,the_university_of_cambridge_web_authentication_system_apache_authentication_agent,2.2.8
4,CVE-2022-4725,aws_software_development_kit,drawings_sdk,2.2.4
5,CVE-2021-32691,data-connector-rock,"woocommerce_pdf_invoices\,_packing_slips\,_delivery_notes_and_shipping_labels",1.2.5
6,CVE-2016-4572,cdh,cloudera_cdh,5.5.1
7,CVE-2023-31441,advisor_network,communications_unified_inventory_management,2.2.0
8,CVE-2023-42261,mobile_security_framework,mobile_security_framework,0.9.4.1
9,CVE-2022-36532,bolt_cms,bolt_cms,3.0.2


In [75]:
df_test['matched_db_product'] = matched_db_product
df_test['matched_db_vendor'] = matched_db_vendor
df_test['matched_db_product_lev'] = matched_db_product_lev
df_test['matched_db_vendor_lev'] = matched_db_vendor_lev
df_test['score_lev'] = score_lev

In [None]:
fin = df_test[(df_test['matched_db_product'] == df_test['product']) & 
        (df_test['matched_db_vendor'] == df_test['vendor']) & 
        (df_test['true_version_in_predicted'] == 1)]
fin

Unnamed: 0,cve_id,cpe_id_pk,vendor,product,version,descr,initial_cpe,ners_list,scores_list,vendor_ner,product_ner,version_ner,vendor_score_ner,product_score_ner,version_score_ner,dedup_vendor,dedup_vendor_score,dedup_product,dedup_product_score,matched_db_product,matched_db_vendor,true_version_in_predicted
7,CVE-2023-31441,341885,ncia,advisor_network,2.2.0,"In NATO Communications and Information Agency anet (aka Advisor Network) through 3.3.0, an attacker can provide a crafted JSON file to sanitizeJson and cause an exception. This is related to the U...",cpe:2.3:a:ncia:advisor_network:2.2.0:*:*:*:*:*:*:*,"[[nato], [advisor network)], [through 3.3.0,]]","[[0.98350936], [0.7389296], [0.999821]]",[nato],[advisor network)],"[through 3.3.0,]",[0.98350936],[0.7389296],[0.999821],[nato],[0.98350936],[advisor network)],[0.7389296],advisor_network,ncia,1
12,CVE-2023-35094,353966,mpembed,wp_matterport_shortcode,1.7.1,Auth. (contributor+) Stored Cross-Site Scripting (XSS) vulnerability in Julien Berthelot / MPEmbed WP Matterport Shortcode plugin <= 2.1.4 versions.,cpe:2.3:a:mpembed:wp_matterport_shortcode:1.7.1:*:*:*:*:wordpress:*:*,"[[], [wp matterport shortcode plugin], [<= 2.1.4]]","[[], [0.8236863], [0.82940006]]",[],[wp matterport shortcode plugin],[<= 2.1.4],[],[0.8236863],[0.82940006],[],[],[wp matterport shortcode plugin],[0.8236863],wp_matterport_shortcode,mpembed,1
26,CVE-2015-8360,582917,atlassian,bamboo,2.5.5,An unspecified resource in Atlassian Bamboo before 5.9.9 and 5.10.x before 5.10.0 allows remote attackers to execute arbitrary Java code via serialized data to the JMS port.,cpe:2.3:a:atlassian:bamboo:2.5.5:*:*:*:*:*:*:*,"[[atlassian], [bamboo], [before 5.9.9, 5.10.x before 5.10.0]]","[[0.9997923], [0.99958223], [0.99993604, 0.9999495]]",[atlassian],[bamboo],"[before 5.9.9, 5.10.x before 5.10.0]",[0.9997923],[0.99958223],"[0.99993604, 0.9999495]",[atlassian],[0.9997923],[bamboo],[0.99958223],bamboo,atlassian,1
30,CVE-2021-43009,646632,opservices,opmon,9.9,A Cross Site Scripting (XSS) vulnerability exists in OpServices OpMon through 9.11 via the search parameter in the request URL.,cpe:2.3:a:opservices:opmon:9.9:*:*:*:*:*:*:*,"[[], [opmon], [through 9.11]]","[[], [0.9972307], [0.9998847]]",[],[opmon],[through 9.11],[],[0.9972307],[0.9998847],[],[],[opmon],[0.9972307],opmon,opservices,1
34,CVE-2022-24124,114598,casbin,casdoor,1.7.2,"The query API in Casdoor before 1.13.1 has a SQL injection vulnerability related to the field and value parameters, as demonstrated by api/get-organizations.",cpe:2.3:a:casbin:casdoor:1.7.2:*:*:*:*:*:*:*,"[[], [casdoor], [before 1.13.1]]","[[], [0.9969404], [0.9999317]]",[],[casdoor],[before 1.13.1],[],[0.9969404],[0.9999317],[],[],[casdoor],[0.9969404],casdoor,casbin,1
35,CVE-2023-1554,298377,fullworksplugins,quick_paypal_payments,5.7.4,"The Quick Paypal Payments WordPress plugin before 5.7.26.4 does not sanitise and escape some of its settings, which could allow high privilege users such as admin to perform Stored Cross-Site Scri...",cpe:2.3:a:fullworksplugins:quick_paypal_payments:5.7.4:*:*:*:*:wordpress:*:*,"[[], [quick paypal payments, wordpress], [before 5.7.26.4]]","[[], [0.9503791, 0.8185743], [0.999895]]",[],"[quick paypal payments, wordpress]",[before 5.7.26.4],[],"[0.9503791, 0.8185743]",[0.999895],[],[],[quick paypal payments],[0.9503791],quick_paypal_payments,fullworksplugins,1
37,CVE-2016-9132,443587,botan_project,botan,1.9.13,"In Botan 1.8.0 through 1.11.33, when decoding BER data an integer overflow could occur, which would cause an incorrect length field to be computed. Some API callers may use the returned (incorrect...",cpe:2.3:a:botan_project:botan:1.9.13:*:*:*:*:*:*:*,"[[], [botan], [1.8.0 through 1.11.33,]]","[[], [0.9931213], [0.9999437]]",[],[botan],"[1.8.0 through 1.11.33,]",[],[0.9931213],[0.9999437],[],[],[botan],[0.9931213],botan,botan_project,1
48,CVE-2023-24998,315104,apache,commons_fileupload,1.2.2,Apache Commons FileUpload before 1.5 does not limit the number of request parts to be processed resulting in the possibility of an attacker triggering a DoS with a malicious upload or series of up...,cpe:2.3:a:apache:commons_fileupload:1.2.2:*:*:*:*:*:*:*,"[[apache], [commons fileupload], [before 1.5]]","[[0.9957885], [0.659995], [0.9999245]]",[apache],[commons fileupload],[before 1.5],[0.9957885],[0.659995],[0.9999245],[apache],[0.9957885],[commons fileupload],[0.659995],commons_fileupload,apache,1
50,CVE-2022-24071,114353,navercorp,whale,1.9.1,A Built-in extension in Whale browser before 3.12.129.46 allows attackers to compromise the rendering process which could lead to controlling browser internal APIs.,cpe:2.3:a:navercorp:whale:1.9.1:*:*:*:*:iphone_os:*:*,"[[], [whale], [before 3.12.129.46]]","[[], [0.6959183], [0.9998584]]",[],[whale],[before 3.12.129.46],[],[0.6959183],[0.9998584],[],[],[whale],[0.6959183],whale,navercorp,1
56,CVE-2012-4524,251986,sillycycle,xlockmore,3.0,xlockmore before 5.43 'dclock' security bypass vulnerability,cpe:2.3:a:sillycycle:xlockmore:3.0:*:*:*:*:*:*:*,"[[], [xlockmore], [before 5.43]]","[[], [0.99722517], [0.99928296]]",[],[xlockmore],[before 5.43],[],[0.99722517],[0.99928296],[],[],[xlockmore],[0.99722517],xlockmore,sillycycle,1


In [None]:
preprocessed_ner = classify_version_string('before 5.43')
generated_versions = generate_versions(*preprocessed_ner)

In [None]:
generated_versions[10000:10020]

['1.0.0', '1.0.1', '1.0.2', '1.0.3', '1.0.4', '1.0.5', '1.0.6', '1.0.7', '1.0.8', '1.0.9', '1.0.10', '1.0.11', '1.0.12', '1.0.13', '1.0.14', '1.0.15', '1.0.16', '1.0.17', '1.0.18', '1.0.19']

In [None]:
created_cpe = []
for v in generated_versions:
    if v[0] == '1':
        created_cpe.append(f'cpe:2.3:a:sillycycle:xlockmore:{v}:*:*:*:*:*:*:*')

In [67]:
df_test[(df_test['matched_db_product'] == df_test['product']) &
        (df_test['matched_db_vendor'] == df_test['vendor']) &
        (df_test['true_version_in_predicted'] == 1)].sort_values('cve_id')[['cve_id', 'vendor', 'matched_db_vendor', 'product', 'matched_db_product', 'version']]

Unnamed: 0,cve_id,vendor,matched_db_vendor,product,matched_db_product,version
47,CVE-2007-2829,madwifi,madwifi,madwifi,madwifi,0.9.2
145,CVE-2007-5391,hp,hp,select_identity,select_identity,4.1.8
151,CVE-2007-5630,bbsprocess,bbsprocess,bbportals,bbportals,1.6.2
168,CVE-2008-3168,empire_server,empire_server,empire_server,empire_server,4.2.20
185,CVE-2008-5335,php-fusion,php-fusion,php-fusion,php-fusion,6.01.15
102,CVE-2013-1881,gnome,gnome,librsvg,librsvg,2.18.2
99,CVE-2013-4580,gitlab,gitlab,gitlab,gitlab,4.0.0
103,CVE-2014-2341,cubecart,cubecart,cubecart,cubecart,5.2.3
170,CVE-2014-4301,ajenti,ajenti,ajenti,ajenti,1.2.6
98,CVE-2015-9277,mailenable,mailenable,mailenable,mailenable,6.5


In [100]:
df_test[(df_test['matched_db_product_lev'] == df_test['product']) &
        (df_test['matched_db_vendor_lev'] == df_test['vendor']) &
        (df_test['true_version_in_predicted'] == 1)].sort_values('cve_id')[['cve_id', 'vendor', 'matched_db_vendor_lev', 'matched_db_vendor' , 'product', 'matched_db_product_lev', 'matched_db_product', 'version', 'score_lev']]

Unnamed: 0,cve_id,vendor,matched_db_vendor_lev,matched_db_vendor,product,matched_db_product_lev,matched_db_product,version,score_lev
47,CVE-2007-2829,madwifi,madwifi,madwifi,madwifi,madwifi,madwifi,0.9.2,1.0
145,CVE-2007-5391,hp,hp,hp,select_identity,select_identity,select_identity,4.1.8,0.933333
151,CVE-2007-5630,bbsprocess,bbsprocess,bbsprocess,bbportals,bbportals,bbportals,1.6.2,1.0
168,CVE-2008-3168,empire_server,empire_server,empire_server,empire_server,empire_server,empire_server,4.2.20,0.923077
185,CVE-2008-5335,php-fusion,php-fusion,php-fusion,php-fusion,php-fusion,php-fusion,6.01.15,1.0
119,CVE-2011-2727,tribiq,tribiq,oracle,tribiq_cms,tribiq_cms,financial_services_basel_regulatory_capital_basic,5.0.9,0.666667
191,CVE-2012-6713,wp-jobmanager,wp-jobmanager,hitachi,job_manager,job_manager,job_management_partner_1\/it_desktop_management-manager,0.7.8,0.909091
102,CVE-2013-1881,gnome,gnome,gnome,librsvg,librsvg,librsvg,2.18.2,0.923077
99,CVE-2013-4580,gitlab,gitlab,gitlab,gitlab,gitlab,gitlab,4.0.0,1.0
103,CVE-2014-2341,cubecart,cubecart,cubecart,cubecart,cubecart,cubecart,5.2.3,1.0


In [104]:
df_test[
        (df_test['matched_db_product_lev'] != df_test['product'])].sort_values('cve_id')[['cve_id', 'vendor', 'matched_db_vendor_lev', 'matched_db_vendor', 'product', 'product_ner', 'matched_db_product_lev', 'matched_db_product', 'version', 'score_lev']]

Unnamed: 0,cve_id,vendor,matched_db_vendor_lev,matched_db_vendor,product,product_ner,matched_db_product_lev,matched_db_product,version,score_lev
172,CVE-2004-0095,mcafee,network_associates,network_associates,epolicy_orchestrator,[epolicy orchestrator agent],epolicy_orchestrator_agent,epolicy_orchestrator_agent,3.6.0,0.923077
125,CVE-2005-2631,cisco,ibm,wpruby,network_admission_control_manager_and_server_system_software,[clean access],client_access,controlled_admin_access,3.5.2,0.8
11,CVE-2006-5093,paul_schudar,tagit,tagit,tagmin_control_center,[tagboard],tagboard,tagboard,2.1.b_build_2,1.0
80,CVE-2006-6274,expinion.net,,,news_manager,[],,,,0.0
52,CVE-2007-0851,trend_micro,,,web_security_suite,[],,,1.2.0,0.0
120,CVE-2007-3381,gnome,dell,dell,gdm,[display manager],display_manager,display_manager,2.2,0.933333
71,CVE-2009-1729,sun,sb-websoft,professional_home_page_tools,java_system_communications_express,"[java system communications, personal address book]",addressbook,professional_home_page_tools_guestbook,6.3,0.6875
68,CVE-2012-4524,sillycycle,,,xlockmore,[],,,2.3,0.0
5,CVE-2013-3607,supermicro,,,x9dax-if,[],,,-,0.0
35,CVE-2013-6440,internet2,shibboleth,shibboleth,opensaml,[opensaml-java],opensaml_java,opensaml_java,2.2.0,0.923077


* Для CVE-2007-6487 продукта webgui есть вендор plainblack и plain_black в БД

* Для CVE-2004-0095 есть продукт epolicy_orchestrator_agent и epolicy_orchestrator в БД

* Для CVE-2007-3381 продукт gdm -- абревиатура, не найти такой продукт в БД

* CVE-2013-6440 есть продукт opensaml и opensaml_java в БД

* CVE-2014-7221 есть продукт teamspeak и teamspeak3 в БД

* CVE-2020-15003 дубли продукта в БД open-xchange_appsuite и ox_app_suite, оба версии 7.10.5
7.10.5

In [None]:
epolicy_orchestrator

In [None]:
'job_manager' in unique_products

True