Skip to content

Commit

Permalink
feat: find common strings in CONTAINS_PATTERNS from helper_scripts.py (
Browse files Browse the repository at this point in the history
…closes #1353) (#1586)

* closes #1353
  • Loading branch information
rhythmrx9 committed Mar 10, 2022
1 parent 1635fa4 commit 9c73442
Show file tree
Hide file tree
Showing 5 changed files with 195 additions and 38 deletions.
21 changes: 20 additions & 1 deletion cve_bin_tool/checkers/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -278,13 +278,16 @@ that include this product. For our example all listings except
example SQL query).

## Helper-Script
Helper-Script is a tool that takes *packages*(i.e. busybox_1.30.1-4ubuntu9_amd64.deb) as input and returns:
Helper-Script is a tool that takes a *package* (e.g. busybox_1.30.1-4ubuntu9_amd64.deb) as input and returns:

> 1. `CONTAINS_PATTERNS` - list of commonly found strings in the binary of the product
> 2. `FILENAME_PATTERNS` - list of different filenames for the product
> 3. `VERSION_PATTERNS` - list of version patterns found in binary of the product.
> 4. `VENDOR_PRODUCT` - list of vendor product pairs for the product as they appear in NVD.
Helper-Script can also take multiple packages and a `PRODUCT_NAME` (required) as input, and returns
the strings common to all of them for `CONTAINS_PATTERNS`.

Usage: `python -m cve_bin_tool.helper_script`

```
Expand Down Expand Up @@ -357,6 +360,22 @@ class BusyboxChecker(Checker):

Try this against a few more `busybox` packages across different `distros` and see which strings are common among the following. Then follow the above steps to create the checker.

To get common strings for `CONTAINS_PATTERNS` in multiple `busybox` packages, we can use the script like this:

```
windows > python3 -m cve_bin_tool.helper_script busybox_1.30.1-4ubuntu6_amd64.deb busybox-1.33.0-3.fc34.x86_64.rpm --product busybox
linux $ python3 -m cve_bin_tool.helper_script busybox_1.30.1-4ubuntu6_amd64.deb busybox-1.33.0-3.fc34.x86_64.rpm --product busybox
─────────────────────────────────────────────────────── Common CONTAINS_PATTERNS strings for BusyboxChecker──────────────────────────
class BusyboxChecker(Checker):
CONTAINS_PATTERNS = [
r"BusyBox is a multi-call binary that combines many common Unix",
r"BusyBox is copyrighted by many authors between 1998-2015.",
r"link to busybox for each function they wish to use and BusyBox",
]
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
```

> _***NOTE:*** If you look at our existing checkers, you'll see that some strings are commented out in `CONTAINS_PATTERNS`. These strings are kept there as potential replacements in case the currently used strings stop working in future versions. If you also find more than 2-3 strings, it's recommended to comment them out for future reference._
Currently, if you receive multiple vendor-product pairs, manually select the appropriate vendor-product pair from the pairs obtained. In this case, it is `[('busybox', 'busybox')]`.
Expand Down
132 changes: 100 additions & 32 deletions cve_bin_tool/helper_script.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,44 @@
# Copyright (C) 2021 Intel Corporation
# SPDX-License-Identifier: GPL-3.0-or-later

from __future__ import annotations

import argparse
import os
import re
import sys
import textwrap
from collections import ChainMap
from logging import Logger

from rich import print as rprint
from rich.console import Console

from cve_bin_tool.cvedb import CVEDB, DBNAME, DISK_LOCATION_DEFAULT
from cve_bin_tool.error_handler import ErrorHandler, ErrorMode, UnknownArchiveType
from cve_bin_tool.extractor import Extractor
from cve_bin_tool.extractor import Extractor, TempDirExtractorContext
from cve_bin_tool.log import LOGGER
from cve_bin_tool.util import DirWalk
from cve_bin_tool.version_scanner import VersionScanner

WARNED = False


class HelperScript:
"""Helps contributors who want to write a new cve-bin-tool checker find common filenames, version strings, and other necessary data for building a binary checker"""

CONSOLE = Console()
LOGGER = LOGGER.getChild("HelperScript")
LOGGER: Logger = LOGGER.getChild("HelperScript")

def __init__(
self, filename, product_name=None, version_number=None, string_length=40
self,
filename: str,
product_name: str | None = None,
version_number: str | None = None,
string_length: int = 40,
):
self.filename = filename
self.extractor = Extractor()
self.extractor: TempDirExtractorContext = Extractor()
self.product_name, self.version_number = self.parse_filename(filename)
if product_name:
self.product_name = product_name
Expand All @@ -45,20 +54,20 @@ def __init__(
self.walker = DirWalk().walk

# for output (would use in future)
self.contains_patterns = []
self.filename_pattern = []
self.version_pattern = []
self.vendor_product = self.find_vendor_product()
self.contains_patterns: list[str] = []
self.filename_pattern: list[str] = []
self.version_pattern: list[str] = []
self.vendor_product: list[tuple[str, str]] | None = self.find_vendor_product()

# for scanning files versions
self.version_scanner = VersionScanner()

def extract_and_parse_file(self, filename):
def extract_and_parse_file(self, filename: str) -> list[str] | None:
"""extracts and parses the file for common patterns, version strings and common filename patterns"""

with self.extractor as ectx:
if ectx.can_extract(filename):
binary_string_list = []
binary_string_list: list[str] = []
for filepath in self.walker([ectx.extract(filename)]):
clean_path = self.version_scanner.clean_file_path(filepath)
LOGGER.debug(f"checking whether {clean_path} is binary")
Expand Down Expand Up @@ -100,7 +109,7 @@ def extract_and_parse_file(self, filename):
return self.contains_patterns
return binary_string_list

def search_pattern(self, file_content, pattern):
def search_pattern(self, file_content: str, pattern: str) -> list[str]:
"""find strings for CONTAINS_PATTERNS with product_name in them"""

file_content_list = file_content.split("\n")
Expand All @@ -112,7 +121,7 @@ def search_pattern(self, file_content, pattern):
) # TODO: regex highlight in these matched strings?
return matches

def search_version_string(self, matched_list):
def search_version_string(self, matched_list: list[str]) -> list[str]:
"""finds version strings from matched list"""

# TODO: add multiline string finding
Expand Down Expand Up @@ -142,7 +151,7 @@ def search_version_string(self, matched_list):
) # TODO: regex highlight in these matched strings?
return version_strings

def parse_filename(self, filename):
def parse_filename(self, filename: str) -> tuple[str, str]:
"""
returns package_name/product_name from package_filename of types .rpm, .deb, etc.
Example: package_filename = openssh-client_8.4p1-5ubuntu1_amd64.deb
Expand All @@ -160,7 +169,7 @@ def parse_filename(self, filename):
# example: libarchive-3.5.1-1-aarch64.pkg.tar.xz
elif filename.endswith(".deb") or filename.endswith(".ipk"):
product_name = filename.rsplit("_")[0]
version_number = filename.rsplit("_")[1]
version_number = filename.rsplit("_")[1].rsplit("-")[0].rsplit("+")[0]
# example: varnish_6.4.0-3_amd64.deb
else:
product_name = filename.rsplit("-", 2)[0]
Expand All @@ -175,7 +184,7 @@ def parse_filename(self, filename):
with ErrorHandler(mode=ErrorMode.NoTrace, logger=LOGGER):
raise UnknownArchiveType(filename)

def find_vendor_product(self):
def find_vendor_product(self) -> list[tuple[str, str]] | None:
"""find vendor-product pairs from database"""

LOGGER.debug(
Expand All @@ -197,7 +206,8 @@ def find_vendor_product(self):
# checking if (vendor, product) was found in the database
if data:
# warning the user to select the vendor-product pairs manually if multiple pairs are found
if len(data) != 1:
global WARNED
if len(data) != 1 and not WARNED:
LOGGER.warning(
textwrap.dedent(
f"""
Expand All @@ -208,6 +218,7 @@ def find_vendor_product(self):
"""
)
)
WARNED = True # prevent same warning multiple times
return data # [('vendor', 'product')]
else:
if self.product_name:
Expand Down Expand Up @@ -236,7 +247,7 @@ def find_vendor_product(self):

CVEDB.db_close(self)

def output(self):
def output_single(self) -> None:
"""display beautiful output for Helper-Script"""

self.CONSOLE.rule(f"[bold dark_magenta]{self.product_name.capitalize()}Checker")
Expand Down Expand Up @@ -312,8 +323,78 @@ def output(self):

self.CONSOLE.rule()

@staticmethod
def output_common(common_strings: list[str], product_name: str) -> None:
    """Display the common CONTAINS_PATTERNS strings as a ready-to-copy checker snippet.

    Args:
        common_strings: strings that were found in every scanned package.
        product_name: product name used to build the ``<Product>Checker`` title.

    Strings containing ".debug" or ".so" are printed in red and flagged as
    not recommended; every other string is printed in green followed by a comma.
    """
    # Local import so this block does not depend on a file-level import change.
    from rich.markup import escape

    checker_name = f"{product_name.capitalize()}Checker"

    HelperScript.CONSOLE.rule(
        f"[bold dark_magenta]Common CONTAINS_PATTERNS strings for {checker_name}"
    )
    rprint(f"[red]class[/] [blue]{checker_name}[/](Checker):")

    print("\tCONTAINS_PATTERNS = [")
    for common_string in sorted(common_strings):
        # The strings are extracted from binaries and may contain "[...]";
        # escape them so rich prints them literally instead of parsing them
        # as style tags (which would drop text or raise a markup error).
        shown = escape(common_string)
        if ".debug" in common_string or ".so" in common_string:
            rprint(
                f'\t\t[red]r"{shown}"[/] <--- not recommended to use this form of strings'
            )
        else:
            rprint(f'\t\t[green]r"{shown}"[/],')
    print("\t]")
    HelperScript.CONSOLE.rule()


def scan_files(args) -> None:
    """Scan the given files and print either a full checker or the common strings.

    Args:
        args: parsed command-line arguments; the keys read here are
            "filenames", "product_name", "version_number" and "string_length".

    With a single file, the file is parsed and the complete checker output is
    printed. With several files, "product_name" is required and only the
    CONTAINS_PATTERNS strings shared by all files are printed.
    """
    filenames = args["filenames"]
    LOGGER.debug(f"Given filenames: {filenames}")
    LOGGER.info("Scanning files")

    multiple_files = len(filenames) > 1

    # Validate before building any HelperScript, so no per-file setup work is
    # done for a run that is going to be rejected anyway.
    if multiple_files and not args["product_name"]:
        LOGGER.error("PRODUCT_NAME not in arguments")
        return None

    hs_list: list[HelperScript] = [
        HelperScript(
            filename,
            product_name=args["product_name"],
            version_number=args["version_number"],
            string_length=args["string_length"],
        )
        for filename in filenames
    ]

    if not multiple_files:  # zero or one file: full checker output per file
        for hs in hs_list:
            hs.extract_and_parse_file(hs.filename)
            hs.output_single()
        return None

    if args["version_number"]:
        LOGGER.warning(
            "VERSION_NUMBER in arguments, common strings may not be found if files have different versions"
        )

    for hs in hs_list:
        hs.extract_and_parse_file(hs.filename)

    # Keep only the strings present in every file's CONTAINS_PATTERNS.
    common_strings = set(hs_list[0].contains_patterns).intersection(
        *(hs.contains_patterns for hs in hs_list[1:])
    )

    HelperScript.output_common(sorted(common_strings), hs_list[0].product_name)


def main(argv=None) -> None:

argv = argv or sys.argv

Expand Down Expand Up @@ -383,20 +464,7 @@ def main(argv=None):

LOGGER.setLevel(args["log_level"].upper())

LOGGER.debug(f"Given filenames: {args['filenames']}")
LOGGER.info(f"Scanning only the first filename: '{args['filenames'][0]}'")
hs = HelperScript(
args["filenames"][0],
product_name=args["product_name"],
version_number=args["version_number"],
string_length=args["string_length"],
)

# Parsing, Extracting and Searching for version-strings
hs.extract_and_parse_file(args["filenames"][0])

# output on console
hs.output()
scan_files(args)


if __name__ == "__main__":
Expand Down
Binary file not shown.
Binary file not shown.

0 comments on commit 9c73442

Please sign in to comment.