In [7]:
import json
import os

import numpy as np
import pandas as pd
import redis
from redis.commands.json.path import Path
from redis.commands.search.field import (
    NumericField,
    TagField,
    TextField,
    VectorField,
)
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from sentence_transformers import SentenceTransformer


In [8]:
# Connection settings are read from the environment so that credentials are
# never stored in the notebook. When REDIS_USERNAME / REDIS_PASSWORD are not
# set they resolve to None, i.e. an unauthenticated connection.
# decode_responses=True makes the client return str instead of bytes.
redis_connection = redis.Redis(
    host=os.environ.get('REDIS_HOST', 'snf-42929.ok-kno.grnetcloud.net'),
    port=int(os.environ.get('REDIS_PORT', '6379')),
    username=os.environ.get('REDIS_USERNAME'),
    password=os.environ.get('REDIS_PASSWORD'),
    decode_responses=True,
)

In [9]:
# Source passages; the 'text' column is what gets embedded further down.
papers_df = pd.read_csv(filepath_or_buffer='data/cleaned_db_text.csv')

# Sentence-embedding model used for both the stored passages and the queries.
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

In [42]:
# Constants
VECTOR_DIM = 768              # length of the vectors
VECTOR_NUMBER = 191           # initial number of vectors
INDEX_NAME = "idx:bf5"        # name of the search index
PREFIX = "bugframework:"      # prefix for the document keys
DISTANCE_METRIC = "COSINE"    # distance metric used by the vector field

# Schema of the search index: two un-stemmed text fields plus a FLAT
# (brute-force) FLOAT32 vector field, all addressed by JSONPath and exposed
# under short aliases.
schema = (
    TextField('$.paper', no_stem=True, as_name='paper'),
    TextField('$.text', no_stem=True, as_name='text'),
    VectorField(
        '$.content_vector',
        'FLAT',
        {
            "TYPE": 'FLOAT32',
            "DIM": VECTOR_DIM,
            "DISTANCE_METRIC": DISTANCE_METRIC,
        },
        as_name='content_vector',
    ),
)

# Create the index only when it does not exist yet.
try:
    redis_connection.ft(INDEX_NAME).info()
    print("Index already exists")
except redis.exceptions.ResponseError:
    # Only a server-side error reply (e.g. unknown index) means the index is
    # missing. Connection or authentication failures must propagate instead of
    # being mistaken for "index not found".
    redis_connection.ft(INDEX_NAME).create_index(
        fields=schema,
        definition=IndexDefinition(prefix=[PREFIX], index_type=IndexType.JSON),
    )

In [25]:
def get_json_entry(text: str, paper: str):
    """Build the JSON document for one passage.

    Args:
        text: Passage to store; it is also the input to the embedding model.
        paper: Title of the paper the passage belongs to.

    Returns:
        A dict with the keys ``text``, ``paper`` and ``content_vector``, where
        ``content_vector`` is the output of the module-level ``model`` for
        ``text``, cast to float32 and converted to a plain Python list.
    """
    embedding = model.encode(text)
    content_vector = embedding.astype(np.float32).tolist()
    return {"text": text, "paper": paper, "content_vector": content_vector}

In [26]:
def extract_paper_title_from_entry(entry: str) -> str:
    """Return the part of ``entry`` before its first hyphen, stripped.

    When ``entry`` contains no hyphen the whole string (stripped of
    surrounding whitespace) is returned.
    """
    head, _, _ = entry.partition("-")
    return head.strip()

In [27]:
# Plain Python list of the passages in the 'text' column, in row order.
texts = papers_df['text'].tolist()

In [18]:
# One JSON document per passage; the paper title is taken from the passage
# itself and the passage is embedded inside get_json_entry.
entries = [
    get_json_entry(passage, extract_paper_title_from_entry(passage))
    for passage in texts
]

In [28]:
# Store every entry as a JSON document under "<PREFIX><zero-padded position>".
# The key prefix comes from PREFIX so the keys always match the prefix the
# search index is defined on. Commands are queued on a non-transactional
# pipeline and sent in a single batch instead of one round trip per document.
pipeline = redis_connection.pipeline(transaction=False)
for i, entry in enumerate(entries):
    redis_key = f"{PREFIX}{i:04}"
    pipeline.json().set(redis_key, Path.root_path(), entry)
pipeline.execute()

# Number of documents written; kept under its original name.
counter = len(entries)

In [46]:
TOP_K = 10  # number of nearest neighbours to retrieve; adjust as needed

# NOTE(review): "diffirences" is misspelled. The text is left unchanged here
# because it is the model input and changing it changes the embedding and
# therefore the results - confirm before correcting.
question = "diffirences between the Bug Framework and CWE"

# The query vector is passed as raw float32 bytes through the $vec parameter.
question_vector = model.encode(question).astype(np.float32).tobytes()
query = f"*=>[KNN {TOP_K} @content_vector $vec as score]"
params = {'vec': question_vector}

# KNN over all documents, ordered by the returned distance ('score').
# Query dialect 2 is requested explicitly.
knn_query = (
    Query(query)
    .sort_by('score')
    .return_fields('text', 'score')
    .dialect(2)
)

# Execute the query against the index created above (INDEX_NAME) rather than
# a second hard-coded copy of its name.
results = redis_connection.ft(INDEX_NAME).search(knn_query, query_params=params)

# Print the results
for doc in results.docs:
    print(f"Paper Title: {doc.text}, Score: {doc.score}")
    print("\n")

Paper Title: Input/Output Check Bugs Taxonomy: Injection Errors in Spotlight- VIII. CONCLUSIONThe CWE digraphs by BF class consequences should bedeeply analyzed. Generation of digraphs with CWEs relatedto particular software errors (e.g., injection errors), detectingcorresponding clusters, and understanding their relationshipswould create a comprehensive view of the CWE modelfor researchers and practitioners. In turn, comparing andcontrasting the CWE's exhaustive list of weaknesses with allthe possible consequence-cause transitions to other BF classeswould improve BF as a tool for describing CVEs., Score: 0.411394834518


Paper Title: Classifying Memory Bugs Using Bugs Framework Approach- V. THE BF MEMORY CLASSES AS CWE EXTENSIONThe BF Memory Bugs model reflects the lifecycle of an object. While the pillar CWE-664 reflects the"lifetime of creation, use, and release" of a resource, it isquite broad. It is the parent of many CWEs that are not strictlymemory-related. We use asterisks (*) 