In [23]:
from langchain_community.document_loaders import DirectoryLoader, JSONLoader, TextLoader, UnstructuredFileLoader, UnstructuredHTMLLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
import os
import pathlib
import subprocess

In [24]:
def setup_ollama():
    """
    Stops the ollama system service and starts ``ollama serve`` in the background.

    The install step is not performed here (it is left commented out below), so
    ollama must already be installed.

    Returns:
        subprocess.Popen: Handle of the spawned ``ollama serve`` process. Keep it
        if the server should be stopped later via ``terminate()``.
    """
    # One-time install / host configuration, intentionally left disabled:
    # os.system("curl -fsSL https://ollama.com/install.sh | sh")
    # os.system("export OLLAMA_HOST=localhost:8888")

    # Best effort: the exit status is ignored, as before.
    # Presumably stops the service so it does not clash with the server started below.
    os.system("sudo service ollama stop")

    # subprocess.DEVNULL discards the server output without opening os.devnull by hand.
    # The handle is returned instead of being dropped so the caller can manage the process.
    return subprocess.Popen(
        "ollama serve",
        shell=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

In [25]:
def _read_title(file_path):
    """Returns the first non-empty title found in the file, or None if there is none."""
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            segments = line.split(':')
            if len(segments) >= 2 and 'title' in segments[0].lower():
                # Only the text between the first and second colon is used as the title
                title = segments[1].strip()
                if title:
                    return title
    return None


def txt_file_rename(directory):
    """
    Renames .txt files after the title declared inside them.

    Each .txt file directly inside ``directory`` is scanned for the first line of
    the form ``<something containing "title">: <name>``. The file is then renamed
    to ``<name>`` plus its original extension. Files without such a line, or with
    an empty title, are left untouched. A file is never renamed onto another
    existing file; that case is reported and skipped.

    Args:
        directory (str): path to directory where files are stored
    """
    # Snapshot the listing so renames cannot disturb the iteration
    for file_path in list(pathlib.Path(directory).glob('*.txt')):
        # The file is fully read and closed before it is renamed
        name = _read_title(file_path)
        if not name:
            continue

        file_ext = os.path.splitext(file_path.name)[1]
        new_file_name = os.path.join(directory, name + file_ext)

        # Already carries its title: nothing to do
        if os.path.abspath(new_file_name) == os.path.abspath(file_path):
            continue
        # os.rename would silently replace the target on POSIX and raise on Windows
        if os.path.exists(new_file_name):
            print(f"Skipped {file_path}: {new_file_name} already exists.")
            continue

        try:
            os.rename(file_path, new_file_name)
        except FileNotFoundError:
            print(f"FileNotFoundError: {file_path} not found.")
        except PermissionError:
            print(f"Permission denied: cannot rename {file_path}.")
        except NotADirectoryError:
            print(f"Not a directory: {new_file_name}")

In [26]:
def get_file_types(directory):
    """
    Collects the distinct file extensions found directly inside a directory.

    Only regular files are considered; sub-directories are ignored and not
    descended into. A file without an extension contributes the empty string.

    Args:
        directory (str): Path to directory

    Returns:
        Set[str]: Extensions (including the leading dot) of the files in the directory
    """
    return {
        os.path.splitext(entry)[1]
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    }

In [27]:
# Loader class to use for each file extension seen so far in the cyber data directory,
# grouped by loader. Extensions are matched exactly (case-sensitive).
loaders = {
    # Source code and other free-form files go through the generic unstructured loader
    **dict.fromkeys(
        ('.php', '.cs', '', '.c', '.tzt', '.java', '.ps1', '.delphi', '.asm'),
        UnstructuredFileLoader,
    ),
    '.html': UnstructuredHTMLLoader,
    '.md': UnstructuredMarkdownLoader,
    **dict.fromkeys(('.txt', '.TXT'), TextLoader),
    '.json': JSONLoader,
}

In [28]:
def create_directory_loader(file_type, directory_path):
    """
    Builds the loader(s) used to read every file of one type from a directory.

    Args:
        file_type (str): File extension including the leading dot (e.g. '.md')
        directory_path (str): Path to directory

    Returns:
        list[JSONLoader] | DirectoryLoader: For '.json', a list holding one JSONLoader
        per directory entry whose name ends in '.json'. For every other type, a single
        DirectoryLoader using the glob pattern ``**/*<file_type>`` and the loader class
        registered in ``loaders`` (UnstructuredFileLoader when none is registered).
    """
    if file_type != '.json':
        loader_cls = loaders.get(file_type, UnstructuredFileLoader)
        return DirectoryLoader(
            path=directory_path,
            glob=f"**/*{file_type}",
            loader_cls=loader_cls,
        )

    # JSON files get one loader each; jq_schema='.' selects the whole document
    return [
        JSONLoader(file_path=directory_path + '/' + entry, jq_schema='.', text_content=False)
        for entry in os.listdir(directory_path)
        if entry.endswith('.json')
    ]

In [29]:
def split_text(docs, chunk_size=512, chunk_overlap=64):
    """
    Breaks documents into smaller chunks using RecursiveCharacterTextSplitter.

    Parameters:
        docs: Documents to split; handed unchanged to ``split_documents``.
        chunk_size (int): ``chunk_size`` passed to the splitter (default 512).
        chunk_overlap (int): ``chunk_overlap`` passed to the splitter (default 64).

    Returns:
        Whatever ``split_documents`` returns, presumably a list of Document chunks.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)

In [30]:
def load_documents(directory):
    """
    Loads every supported file in a directory and returns its content as chunks.

    The .txt files are first renamed via ``txt_file_rename``. Then, for each file
    extension present in the directory (files without an extension are skipped),
    the matching loader(s) are created, the files are loaded, and the result is
    split with ``split_text``.

    Parameters:
        directory (str): Path to the directory holding the files to load.

    Returns:
        list: All chunks produced by ``split_text`` across every file type.
    """
    txt_file_rename(directory)

    documents = []
    for file_type in get_file_types(directory):
        # Extensionless files are not loaded
        if not file_type.strip():
            continue

        created = create_directory_loader(file_type, directory)
        # '.json' yields a list of loaders; every other type yields a single loader
        type_loaders = created if file_type == '.json' else [created]

        for loader in type_loaders:
            chunks = split_text(loader.load())
            if chunks:
                documents.extend(chunks)
    return documents

In [31]:
def create_knowledgeBase(directory, vectorstore):
    """
    Embeds the documents of a directory and adds them to the FAISS index on disk.

    Documents are loaded and chunked with ``load_documents``, embedded with the
    ``mxbai-embed-large`` ollama model, and stored in a FAISS index. If an index
    already exists in the target folder, the new vectors are merged into it;
    otherwise a new index is written.

    Parameters:
        directory (str): Directory holding the files to index.
        vectorstore (str | os.PathLike): Folder in which the FAISS index is persisted.
            Any other kind of value is ignored and the module-level ``DB_FAISS_PATH``
            is used instead.
    """
    # The caller passes the index folder in this argument; keep the global as a
    # fallback for callers that pass something that is not a path.
    if isinstance(vectorstore, (str, os.PathLike)):
        index_dir = os.fspath(vectorstore)
    else:
        index_dir = DB_FAISS_PATH

    documents = load_documents(directory)
    if not documents:
        # FAISS.from_documents cannot build an index from zero documents
        print(f"No documents found in {directory}; vector store left unchanged.")
        return

    os.system("ollama pull mxbai-embed-large")
    embeddings = OllamaEmbeddings(model="mxbai-embed-large", show_progress=True)
    new_store = FAISS.from_documents(documents=documents, embedding=embeddings)

    if os.path.exists(os.path.join(index_dir, 'index.faiss')):
        # NOTE(review): recent langchain_community releases refuse to load a pickled
        # index unless allow_dangerous_deserialization=True is passed. Confirm against
        # the installed version before adding it.
        store = FAISS.load_local(index_dir, embeddings)
        # merge_from expects the other FAISS store, not a path
        store.merge_from(new_store)
    else:
        store = new_store
    store.save_local(index_dir)

In [32]:
def move_files(directory, processed_directory='../../processed_cyber_data'):
    """
    Moves files from the unprocessed data directory to the processed data directory.

    Only regular files directly inside ``directory`` are moved; sub-directories
    are left in place. The target directory is created if it does not exist.
    A file already present in the target under the same name is replaced.

    Parameters:
        directory (str): Directory whose files should be moved.
        processed_directory (str): Directory that receives the files.
    """
    source_dir = pathlib.Path(directory)
    target_dir = pathlib.Path(processed_directory)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Snapshot the listing before files start leaving the directory
    for file_path in list(source_dir.iterdir()):
        if not file_path.is_file():
            continue
        # file_path.name already includes the extension
        os.replace(file_path, target_dir / file_path.name)

In [33]:
# Defined at module level (not only under the main guard) because
# create_knowledgeBase looks up DB_FAISS_PATH as a module-level global.
DB_FAISS_PATH = '../vectorstore'
DATA_PATH = '../../unprocessed_cyber_data'

if __name__=="__main__":
    setup_ollama()
    # NOTE(review): indexing is currently disabled, so the files below are moved to
    # the processed directory without being embedded. Confirm this is intended.
    # create_knowledgeBase(DATA_PATH, DB_FAISS_PATH)
    move_files(DATA_PATH)

Renamed BWL Advanced FAQ Manager 2.0.3 - Authenticated SQL Injection.txt to BWL Advanced FAQ Manager 2.0.3 - Authenticated SQL Injection
Renamed Blackcat Cms v1.4 - Remote Code Execution (RCE).txt to Blackcat Cms v1.4 - Remote Code Execution (RCE)
Renamed Jedox 2020.2.5 - Remote Code Execution via Executable Groovy-Scripts.txt to Jedox 2020.2.5 - Remote Code Execution via Executable Groovy-Scripts
Renamed PHPJabbers Night Club Booking 1.0 - Reflected XSS.txt to PHPJabbers Night Club Booking 1.0 - Reflected XSS
Renamed Joomla Solidres 2.13.3 - Reflected XSS.txt to Joomla Solidres 2.13.3 - Reflected XSS
Renamed Stored XSS in Microweber.txt to Stored XSS in Microweber
Renamed Microsoft Office 365 Version 18.2305.1222.0 - Elevation of Privilege + RCE..txt to Microsoft Office 365 Version 18.2305.1222.0 - Elevation of Privilege + RCE.
Renamed Wordpress Sonaar Music Plugin 4.7 - Stored XSS.txt to Wordpress Sonaar Music Plugin 4.7 - Stored XSS
Renamed Wordpress Sonaar Music Plugin 4.7 - Stored

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest 
pulling 819c2adf5ce6... 100% ▕████████████████▏ 669 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling b837481ff855... 100% ▕████████████████▏   16 B                         
pulling 38badd946f91... 100% ▕████████████████▏  408 B                         
verifying sha256 digest 
writing manifest 
removing any unused layers 
success [?25h
OllamaEmbeddings: 100%|██████████| 20204/20204 [33:21<00:00, 10.09it/s]
