In [17]:
from langchain_community.document_loaders import DirectoryLoader, JSONLoader, TextLoader, UnstructuredFileLoader, UnstructuredHTMLLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
import os
import subprocess

In [18]:
def setup_ollama():
        """
        Restarts Ollama as a background server started from this process.

        Stops the ``ollama`` system service (``sudo service ollama stop``) and then
        launches ``ollama serve`` through the shell with stdout and stderr discarded.
        Nothing is downloaded or installed here; the install command is only kept
        below as a commented-out reference.

        Returns:
            subprocess.Popen: Handle for the shell command running ``ollama serve``,
            so the caller can poll, wait on, or terminate it. Callers that ignore
            the return value behave exactly as before.
        """
        # os.system("curl -fsSL https://ollama.com/install.sh | sh")
        # os.system("export OLLAMA_HOST=localhost:8888")
        os.system("sudo service ollama stop")
        cmd = "ollama serve"
        # subprocess.DEVNULL discards the output without opening our own handle on os.devnull.
        return subprocess.Popen(
                cmd,
                shell=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
        )

In [19]:
def get_file_types(directory):
        """
        Collects the distinct file extensions of the files directly inside a directory.

        Only the top level of ``directory`` is inspected; entries that are not files
        (for example sub-directories) are ignored. Extensions are reported exactly as
        ``os.path.splitext`` yields them, so they keep their case and leading dot, and
        a file without an extension contributes the empty string.

        Args:
            directory (str): Path to directory

        Returns:
            Set[str]: Every extension seen among the files in the directory
        """
        return {
                os.path.splitext(entry)[1]
                for entry in os.listdir(directory)
                if os.path.isfile(os.path.join(directory, entry))
        }

In [20]:
# Loader class used for each type of file found in the cyber data directory (so far).
# Keys are case-sensitive extensions with their leading dot; '' is the key for files
# that have no extension.
_SPECIAL_LOADERS = {
    '.html': UnstructuredHTMLLoader,
    '.md': UnstructuredMarkdownLoader,
    '.txt': TextLoader,
    '.TXT': TextLoader,
    '.json': JSONLoader,
}
_KNOWN_EXTENSIONS = (
    '.php', '.cs', '', '.c', '.html', '.md', '.tzt', '.java',
    '.txt', '.ps1', '.delphi', '.asm', '.TXT', '.json',
)
# Any known extension without a special-case entry is read with UnstructuredFileLoader.
loaders = {
    extension: _SPECIAL_LOADERS.get(extension, UnstructuredFileLoader)
    for extension in _KNOWN_EXTENSIONS
}

In [21]:
def create_directory_loader(file_type, directory_path):
        """
        Builds the loader(s) for all files of one type inside a directory.

        Args:
            file_type (str): File extension including the leading dot (e.g. '.md')
            directory_path (str): Path to directory

        Returns:
            list[JSONLoader] | DirectoryLoader: For '.json', a (possibly empty) list
            holding one JSONLoader per JSON file directly inside ``directory_path``,
            in sorted filename order. For every other type, a single DirectoryLoader
            globbing ``**/*<file_type>`` that uses the class registered in ``loaders``
            and falls back to UnstructuredFileLoader for unregistered types.
        """
        if file_type == '.json':
                # NOTE(review): this branch only scans the top level of the directory,
                # while the DirectoryLoader glob below starts with '**/'. Confirm whether
                # nested JSON files are meant to be loaded as well.
                json_paths = (
                        os.path.join(directory_path, file_name)
                        for file_name in sorted(os.listdir(directory_path))
                        if file_name.endswith('.json')
                )
                # Skip anything that merely has a '.json' name but is not a regular file
                # (e.g. a directory), which a JSONLoader could not read.
                return [
                        JSONLoader(file_path=json_path, jq_schema='.', text_content=False)
                        for json_path in json_paths
                        if os.path.isfile(json_path)
                ]

        return DirectoryLoader(
                path=directory_path,
                glob=f"**/*{file_type}",
                loader_cls=loaders.get(file_type, UnstructuredFileLoader))

In [22]:
def split_text(docs, chunk_size=512, chunk_overlap=50):
        """
        Breaks documents into overlapping chunks with RecursiveCharacterTextSplitter.

        Parameters:
                docs: The loaded documents to split; passed unchanged to ``split_documents``.
                chunk_size (int): Forwarded to the splitter as its ``chunk_size``.
                chunk_overlap (int): Forwarded to the splitter as its ``chunk_overlap``.

        Returns:
                The result of ``split_documents`` for ``docs`` (presumably a list of
                chunked documents; confirm against the splitter's documentation).
        """
        return RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
        ).split_documents(docs)

In [25]:
if __name__=="__main__":
        # Pipeline: start Ollama, load every supported file under DATA_PATH, split the
        # documents into chunks, and tag each chunk with the name of its source file.
        setup_ollama()

        DB_FAISS_PATH = '../vectorstore'
        DATA_PATH = '../../unprocessed_cyber_data'

        file_types = get_file_types(DATA_PATH)
        documents = []

        for file_type in file_types:
                # Files without an extension (file_type == '') are skipped.
                if file_type.strip() == "":
                        continue

                # create_directory_loader returns a list of JSONLoaders for '.json' and a
                # single DirectoryLoader otherwise; normalise to a list so both cases
                # share one load/split path.
                created = create_directory_loader(file_type, DATA_PATH)
                loader_list = created if isinstance(created, list) else [created]

                for loader in loader_list:
                        docs = loader.load()
                        chunks = split_text(docs)
                        if chunks:
                                documents.extend(chunks)

        for document in documents:
                # Use the file name without its directory and final extension. The previous
                # replace('/', '.').split('.')[-2] picked the wrong piece for names that
                # contain more than one dot (e.g. 'a.b.txt' gave 'b').
                source_name = os.path.splitext(os.path.basename(document.metadata['source']))[0]
                document.page_content += ' Source: ' + source_name

        print(documents)
        # os.system("ollama pull mxbai-embed-large")
        # embeddings=OllamaEmbeddings(model="mxbai-embed-large", show_progress=True)
        # vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)
        # vectorstore.save_local(DB_FAISS_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '../../unprocessed_cyber_data'

In [24]:
import os
import glob
import re
import pathlib
# Folder to inspect, given relative to the current working directory
directory = '../../cyber_data'
# Go through every entry in the folder and print the names ending in '.txt'
for filename in os.listdir(directory):
    if not filename.endswith('.txt'):
        continue
    print(filename)
# for file_path in files:
#     # Extract filename from the path
#     file_name = os.path.basename(file_path)
#     # Check if the file starts with a number and ends with .txt
#     if re.match(r'^\d.*\.txt$', file_name):
#         # Open each file and read the first line
#         with open(file_path, 'r', encoding='utf-8') as file:
#             first_line = file.readline().strip()
#         # Extract file extension
#         file_ext = os.path.splitext(file_name)[1]
#         # Create the new file name using the first line content
#         new_file_name = os.path.join(directory, first_line + file_ext)
#         # Rename the file
#         try:
#             os.rename(file_path, new_file_name)
#             print(f'Renamed {file_name} to {first_line + file_ext}')
#         except FileNotFoundError:
#             print(f"FileNotFoundError: {file_path} not found.")
#     else:
#         print(f"Skipping {file_name} - does not match renaming criteria.")

FileNotFoundError: [Errno 2] No such file or directory: '../../cyber_data'

In [None]:
# Preview only the first few chunks; displaying the whole list floods the notebook output.
documents[:5]

[Document(metadata={'source': '../../cyber_data/51749.TXT'}, page_content='## Title: Equipment Rental Script-1.0 - SQLi\n## Author: nu11secur1ty\n## Date: 09/12/2023\n## Vendor: https://www.phpjabbers.com/\n## Software: https://www.phpjabbers.com/equipment-rental-script/#sectionDemo\n## Reference: https://portswigger.net/web-security/sql-injection Source: 51749'),
 Document(metadata={'source': '../../cyber_data/51749.TXT'}, page_content="## Description:\nThe package_id parameter appears to be vulnerable to SQL injection\nattacks. The payload ' was submitted in the package_id parameter, and\na database error message was returned. You should review the contents\nof the error message, and the application's handling of other input,\nto confirm whether a vulnerability is present. The attacker can steal\nall information from the database!\n\n[+]Payload:\nmysql Source: 51749"),
 Document(metadata={'source': '../../cyber_data/51749.TXT'}, page_content='[+]Payload:\nmysql\n\nParameter: #1* ((cu