In [13]:
from langchain_community.document_loaders import DirectoryLoader, JSONLoader, TextLoader, UnstructuredFileLoader, UnstructuredHTMLLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.document_transformers import DoctranPropertyExtractor
import os
import pathlib
import subprocess

In [14]:
def setup_ollama():
    """
    Stops the system ollama service and starts `ollama serve` in the background.

    Nothing is downloaded or installed here; the install command is only kept
    below as a reference. The server's stdout and stderr are discarded, and the
    function returns as soon as the process has been spawned, without waiting
    for the server to accept requests.

    Returns:
        subprocess.Popen: Handle of the spawned shell running `ollama serve`,
        so the caller can poll or terminate it.
    """
    # Manual setup, if ollama is not installed yet:
    # os.system("curl -fsSL https://ollama.com/install.sh | sh")
    # os.system("export OLLAMA_HOST=localhost:8888")
    os.system("sudo service ollama stop")
    cmd = "ollama serve"
    # subprocess.DEVNULL replaces a hand-opened os.devnull file object.
    return subprocess.Popen(
        cmd,
        shell=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

In [15]:
def txt_file_rename(directory):
    """
    Renames each .txt file in a directory after the title declared inside it.

    Every file is scanned for the first line that contains a colon and whose
    text before the first colon contains "title" (case-insensitive). The text
    between the first and second colon, stripped of surrounding whitespace,
    becomes the new base name; the original extension is kept. Title lines with
    an empty name are ignored. Files without a usable title line are left
    untouched.

    The rename is performed only after the file has been closed, and only once
    per file. An existing file is never overwritten: when the target name is
    already taken by another file, the file is skipped and a message is printed.

    Args:
        directory (str): path to directory where files are stored
    """
    # Snapshot the listing first so that files renamed below cannot be picked
    # up again by a directory scan that is still in progress.
    for file_path in list(pathlib.Path(directory).glob('*.txt')):
        file_name = os.path.basename(file_path)
        file_ext = os.path.splitext(file_name)[1]

        # Find the title while the file is open; rename after it is closed.
        name = None
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                segments = line.split(':')
                if len(segments) >= 2 and 'title' in segments[0].lower():
                    candidate = segments[1].strip()
                    if candidate:
                        name = candidate
                        break
        if name is None:
            continue

        new_file_name = os.path.join(directory, name + file_ext)
        if os.path.exists(new_file_name):
            # Either the file already carries its title, or another file owns
            # that name; in both cases renaming would gain nothing or lose data.
            if not os.path.samefile(file_path, new_file_name):
                print(f"Skipped {file_name}: {new_file_name} already exists.")
            continue

        try:
            os.rename(file_path, new_file_name)
            print(f'Renamed {file_name} to {name}')
        except FileNotFoundError:
            print(f"FileNotFoundError: {file_path} not found.")
        except PermissionError:
            print(f"Permission denied: You don't have the necessary permissions to rename {file_path}.")
        except NotADirectoryError:
            print(f"Not a directory: {new_file_name}")
        except OSError as error:
            # e.g. a title that is not a valid file name on this platform
            print(f"Could not rename {file_name}: {error}")

In [16]:
def get_file_types(directory):
    """
    Collects the distinct file extensions found directly inside a directory.

    Only regular files at the top level are looked at; sub-directories are not
    descended into. Extensions keep their leading dot and original case, and a
    file without an extension contributes the empty string.

    Args:
        directory (str): Path to directory

    Returns:
        Set[str]: All of the file types that can be found in the directory
    """
    return {
        os.path.splitext(entry)[1]
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    }

In [17]:
# Loader class to use for each file extension found in the cyber data directory (so far).
# Keys are written with their leading dot and are case-sensitive; '' is the key
# for files without an extension.
loaders = {
    # Extensions handled by the generic unstructured loader.
    **dict.fromkeys(
        ('.php', '.cs', '', '.c', '.tzt', '.java', '.ps1', '.delphi', '.asm'),
        UnstructuredFileLoader,
    ),
    # Plain text, in both spellings of the extension.
    **dict.fromkeys(('.txt', '.TXT'), TextLoader),
    # Formats with a dedicated loader.
    '.html': UnstructuredHTMLLoader,
    '.md': UnstructuredMarkdownLoader,
    '.json': JSONLoader,
}

In [18]:
def create_directory_loader(file_type, directory_path):
    """
    Creates the loader(s) for all files of one type in a directory.

    Args:
        file_type (str): File extension including the leading dot (e.g. '.md')
        directory_path (str): Path to directory

    Returns:
        List[JSONLoader] | DirectoryLoader: For '.json', a list with one
        JSONLoader per JSON file located directly in the directory (not
        recursive), ordered by path. For every other type, a single
        DirectoryLoader globbing "**/*<file_type>" with the loader class
        registered in `loaders`, falling back to UnstructuredFileLoader.
    """
    if file_type == '.json':
        # Sorted so the loaders come back in a stable order regardless of how
        # the operating system lists the directory.
        json_paths = sorted(
            os.path.join(directory_path, file_name)
            for file_name in os.listdir(directory_path)
            if file_name.endswith('.json')
        )
        # One loader per file, each with jq_schema='.' and text_content=False.
        # Directories whose name merely ends in ".json" are skipped.
        return [
            JSONLoader(file_path=json_path, jq_schema='.', text_content=False)
            for json_path in json_paths
            if os.path.isfile(json_path)
        ]

    return DirectoryLoader(
        path=directory_path,
        glob=f"**/*{file_type}",
        loader_cls=loaders.get(file_type, UnstructuredFileLoader),
    )

In [19]:
def split_text(docs, chunk_size=512, chunk_overlap=64):
    """
    Breaks documents into overlapping chunks using RecursiveCharacterTextSplitter.

    Parameters:
            docs: The documents to split; passed unchanged to
                    RecursiveCharacterTextSplitter.split_documents.
            chunk_size (int): The maximum length of each chunk.
            chunk_overlap (int): The number of characters to overlap between chunks.

    Returns:
            The chunks produced by split_documents for the given documents.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)

In [20]:
def metadata_extractor(documents):
    """
    Annotates documents with properties extracted by DoctranPropertyExtractor.

    Three properties are requested for every document:
      * category - one of "code_block", "instructions" or "explanation"
      * malware  - the names of all malware mentioned in the document
      * eli5     - a plain-language explanation of the document

    Args:
        documents: The documents to annotate.

    Returns:
        The documents as returned by DoctranPropertyExtractor.transform_documents,
        or an empty list when there are no documents to annotate.
    """
    # Nothing to annotate: return before building the extractor.
    # NOTE(review): a recorded run of this notebook failed with a missing
    # OPENAI_API_KEY, presumably while constructing the extractor - confirm.
    if not documents:
        return []

    properties = [
        {
            "name": "category",
            "description": "What type of document this is.",
            "type": "string",
            "enum": ["code_block", "instructions", "explanation"],
            "required": True,
        },
        {
            "name": "malware",
            "description": "A list of all malware mentioned in this document.",
            "type": "array",
            "items": {
                "name": "computer_malware",
                "description": "The full name of the malware used",
                "type": "string",
            },
            "required": True,
        },
        {
            "name": "eli5",
            # The inputs are arbitrary cyber-data documents, not emails.
            "description": "Explain this document to me like I'm 5 years old.",
            "type": "string",
            "required": True,
        },
    ]

    property_extractor = DoctranPropertyExtractor(properties=properties)
    extracted_document = property_extractor.transform_documents(documents, properties=properties)
    return extracted_document

In [21]:
def load_documents(directory):
    """
    Loads the files of a directory, splits them into chunks and annotates the chunks.

    The file types present are discovered with get_file_types; files without an
    extension are skipped. For each remaining type the loader(s) from
    create_directory_loader are run and their documents are split with
    split_text. All chunks are finally passed through metadata_extractor.

    Parameters:
            directory (str): Path to the directory holding the files to load.

    Returns:
            The collected chunks as returned by metadata_extractor.
    """
    documents = []

    # Sorted because set iteration order is not stable between runs, which
    # would otherwise make the order of the returned chunks unpredictable.
    for file_type in sorted(get_file_types(directory)):
        if file_type.strip() == "":
            continue

        created = create_directory_loader(file_type, directory)
        # '.json' yields a list of per-file loaders, every other type a single
        # loader; normalise to a list so both are handled by the same loop.
        file_loaders = created if isinstance(created, list) else [created]

        for loader in file_loaders:
            chunks = split_text(loader.load())
            if chunks:
                documents.extend(chunks)

    return metadata_extractor(documents)

In [22]:
def create_knowledgeBase(directory, vectorstore):
    """
    Loads documents, embeds their chunks and stores the vectors in a FAISS vector store.

    When an index already exists in the store folder, the newly built index is
    merged into it and the combined index is saved; otherwise the new index is
    saved on its own. When no documents are found, nothing is embedded and the
    store is left unchanged.

    Parameters:
        directory (str): Directory containing the files to index.
        vectorstore (str | os.PathLike): Folder in which the FAISS index is
            stored. Any other value (e.g. None) falls back to the module-level
            DB_FAISS_PATH.
    """
    if isinstance(vectorstore, (str, os.PathLike)):
        store_path = os.fspath(vectorstore)
    else:
        store_path = DB_FAISS_PATH

    documents = load_documents(directory)
    if not documents:
        # FAISS.from_documents cannot build an index from zero documents.
        print(f"No documents found in {directory}; vector store left unchanged.")
        return

    os.system("ollama pull mxbai-embed-large")
    embeddings = OllamaEmbeddings(model="mxbai-embed-large", show_progress=True)
    new_vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

    if os.path.exists(os.path.join(store_path, 'index.faiss')):
        # NOTE(review): recent langchain_community releases may require
        # allow_dangerous_deserialization=True here - confirm against the
        # installed version.
        old_vectorstore = FAISS.load_local(store_path, embeddings)
        # Merge the freshly built index, not the folder path, into the old one.
        old_vectorstore.merge_from(new_vectorstore)
        old_vectorstore.save_local(store_path)
    else:
        new_vectorstore.save_local(store_path)

In [23]:
def move_files(directory, destination='../../processed_cyber_data/'):
    """
    Moves every entry of the unprocessed data directory into the processed data directory.

    Entries are moved with os.replace, so an entry of the same name that
    already exists in the destination is replaced.

    Parameters:
        directory (str): Directory whose entries are moved.
        destination (str): Directory the entries are moved into; it is created
            when it does not exist yet. Defaults to '../../processed_cyber_data/'.
    """
    os.makedirs(destination, exist_ok=True)
    # Snapshot the listing before moving anything out of the directory that is
    # being listed.
    for file_path in list(pathlib.Path(directory).iterdir()):
        file_name = os.path.basename(file_path)
        os.replace(file_path, os.path.join(destination, file_name))

In [24]:
# Entry point of the notebook: start the local ollama server, then load the
# unprocessed documents and print them.
if __name__=="__main__":
        setup_ollama()
        # Module-level settings. create_knowledgeBase refers to the global
        # DB_FAISS_PATH, so it must be defined before that function is called.
        DB_FAISS_PATH = '../vectorstore'
        DATA_PATH = '../../unprocessed_cyber_data'
        # Remaining pipeline steps, currently disabled:
        # txt_file_rename(DATA_PATH)
        # create_knowledgeBase(DATA_PATH, DB_FAISS_PATH)
        # move_files(DATA_PATH)
        # load_documents ends by calling metadata_extractor, which builds a
        # DoctranPropertyExtractor.
        # NOTE(review): the recorded output of this cell is a ValueError about a
        # missing OPENAI_API_KEY, presumably raised there - confirm.
        docs = load_documents(DATA_PATH)
        print(docs)

ValueError: Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter.

In [None]:
# from langchain_community.llms import Ollama
# from ragas.testset.generator import TestsetGenerator
# from ragas.testset.evolutions import simple, reasoning, multi_context

# os.system("ollama pull llama3")
# os.system("ollama pull jimscard/whiterabbit-neo")
# os.system("ollama pull mxbai-embed-large")
# generator_llm = Ollama(model="llama3")
# critic_llm = Ollama(model="jimscard/whiterabbit-neo")
# embeddings=OllamaEmbeddings(model="mxbai-embed-large", show_progress=True)

# generator = TestsetGenerator.from_langchain(
#     generator_llm,
#     critic_llm,
#     embeddings
# )

# DATA_PATH = '../../processed_cyber_data'
# documents = load_documents(DATA_PATH)
# testset = generator.generate_with_langchain_docs(documents, test_size=10, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
# print(testset)

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest 
pulling 6a0746a1ec1a... 100% ▕████████████████▏ 4.7 GB                         
pulling 4fa551d4f938... 100% ▕████████████████▏  12 KB                         
pulling 8ab4849b038c... 100% ▕████████████████▏  254 B                         
pulling 577073ffcc6c... 100% ▕████████████████▏  110 B                         
pulling 3f8eb4da87fa... 100% ▕████████████████▏  485 B                         
verifying sha256 digest 
writing manifest 
removing any unused layers 
success [?25h
[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest 
pulling cb1350311b4e... 100% ▕████████████████▏ 9.2 GB                         
pulling 7e0503625fed... 100% ▕████████████████▏ 2.9 KB                         
pulling 22a7b312010d... 100% ▕███

embedding nodes:   0%|          | 0/40408 [00:00<?, ?it/s]


[A

[A[A


[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A



[A[A[A[A






OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]

OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.54s/it]


OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]



OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]

[A




OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]






OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]


[A[A






OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.02it/s]




OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]



OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.90s/it]

OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.85s/it]


OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.86s/it]



OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.83s/it]

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  9.72it/s]

OllamaEmbeddings: 100%|███

KeyboardInterrupt: 

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 13.06it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  9.37it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 24.03it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  5.48it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 13.55it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.87it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.42it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 35.72it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  8.67it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.26it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 12.18it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.95it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 31.59it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  9.03it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 12.92it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [