In [97]:
from langchain_community.document_loaders import DirectoryLoader, JSONLoader, PyPDFLoader, TextLoader, UnstructuredFileLoader, UnstructuredHTMLLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.document_transformers import DoctranPropertyExtractor
import os
import pathlib
import subprocess

In [98]:
def setup_ollama():
        """
        Downloads (if necessary) and runs ollama locally
        """
        # os.system("curl -fsSL https://ollama.com/install.sh | sh")
        # os.system("export OLLAMA_HOST=localhost:8888")
        os.system("sudo service ollama stop")
        cmd = "ollama serve"
        with open(os.devnull, 'wb') as devnull:
                process = subprocess.Popen(cmd, shell=True, stdout=devnull, stderr=devnull)

In [99]:
def txt_file_rename(directory):
    """
    Takes .txt files and renames them if they have a line containing title in them

    Args:
        directory (str): path to directory where files are stored
    """
    file_paths = pathlib.Path(directory).glob('*.txt')
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        file_ext = os.path.splitext(file_name)[1]
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                segments = line.split(':')
                if 'title' in segments[0].lower() and len(segments) >= 2:
                    name = segments[1].strip()
                    new_file_name = os.path.join(directory, name + file_ext)
                    try:
                        os.rename(file_path, new_file_name)
                        # print(f'Renamed {file_name} to {name}')
                    except FileNotFoundError:
                        print("", end='')
                        # print(f"FileNotFoundError: {file_path} not found.")
                    except PermissionError:
                        print("", end='')
                        # print("Permission denied: You don't have the necessary permissions to change the permissions of this file.")
                    except NotADirectoryError:
                        print("", end='')
                        # print(f"Not a directory: {new_file_name}")

In [100]:
def get_file_types(directory):
        """
        Traverses all of the files in specified directory and returns types of files that it finds

        Args:
            directory (str): Path to directory

        Returns:
            Set[str]: All of the file types that can be found in the directory
        """
        file_types = set()

        for filename in os.listdir(directory):
                if os.path.isfile(os.path.join(directory, filename)):
                        _, ext = os.path.splitext(filename)
                        file_types.add(ext)
        return file_types

In [101]:
# Specified loader for each type of file found in the cyber data directory (so far)
loaders = {
    '.php': UnstructuredFileLoader,
    '.cs': UnstructuredFileLoader,
    '': UnstructuredFileLoader,
    '.c': UnstructuredFileLoader,
    '.html': UnstructuredHTMLLoader,
    '.md': UnstructuredMarkdownLoader,
    '.tzt': UnstructuredFileLoader,
    '.java': UnstructuredFileLoader,
    '.txt': TextLoader,
    '.ps1': UnstructuredFileLoader,
    '.delphi': UnstructuredFileLoader,
    '.asm': UnstructuredFileLoader,
    '.TXT': TextLoader,
    '.json': JSONLoader,
    '.pdf': PyPDFLoader
}

In [102]:
def create_directory_loader(file_type, directory_path):
        """
        Creates and returns a DirectoryLoader using the loader specific to the file type provided
        
        Args:
            file_type (str): Type of file to make loader for
            directory_path (str): Path to directory

        Returns:
            DirectoryLoader: loader for the files in the directory provided
        """
        if file_type == '.json':
            loader_list = []
            for file_name in [file for file in os.listdir(directory_path) if file.endswith('.json')]:
                loader_list.append(JSONLoader(file_path=directory_path+'/'+file_name,jq_schema='.', text_content=False))
            return loader_list
        else:
            return DirectoryLoader(
            path=directory_path,
            glob=f"**/*{file_type}",
            loader_cls=loaders.get(file_type, UnstructuredFileLoader))

In [103]:
def split_text(docs, chunk_size=512, chunk_overlap=64):
        """
        Splits the given text into chunks of a specified maximum length using RecursiveCharacterTextSplitter.
        
        Parameters:
                text (str): The input text to be split.
                max_length (int): The maximum length of each chunk.
                chunk_overlap (int): The number of characters to overlap between chunks.
                
        Returns:
                List[str]: A list of text chunks.
        """
        splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap
        )
        return splitter.split_documents(docs)

In [104]:
# def metadata_extractor(documents):
#     properties = [
#     {
#         "name": "category",
#         "description": "What type of document this is.",
#         "type": "string",
#         "enum": ["code_block", "instructions", "explanation"],
#         "required": True,
#     },
#     {
#         "name": "malware",
#         "description": "A list of all malware mentioned in this document.",
#         "type": "array",
#         "items": {
#             "name": "computer_malware",
#             "description": "The full name of the malware used",
#             "type": "string",
#         },
#         "required": True,
#     },
#     {
#         "name": "eli5",
#         "description": "Explain this email to me like I'm 5 years old.",
#         "type": "string",
#         "required": True,
#     },
# ]
    
#     property_extractor = DoctranPropertyExtractor(properties=properties)
#     extracted_document = property_extractor.transform_documents(documents, properties=properties)
#     return extracted_document

In [105]:
def chunk_numberer(docs):
    num = 1
    source = docs[0].metadata['source']
    for doc in docs:
        if source != doc.metadata['source']:
            num = 1
            source = doc.metadata['source']
        doc.metadata['chunk_no'] = num
        num += 1
    return docs

In [106]:
def document_id(docs):
    for doc in docs:
        source = os.path.basename(doc.metadata['source'])
        chunk_no = doc.metadata['chunk_no']
        doc.metadata['id'] = f"{source}-{chunk_no}"
    return docs

In [107]:
def load_documents(directory):
        """
        Loads in files from ../data directory and returns them
        
        Parameters:
                directory (str): The input text to be split.
        
        Returns:
                List[Document]: Array of documents
        """
        file_types = get_file_types(directory)
        documents = []
        
        for file_type in file_types:
                if file_type.strip() != "":
                        if file_type == '.json':
                                loader_list = create_directory_loader(file_type, directory)
                                for loader in loader_list:
                                        docs = loader.load()
                                        chunks = split_text(docs)
                                        if chunks != None and chunks != "" and len(chunks) > 0:
                                                documents.extend(chunks)
                        else:        
                                loader = create_directory_loader(file_type, directory)
                                docs = loader.load()
                                chunks = split_text(docs)
                                if chunks != None and chunks != "" and len(chunks) > 0:
                                        documents.extend(chunks)
        
        documents = chunk_numberer(documents)
        return document_id(documents)

In [108]:
def get_file_names(directory):
    file_names = []
    for file_name in os.listdir(directory):
        file_names.append(file_name)
    return file_names

In [109]:
def delete_IDs(directory, vectorstore):
    IDs = []
    file_names = get_file_names(directory)
    for key, val in vectorstore.docstore._dict.items():
        for file_name in file_names:
            if file_name in val.metadata['id'].split('-')[0]:
                IDs.append(key)
    vectorstore.delete(IDs)
    return vectorstore

In [110]:
def create_knowledgeBase(directory, vector_path):
    """
    Loads in documents, splits into chunks, and vectorizes chunks and stores vectors under FAISS vector store
    
    Parameters:
        directory (str): The input text to be split.
        vectorstore (FAISS): vector store containing vectors of documents
    """
    documents = load_documents(directory)
    os.system("ollama pull mxbai-embed-large")
    embeddings=OllamaEmbeddings(model="mxbai-embed-large", show_progress=True)
    if len(documents) > 0:
        if os.path.exists(vector_path + '/index.faiss'):
            vectorstore = FAISS.load_local(vector_path, embeddings, allow_dangerous_deserialization=True)
            vectorstore = delete_IDs(directory, vectorstore)
            vectorstore.add_documents(documents)
            vectorstore.save_local(vector_path)
            return vectorstore
        else:
            vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)
            vectorstore.save_local(vector_path)
            return vectorstore

In [111]:
def move_files(directory):
    """
    Moves files from unprocessed data directory to processed data directory
    
    Parameters:
        directory (str): The input text to be split.
    """
    file_paths = pathlib.Path(directory).iterdir()
    for file_path in file_paths:
        new_path = '../../processed_cyber_data/'
        file_name = os.path.basename(file_path)
        new_path += file_name
        os.replace(file_path, new_path)

In [114]:
if __name__=="__main__":
        setup_ollama()
        DB_FAISS_PATH = '../test_vectorstore'
        DATA_PATH = '../test'
        txt_file_rename(DATA_PATH)
        vectorstore = create_knowledgeBase(DATA_PATH, DB_FAISS_PATH)

[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest 
pulling 819c2adf5ce6... 100% ▕████████████████▏ 669 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling b837481ff855... 100% ▕████████████████▏   16 B                         
pulling 38badd946f91... 100% ▕████████████████▏  408 B                         
verifying sha256 digest 
writing manifest 
removing any unused layers 
success [?25h
OllamaEmbeddings: 100%|██████████| 587/587 [00:56<00:00, 10.46it/s]


In [113]:
print(vectorstore.index_to_docstore_id)

{0: 'f7c58d51-df10-41b9-8493-308958bb7486', 1: 'a165640e-82aa-4e7a-a789-f7274bd71087', 2: 'f6ab3fa9-f5a8-40ee-8476-72a47e68ccc0', 3: '29c5c343-e7ac-4e7a-bde0-109249145966', 4: 'a815e8d8-2655-44b5-8fc0-499bb66f4e5f', 5: '9ea0fef8-bf67-44b0-b664-16a5bec4c884', 6: 'a0305693-7bfb-4590-84fd-9ebf783035bb', 7: '0b08b740-0d82-4bd1-9b85-450ba0780bfb', 8: 'b146fbe5-3440-44aa-8920-1eae13006249', 9: '4dcddff2-7105-4419-8c77-4ef64f3d95a5', 10: 'b5936987-7350-4a7f-ab14-67e391e54ef7', 11: 'dd79cb45-1906-4cf0-9056-3eb13543a678', 12: '7f17eac6-37d2-440d-bea1-46e3441c0d9c', 13: '4052f4b2-a56f-4b6a-8447-cbf23c35048d', 14: '74c603ab-a097-4b39-b9da-5adab0bdcee8', 15: '295f27fc-d7ac-404f-a4fd-da93a37f8441', 16: '79144bd3-c7ce-4fe6-a89f-b45d514667ea', 17: 'd777677f-e5cc-4863-a40c-d0ad97787c4b', 18: '648d9213-d421-4776-9b6c-ff7dde6b3160', 19: 'e2554970-77e2-42b1-8b46-8c1bbb13246d', 20: 'd40045c6-e49a-49ca-aed7-5a4ad89fc570', 21: '22dbb9fe-12de-4338-91f8-57a037568c8a', 22: 'b83242eb-970e-4e6d-a8bd-1a7daac6af2d

In [115]:
print(vectorstore.index_to_docstore_id)

AttributeError: 'NoneType' object has no attribute 'index_to_docstore_id'

In [None]:
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings.fake import FakeEmbeddings
from langchain.docstore import InMemoryDocstore
from langchain.docstore.document import Document
import faiss
# embedding_size = 1536
# index = faiss.IndexFlatL2(embedding_size)
# embedding_fn = FakeEmbeddings(size=embedding_size).embed_query
docstore = InMemoryDocstore({})
# vectorstore = FAISS(embedding_fn, index, docstore, {})
documents = load_documents(DATA_PATH)
# vectorstore.add_documents(documents)
index_to_docstore_id = vectorstore.index_to_docstore_id
for i in range(len(documents)):
    print(docstore._dict[index_to_docstore_id[i]])

In [None]:
def document_id(docs):
    for doc in docs:
        source = os.path.basename(doc.metadata['source'])
        chunk_no = doc.metadata['chunk_no']
        doc.metadata['id'] = f"{source}-{chunk_no}"
    return docs

documents = document_id(documents)
print(documents)

In [None]:
# from langchain_community.llms import Ollama
# from ragas.testset.generator import TestsetGenerator
# from ragas.testset.evolutions import simple, reasoning, multi_context

# os.system("ollama pull llama3")
# os.system("ollama pull jimscard/whiterabbit-neo")
# os.system("ollama pull mxbai-embed-large")
# generator_llm = Ollama(model="llama3")
# critic_llm = Ollama(model="jimscard/whiterabbit-neo")
# embeddings=OllamaEmbeddings(model="mxbai-embed-large", show_progress=True)

# generator = TestsetGenerator.from_langchain(
#     generator_llm,
#     critic_llm,
#     embeddings
# )

# DATA_PATH = '../../processed_cyber_data'
# documents = load_documents(DATA_PATH)
# testset = generator.generate_with_langchain_docs(documents, test_size=10, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
# print(testset)