In [1]:
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import os
import getpass
import json 
import openai
from pymongo import MongoClient
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain import PromptTemplate

In [2]:
# Load settings from the nearest .env file into the process environment.
# The return value is intentionally not bound to `_`, which IPython uses to
# hold the previous cell output.
load_dotenv(find_dotenv())

# Azure OpenAI settings (chat deployment, embedding deployment, API version).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")
OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_EMBEDDING_MODEL_NAME")

# Connection string for the MongoDB-compatible cluster that stores the vectors.
MONGODB_ATLAS_CLUSTER_URI = os.getenv("MONGODB_ATLAS_CLUSTER_URI")

# Fail fast on missing configuration. os.getenv returns None for unset
# variables, and the cells below hand these values straight to the OpenAI and
# MongoDB clients; e.g. MongoClient(None) falls back to its default local host
# instead of reporting that the URI is missing.
_required_settings = {
    "OPENAI_API_KEY": OPENAI_API_KEY,
    "OPENAI_DEPLOYMENT_ENDPOINT": OPENAI_DEPLOYMENT_ENDPOINT,
    "OPENAI_DEPLOYMENT_NAME": OPENAI_DEPLOYMENT_NAME,
    "OPENAI_MODEL_NAME": OPENAI_MODEL_NAME,
    "OPENAI_DEPLOYMENT_VERSION": OPENAI_DEPLOYMENT_VERSION,
    "OPENAI_EMBEDDING_DEPLOYMENT_NAME": OPENAI_EMBEDDING_DEPLOYMENT_NAME,
    "OPENAI_EMBEDDING_MODEL_NAME": OPENAI_EMBEDDING_MODEL_NAME,
    "MONGODB_ATLAS_CLUSTER_URI": MONGODB_ATLAS_CLUSTER_URI,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# Configure the module-level OpenAI client for Azure OpenAI.
openai.api_type = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key = OPENAI_API_KEY

In [32]:
# Names of the database, collection and vector index used by this notebook.
db_name = "mydb"
collection_name = "vectors"
index_name = "vectorSearchIndex"

# Open the connection and grab a handle on the collection holding the vectors.
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)
collection = client[db_name][collection_name]

In [33]:
# Embedding client bound to the Azure OpenAI embedding deployment.
embedding_settings = dict(
    deployment=OPENAI_EMBEDDING_DEPLOYMENT_NAME,
    model=OPENAI_EMBEDDING_MODEL_NAME,
    openai_api_base=OPENAI_DEPLOYMENT_ENDPOINT,
    openai_api_type="azure",
    # NOTE(review): presumably 1 because the Azure embedding endpoint accepted
    # a single input per request -- confirm before raising it.
    chunk_size=1,
)
embeddings = OpenAIEmbeddings(**embedding_settings)

# LangChain vector-store wrapper over the same collection and index name.
# It is not used by the other cells visible in this section.
vector_store = MongoDBAtlasVectorSearch(collection, embeddings, index_name=index_name)

In [34]:
# List the indexes on the vectors collection (same handle as client[db_name][collection_name]).
collection.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)]},
 'vectorSearchIndex': {'v': 2,
  'key': [('vector', 'cosmosSearch')],
  'cosmosSearch': SON([('kind', 'vector-ivf'), ('numLists', 0), ('similarity', 'COS'), ('dimensions', 1536)])}}

In [35]:
# Question to answer, and its embedding vector used for the similarity search.
query = "What are the key features of the Snake malware and how does it work?"
embedded_query = embeddings.embed_query(query)

# Peek at the first five components only; the full vector is too long to display.
embedded_query[0:5]

[-0.0026670233346521854,
 0.008907011710107327,
 -0.005533861927688122,
 -0.013235209509730339,
 -0.028421156108379364]

In [36]:
# Similarity search between the embedded query and the ingested documents.

# Stage 1: vector search through the `cosmosSearch` operator.
search_stage = {
    "$search": {
        "cosmosSearch": {
            "vector": embedded_query,  # vector to search
            "path": "vector",          # document property holding the stored vector
            "k": 3,                    # number of results to return
        }
    }
}

# Stage 2: drop the id and the raw vector so only the document fields remain.
projection_stage = {
    "$project": {
        "_id": 0,
        "vector": 0,
    }
}

pipeline = [search_stage, projection_stage]

cursor = collection.aggregate(pipeline)
results = list(cursor)
print(results)

[{'documentName': 'joint_cybersecurity_advisory_snake_malware', 'content': '   International Partnership TLP:CLEAR Page 10 of 48  |  Product ID: AA23-129A TLP:CLEAREncrypted Registry Key Data Upon execution, Snake’s WerFault.exe will attempt to decrypt an encrypted blob within the Windows registry that is typically found at HKLM:\\SOFTWARE\\Classes\\.wav\\OpenWithProgIds. The encrypted data includes the AES key, IV, and path that is used to find and decrypt the file containing Snake’s kernel driver and kernel driver loader. The registry object’s structure can be seen on the right side of the following figure. Snake uses Microsoft Windows Cryptography API: Next Generation (CNG) key store to store the AES key needed to decrypt the registry object.12  Kernel Driver and Custom Loader Snake’s installer drops the kernel driver and a custom DLL which is used to load the driver into a single AES encrypted file on disk. Typically, this file is named “comadmin.dat” and is stored in the %windows%

In [37]:
# Chat model backed by the Azure OpenAI chat deployment configured above.
llm = AzureChatOpenAI(
    deployment_name=OPENAI_DEPLOYMENT_NAME,
    model=OPENAI_MODEL_NAME,
    openai_api_type="azure",
    openai_api_base=OPENAI_DEPLOYMENT_ENDPOINT,
    openai_api_version=OPENAI_DEPLOYMENT_VERSION,
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)

In [38]:
# Put the retrieved documents into the system prompt so the model answers from them.
system_instructions = f"Answer the user's question taking into consideration the following documents: {results}"

# System message carries the retrieved context; human message carries the question.
messages = [
    SystemMessage(content=system_instructions),
    HumanMessage(content=query),
]
response = llm(messages)
print(response.content)

The Snake malware has several key features, including the use of encrypted registry key data, a kernel driver and custom loader, and a covert store. Upon execution, Snake's WerFault.exe attempts to decrypt an encrypted blob within the Windows registry that includes the AES key, IV, and path used to find and decrypt the file containing Snake's kernel driver and kernel driver loader. The installer drops the kernel driver and a custom DLL into a single AES encrypted file on disk, typically named "comadmin.dat" and stored in the %windows%\\system32\\Com directory. The covert store is used to hide Snake's files and is detected by encrypting each possible initial filesystem byte sequence with CAST-128 using the key obtained from the registry and searching for any file with a size that is an even multiple of 220. Snake also uses a queue file with a predictable path and filename structure, in addition to being high entropy. Memory analysis is an effective approach to detecting Snake because it