In [1]:
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from dotenv import get_key
import os
from math import ceil

In [2]:
# Source PDF to index. Change this single path to process another document
# (e.g. "./files/first_chapter.pdf" for a quicker run).
file_path = "./files/2024-dbir-data-breach-investigations-report.pdf"
# Derive the name used for the on-disk FAISS index from the path so the two
# values cannot drift apart when the input file is changed.
file_name = os.path.basename(file_path)
loader = PyPDFLoader(file_path)
# Load the PDF as a list of Document objects (split by the loader's defaults).
pages = loader.load_and_split()

In [3]:
# Expose the Gemini key from .env under the GOOGLE_API_KEY environment variable.
# get_key() returns None when the entry is missing, and assigning None to
# os.environ fails with an unhelpful TypeError, so fail early with a clear message.
gemini_api_key = get_key('.env', 'GEMINI_API_KEY')
if not gemini_api_key:
    raise RuntimeError("GEMINI_API_KEY is missing or empty in .env")
os.environ['GOOGLE_API_KEY'] = gemini_api_key

In [4]:
# Embedding client for the "models/embedding-001" model; it is passed to FAISS below
# both when building and when reloading the index.
# NOTE(review): presumably authenticates via the GOOGLE_API_KEY variable set in the previous cell — confirm.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [5]:
# Re-split the loaded pages into chunks targeting 1000 characters, with no overlap between chunks.
# NOTE(review): '.' is the only separator, so text is cut at periods only; a run longer than
# chunk_size that contains no period presumably stays as one oversized chunk — confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separators=['.'])
docs = text_splitter.split_documents(pages)

In [6]:
# Build the FAISS index in fixed-size batches and merge them into one store, then persist it.
# NOTE(review): batching is presumably there to stay within the embedding API's
# per-request limits — confirm the right batch size for the model in use.
BATCH_SIZE = 100  # number of chunks embedded per FAISS.from_documents call

if not docs:
    # Without this guard an empty chunk list would surface as an AttributeError
    # on None.save_local further down.
    raise ValueError("No document chunks to index; check the PDF path and splitter settings.")

db_local = None
for start in range(0, len(docs), BATCH_SIZE):
    # Slicing clamps at len(docs), so the final short batch needs no special case.
    batch_db = FAISS.from_documents(docs[start:start + BATCH_SIZE], embeddings)
    if db_local is None:
        # The first batch becomes the store that later batches are merged into.
        db_local = batch_db
    else:
        db_local.merge_from(batch_db)

# Persist under a folder named after the source file so it can be reloaded later.
db_local.save_local(f'./faiss_db/{file_name}')

In [7]:
# Reload the index saved under ./faiss_db/ so the following cells work from the persisted copy.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based loading; only load
# index folders produced by this notebook, never files from an untrusted source.
db = FAISS.load_local(f"./faiss_db/{file_name}", embeddings, allow_dangerous_deserialization=True)

In [8]:
# Display the number of vectors held by the loaded index (one per embedded chunk).
db.index.ntotal

287

In [9]:
# Share of the indexed chunks to retrieve for the summary. Despite the name this is a
# fraction, not a percentage: 0.5 means half of the index.
summarization_percentage = 0.5

In [10]:
# Number of chunks to retrieve: the configured share of the index size,
# rounded up so a fractional result still yields a whole chunk.
k = ceil(db.index.ntotal * summarization_percentage)
k

144

In [16]:
# Similarity-search retriever that returns the k chunks computed in the previous cell.
# Omitting search_kwargs would fall back to the library's default number of results.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

In [26]:
# Fetch the chunks most similar to the query "threats"; this query decides which
# parts of the report end up in the summary.
retrieved_docs = retriever.invoke("threats")

In [27]:
# Collect the raw text of every retrieved chunk, keeping retrieval order.
text = [chunk.page_content for chunk in retrieved_docs]

text

['18\nAction categories28\nHacking (hak):  attempts to \nintentionally access or harm \ninformation assets without (or \nexceeding) authorization by \ncircumventing or thwarting \nlogical security mechanisms.\nMalware (mal):  any malicious \nsoftware, script or code run on \na device that alters its state or \nfunction without the owner’s \ninformed consent.\nError (err): anything done  \n(or left undone) incorrectly  \nor inadvertently.\nSocial (soc):  employ deception, \nmanipulation, intimidation, etc., \nto exploit the human element, or \nusers, of information assets.\nMisuse (mis):  use of entrusted \norganizational resources or \nprivileges for any purpose or \nmanner contrary to that which \nwas intended.\nPhysical (phy): deliberate threats \nthat involve proximity, possession \nor force.\nEnvironmental (env): not only \nincludes natural events such \nas earthquakes and floods but \nalso hazards associated with \nthe immediate environment or \ninfrastructure in which assets  \na

In [28]:
# Sanity check: the number of collected chunks should match k.
len(text)

144

In [29]:
# Read the Gemini API key from the local .env file (the same entry exported as GOOGLE_API_KEY earlier).
api_key = get_key('.env', 'GEMINI_API_KEY')

In [30]:
# Configure the google.generativeai SDK with the key read from .env.
genai.configure(api_key=api_key)

In [31]:
# Handle to the 'gemini-pro' generative model used for the final rewrite step.
llm = genai.GenerativeModel(
    model_name='gemini-pro'
)

In [32]:
# Join the chunks with blank lines so the prompt contains the passages themselves.
# Interpolating the list directly would embed its Python repr (brackets, quotes and
# escaped newline sequences), which adds noise for the model.
joined_chunks = "\n\n".join(text)
response = llm.generate_content(
    f'Convert the following text chunks into a natural and human understandable paragraph \n\n {joined_chunks}'
)
print(response.text)

**Executive Summary**

**Key Findings**

* Ways-in analysis showed a substantial increase in attacks involving the exploitation of vulnerabilities as the critical path to initiate a breach.

* Roughly a third of all breaches involved Ransomware or Extortion techniques. Pure Extortion attacks have increased over the past year and now constitute 9% of all breaches.

* Social engineering remains a prevalent attack method, with Pretexting surpassing Phishing as the most common social action.

**Incident Classification Patterns**

* **System Intrusion:** The most frequent breach pattern, with Ransomware and Extortion continuing to drive its growth.

* **Social Engineering:** Social attacks, especially Pretexting and Phishing, continue to be successful, often leading to Ransomware deployment.

* **Basic Web Application Attacks:** Web applications remain a target for credential theft and other attacks due to vulnerabilities and poor security practices.

* **Miscellaneous Errors:** A significa