In [1]:
!pip install langchain-community pinecone-client langchain-huggingface pinecone-text python-dotenv pinecone-client pinecone-text pinecone-notebooks



In [2]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone_text.sparse import BM25Encoder
from dotenv import load_dotenv
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
from dotenv import load_dotenv
import os
import re

# Load key/value pairs from a local .env file into the process environment.
load_dotenv()


# Connection settings read from the environment (each is None when the variable is missing).
api_key = os.getenv("PINECONE_API_KEY")
environment = os.getenv("ENVIRONMENT")
index_name = os.getenv("INDEX_NAME")

# os.environ only accepts str values, so re-exporting a missing token would raise
# TypeError. Only export HF_TOKEN when it is actually defined.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token

# Path of the access log that is parsed and indexed below.
log_file_path = "secrcomp.log"


def preprocess_logs(file_path):
    """Parse an access log and describe each matching entry as an English sentence.

    A line is accepted when it starts with: a dotted IPv4-style address, ``- -``,
    a bracketed timestamp, a quoted ``"<method> <endpoint> HTTP..."`` request,
    a 3-digit status code, a numeric response size, a quoted referer, a quoted
    user agent and a trailing numeric response time. Lines that do not match
    are skipped. The referer is parsed but not included in the sentence.

    Args:
        file_path: Path of the log file to read.

    Returns:
        A list of sentences (str), one per matching line, in file order.
        The list is empty when the file is empty or no line matches.

    Raises:
        OSError: If the file cannot be opened.
    """
    log_pattern = re.compile(
        r'(\d+\.\d+\.\d+\.\d+) - - \[(.*?)\] "(.*?) (.*?) HTTP.*" (\d{3}) (\d+) "(.*?)" "(.*?)" (\d+)'
    )

    sentences = []
    # Stream the file line by line instead of loading it whole. Undecodable bytes
    # are replaced so a single corrupt entry cannot abort the entire run.
    with open(file_path, "r", encoding="utf-8", errors="replace") as file:
        for line in file:
            match = log_pattern.match(line)
            if match is None:
                continue

            (
                ip,
                timestamp,
                method,
                endpoint,
                status_code,
                response_size,
                _referer,  # captured by the pattern but intentionally unused
                user_agent,
                response_time,
            ) = match.groups()

            # Build the sentence straight from the captured groups; no
            # intermediate DataFrame is needed just to format strings.
            sentences.append(
                f"Request from IP {ip} on {timestamp} using {method} method to endpoint {endpoint} resulted in status code {status_code} with a response size of {response_size} bytes, response time of {response_time} ms, and user agent '{user_agent}'."
            )

    return sentences

# Convert the raw access log into one descriptive sentence per parsed request.
sentences = preprocess_logs(log_file_path)
# Optional alternative: collapse the list into a single newline-separated string.
# sentences = "\n".join(sentences)

  from tqdm.autonotebook import tqdm


In [3]:
# Connect to Pinecone using the API key read from the environment.
pc = Pinecone(api_key=api_key)

# Create the serverless index only when it is not already present,
# so later runs reuse the existing one.
index_already_exists = index_name in pc.list_indexes().names()
if not index_already_exists:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="dotproduct",
        spec=serverless_spec,
    )
    print(f"Yeni index: {index_name} oluşturuldu.")

# Handle to the index, passed to the retriever in a later cell.
index = pc.Index(index_name)

Yeni index: secrcomp oluşturuldu.


In [4]:
# Dense embedding model. The commented line keeps an alternative checkpoint for reference.
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

# Create the BM25 sparse encoder and fit it on the log sentences.
# NOTE(review): `.default()` presumably loads pretrained parameters that `fit`
# then replaces — confirm whether a plain `BM25Encoder()` would be enough here.
bm25_encoder = BM25Encoder().default()
bm25_encoder.fit(sentences)
# Persist the fitted values to JSON, then load them back into a fresh encoder.
bm25_encoder.dump("bm25_values.json")
bm25_encoder = BM25Encoder().load("bm25_values.json")

  warn_deprecated(
100%|██████████| 50000/50000 [00:15<00:00, 3160.34it/s]


In [5]:

# Hybrid-search retriever combining the dense embeddings, the BM25 sparse
# encoder and the Pinecone index created above.
retriever = PineconeHybridSearchRetriever(
    index=index,
    sparse_encoder=bm25_encoder,
    embeddings=embeddings,
)

# Add the log sentences to the index through the retriever.
retriever.add_texts(sentences)

100%|██████████| 1563/1563 [12:18<00:00,  2.12it/s]


In [6]:

def generate_answer(text, question=None):
    """Generate an answer to a question from a single context passage.

    The prompt has the form ``"Question: <question> Context: <text>"``. It is
    encoded with the module-level ``tokenizer`` and passed to the module-level
    ``model``; both must be defined before this function is called.

    Args:
        text: Context passage the answer should be drawn from.
        question: Question to ask. When omitted (None), the module-level
            ``query`` variable is used, which matches the original behaviour.

    Returns:
        The decoded model output as a string, with special tokens removed.
    """
    if question is None:
        # Backward-compatible fallback for callers that rely on the global query.
        question = query

    # Combine the question and the context into a single prompt.
    input_text = f"Question: {question} Context: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate the answer. `early_stopping` is only used by beam search, so it
    # is not passed together with num_beams=1.
    output_ids = model.generate(input_ids, max_length=150, num_beams=1)
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer

# Load the T5 tokenizer and model used by generate_answer.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Alternative example question, kept for reference:
# query = " How many IP addresses were sent POST requests to the endpoint ‘/usr/admin’?"
query = "At what time did IP address 203.119.43.90 send a POST request to endpoint /usr/admin ?"

# Fetch the documents the hybrid retriever returns for the question.
retrieved_texts = retriever.invoke(query)

# Answer the question against each retrieved document separately and print
# the context, the question and the generated answer.
for doc in retrieved_texts:
    text = doc.page_content
    answer = generate_answer(text)
    print(f"Text: {text}")
    print(f"Soru: {query}\nCevap: {answer}\n")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Text: Request from IP 203.119.43.90 on 15/Jun/2023:12:51:57 +0300 using POST method to endpoint /usr/admin resulted in status code 303 with a response size of 4990 bytes, response time of 4475 ms, and user agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36'.
Soru: At what time did IP address 203.119.43.90 send a POST request to endpoint /usr/admin ?
Cevap: 15/Jun/2023:12:51:57 +0300

Text: Request from IP 132.121.190.166 on 03/Jan/2023:11:06:24 +0300 using POST method to endpoint /usr/admin resulted in status code 303 with a response size of 5019 bytes, response time of 1946 ms, and user agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36'.
Soru: At what time did IP address 203.119.43.90 send a POST request to endpoint /usr/admin ?
Cevap: 03/Jan/2023:11:06:24 +0300

Text: Request from IP 119.108.112.218 on 17/Aug/2023:08:18:12 +0300 