In [1]:
!pip install langchain-community pinecone-client langchain-huggingface pinecone-text python-dotenv pinecone-client pinecone-text pinecone-notebooks



In [14]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone_text.sparse import BM25Encoder
from dotenv import load_dotenv
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
import re

load_dotenv()


api_key = os.getenv("PINECONE_API_KEY")
environment = os.getenv("ENVIRONMENT")
index_name = os.getenv("INDEX_NAME")
os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")
log_file_path = "secrcomp.log"


def preprocess_logs(file_path):
    with open(file_path, "r") as file:
        log_lines = file.readlines()

    log_pattern = r'(\d+\.\d+\.\d+\.\d+) - - \[(.*?)\] "(.*?) (.*?) HTTP.*" (\d{3}) (\d+) "(.*?)" "(.*?)" (\d+)'

    parsed_logs = []
    for line in log_lines:
        match = re.match(log_pattern, line)
        if match:
            parsed_logs.append(match.groups())

    df = pd.DataFrame(parsed_logs, columns=[
        "ip", "timestamp", "method", "endpoint", 
        "status_code", "response_size", "referer", 
        "user_agent", "response_time"
    ]).drop(columns=['referer'])

    # Cümleleri oluşturma
    sentences = df.apply(
        lambda row: f"Request from IP {row['ip']} on {row['timestamp']} using {row['method']} method to endpoint {row['endpoint']} resulted in status code {row['status_code']} with a response size of {row['response_size']} bytes, response time of {row['response_time']} ms, and user agent '{row['user_agent']}'.",
        axis=1
    ).tolist()

    return sentences

sentences = preprocess_logs(log_file_path)
# sentences = "\n".join(sentences)

In [15]:
pc = Pinecone(api_key=api_key)
print(index_name)
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(f"Yeni index: {index_name} oluşturuldu.")

index = pc.Index(index_name)

secrcomp_log


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': '536bf71f4244aa6f9c2db9238fc65d06', 'Date': 'Sun, 18 Aug 2024 16:48:21 GMT', 'Server': 'Google Frontend', 'Content-Length': '125', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Name must consist of lower case alphanumeric characters or '-'"},"status":400}


In [None]:
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

# BM25 Encoder oluşturma ve eğitmek
bm25_encoder = BM25Encoder().default()
bm25_encoder.fit(sentences)
bm25_encoder.dump("bm25_values.json")
bm25_encoder = BM25Encoder().load("bm25_values.json")



In [None]:

# Hibrit arama retriever'ı
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings, sparse_encoder=bm25_encoder, index=index
)

# Log verilerini eklemek
retriever.add_texts(sentences)

In [None]:

def generate_answer(text):
    # Soru ve metni birleştirme
    input_text = f"Question: {query} Context: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Yanıt oluşturma
    output_ids = model.generate(input_ids, max_length=150, num_beams=1, early_stopping=True)
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer

# T5 modeli yükleme
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# query = " How many IP addresses were sent POST requests to the endpoint ‘/usr/admin’?"
query = "At what time did IP address 203.119.43.90 send a POST request to endpoint /usr/admin ?"

retrieved_texts = retriever.invoke(query)

for doc in retrieved_texts:
    text = doc.page_content
    answer = generate_answer(text)
    print(f"Text: {text}")
    print(f"Soru: {query}\nCevap: {answer}\n")
