In [1]:
# Kütüphanelerin Yüklenmesi
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
import openai
import os

In [2]:
# Verinin Okunması
df = pd.read_csv('web_traffic_log.csv')
df

Unnamed: 0,IP_Address,Timestamp,HTTP_Method,Path,Status_Code,Response_Size
0,172.16.0.1,14/Aug/2024:19:11:48 +0000,PUT,/services.html,500,3058.0
1,172.16.0.1,14/Aug/2024:19:23:04 +0000,GET,/services.html,302,3856.0
2,172.16.0.2,14/Aug/2024:19:16:53 +0000,DELETE,/products.html,500,2671.0
3,192.168.1.2,14/Aug/2024:19:35:59 +0000,PUT,/contact.html,302,4207.0
4,192.168.1.1,14/Aug/2024:20:02:29 +0000,POST,/contact.html,302,937.0
...,...,...,...,...,...,...
1995,172.16.0.1,14/Aug/2024:19:34:19 +0000,PUT,/products.html,404,1677.0
1996,192.168.1.2,14/Aug/2024:19:09:32 +0000,DELETE,/contact.html,302,4802.0
1997,10.0.0.1,14/Aug/2024:20:03:20 +0000,DELETE,/about.html,404,4647.0
1998,10.0.0.1,14/Aug/2024:19:47:15 +0000,GET,/index.html,404,3788.0


In [3]:
df.info()  # Eksik verilerin tespitini ve veriler hakkında genel bilgi verir


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   IP_Address     2000 non-null   object 
 1   Timestamp      1992 non-null   object 
 2   HTTP_Method    1994 non-null   object 
 3   Path           2000 non-null   object 
 4   Status_Code    2000 non-null   int64  
 5   Response_Size  1991 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 93.9+ KB


In [4]:
df.describe()

Unnamed: 0,Status_Code,Response_Size
count,2000.0,1991.0
mean,344.615,2586.296836
std,112.549018,1371.962637
min,200.0,205.0
25%,200.0,1425.5
50%,302.0,2558.0
75%,404.0,3767.5
max,500.0,4998.0


In [5]:
df[df.isnull().any(axis=1)]


Unnamed: 0,IP_Address,Timestamp,HTTP_Method,Path,Status_Code,Response_Size
128,10.0.0.1,14/Aug/2024:19:55:53 +0000,PUT,/products.html,500,
219,172.16.0.1,14/Aug/2024:19:59:21 +0000,DELETE,/products.html,302,
231,10.0.0.2,,GET,/about.html,404,2326.0
297,192.168.1.1,14/Aug/2024:19:15:04 +0000,PUT,/services.html,404,
438,10.0.0.1,,GET,/services.html,200,390.0
460,192.168.1.1,,PUT,/contact.html,302,4354.0
472,10.0.0.2,,GET,/about.html,200,2286.0
473,10.0.0.1,14/Aug/2024:19:33:26 +0000,,/products.html,302,663.0
645,172.16.0.1,14/Aug/2024:19:32:38 +0000,,/products.html,500,
784,10.0.0.2,14/Aug/2024:19:22:05 +0000,PUT,/services.html,404,


In [6]:
df = df.dropna(subset=['Timestamp'])
df = df.dropna(subset=['HTTP_Method'])

In [7]:
df['Response_Size'] = df['Response_Size'].fillna(df['Response_Size'].mean())

In [8]:
df.info()  # Veri setimizde artık NaN değerler bulunmamakta

<class 'pandas.core.frame.DataFrame'>
Index: 1986 entries, 0 to 1999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   IP_Address     1986 non-null   object 
 1   Timestamp      1986 non-null   object 
 2   HTTP_Method    1986 non-null   object 
 3   Path           1986 non-null   object 
 4   Status_Code    1986 non-null   int64  
 5   Response_Size  1986 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 108.6+ KB


In [9]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%b/%Y:%H:%M:%S %z')
df['Timestamp_unix'] = df['Timestamp'].view('int64') / 10**9  # Unix zamanı saniye cinsinden

In [10]:
df = pd.get_dummies(df, columns=['HTTP_Method'])

In [11]:
label_encoder = LabelEncoder()
df['Path_encoded'] = label_encoder.fit_transform(df['Path'])
df['IP_Address_encoded'] = label_encoder.fit_transform(df['IP_Address'])

In [12]:
# TF-IDF Vectorizer oluşturma ve fit etme
text_data = df['Path'].astype(str)  # Path sütununu metin verisi olarak kullan
vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(text_data).toarray().astype(np.float32)

In [13]:
additional_features = df[['IP_Address_encoded', 'Timestamp_unix', 'Path_encoded', 'Status_Code', 'Response_Size',
                          'HTTP_Method_DELETE', 'HTTP_Method_GET', 'HTTP_Method_POST', 'HTTP_Method_PUT']].values.astype(np.float32)

In [14]:
additional_features.shape

(1986, 9)

In [15]:
vectors = np.hstack([tfidf_vectors, additional_features])

In [16]:
print(vectors.shape)

(1986, 15)


In [17]:
scaler = StandardScaler()
vectors_scaled = scaler.fit_transform(vectors)
dimension = vectors_scaled.shape[1]


In [18]:
print(dimension)

15


In [19]:
dimension

15

In [20]:
index = faiss.IndexFlatL2(dimension)
index.add(vectors_scaled)

In [21]:
print("Index size: ", index.ntotal)

Index size:  1986


In [22]:

# Eğitim veri setinizin özellik sayısını elde edin
feature_count = vectors.shape[1]

# Örnek sorgu verisi
query_vector = np.array([[1, 1691768400, 0, 200, 1024, 0, 1, 0, 0]], dtype=np.float32)

# Eğer sorgu vektörünüzün özellik sayısı eğitim veri setinizle uyumsuzsa, ek boş özellikler ekleyin
if query_vector.shape[1] < feature_count:
    # Eksik özellikleri sıfırlarla doldurun
    additional_features = np.zeros((query_vector.shape[0], feature_count - query_vector.shape[1]), dtype=np.float32)
    query_vector = np.hstack([query_vector, additional_features])

# Özellik sayısını doğrulama
print(f"Query vector features: {query_vector.shape[1]}")
print(f"Expected features: {feature_count}")

# Verileri standardize etme
query_vector_scaled = scaler.transform(query_vector)

print("Scaled Query Vector:", query_vector_scaled)


Query vector features: 15
Expected features: 15
Scaled Query Vector: [[ 2.2274029e+00  4.5085476e+09 -1.4385251e+02  5.3422192e+02
   2.7372822e+03 -5.0534439e-01 -8.6418039e-01 -1.6637075e+06
  -1.4347570e+00 -3.0641210e+00 -1.8922848e+00 -5.9402108e-01
  -5.7076061e-01 -5.7153594e-01 -5.7308650e-01]]


In [23]:
distances, indices = index.search(query_vector_scaled, k=5)

In [24]:
print("Distances:", distances)
print("Indices:", indices)

Distances: [[2.0327004e+19 2.0327004e+19 2.0327004e+19 2.0327004e+19 2.0327004e+19]]
Indices: [[0 1 2 3 4]]


In [25]:
openai.api_key = ''

In [26]:
def prepare_query_vector(query, vectorizer, scaler, target_dim):
    tfidf_vector = vectorizer.transform([query]).toarray().astype(np.float32)
    num_additional_features = target_dim - tfidf_vector.shape[1]
    other_features_vector = np.zeros((1, num_additional_features), dtype=np.float32)
    query_vector = np.hstack([tfidf_vector, other_features_vector])
    query_vector_scaled = scaler.transform(query_vector)
    return query_vector_scaled

In [27]:
def retrieve_similar_docs(query_vector, index, k=5):
    # Sorgu vektörünü uygun formata getirin
    query_vector = np.array(query_vector, dtype=np.float32).reshape(1, -1)
    
    # En yakın k komşuyu bul
    distances, indices = index.search(query_vector, k)
    
    return distances[0], indices[0]


In [28]:
def rag_model(query, vectorizer, vectors, index, df, scaler, k=5):
  
    # Sorgu vektörü
    query_vector = prepare_query_vector(query, vectorizer, scaler, vectors.shape[1])
    
    # Benzer belgeler
    distances, indices = retrieve_similar_docs(query_vector, index, k)
    
    # Bağlam
    context = " ".join([str(df.iloc[i].to_dict()) for i in indices])
    
    # Yanıt
    answer = generate_answer(context, query)
    
    return answer


In [29]:
def generate_answer(context, query):
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant who searches for the correct answers to the questions asked in the database, finds them and answers them."},
            {"role": "user", "content": prompt}
        ]
    )
    
    return response.choices[0].message['content'].strip()


In [30]:
print(query_vector.shape)

(1, 15)


In [31]:
#pip install openai==0.28

In [32]:
query = "df de toplam Kaç satır veri var?"
answer = rag_model(query, vectorizer, vectors, index, df, scaler)
print(answer)

Toplamda 5 satır veri bulunmaktadır.


In [33]:
query = "En az kullanılan HTTP yöntemi nedir?"
answer = rag_model(query, vectorizer, vectors, index, df, scaler)
print(answer)

En az kullanılan HTTP yöntemi "DELETE" yöntemidir. Bu verilere göre DELETE yöntemi sadece bir kez kullanılmıştır.


In [34]:
query = "Yanıt boyutu 2500.0 ile 5000.0 arasinda olan isteklerden 3 tanesinin IP adreslerini, yollarını ve boyutlarini listele."
answer = rag_model(query, vectorizer, vectors, index, df, scaler)
print(answer)

1. IP Address: 172.16.0.1, Path: '/contact.html', Response Size: 4579.0
2. IP Address: 172.16.0.2, Path: '/contact.html', Response Size: 2923.0


In [35]:
query = "Data frame'deki En Yüksek response size nedir?"
answer = rag_model(query, vectorizer, vectors, index, df, scaler)
print(answer)

En yüksek response size, 4579.0 olarak ölçülmüştür.


In [36]:
query = "Response size 2422.0 olan kullanicinin IP adresi nedir"
answer = rag_model(query, vectorizer, vectors, index, df, scaler)
print(answer)

IP adresi, '10.0.0.2' olan kullanıcıdır.


In [37]:
query = "Delete metodunu kullanip about.html adresine erisen kullanicilarin IP adresleri nedir"
answer = rag_model(query, vectorizer, vectors, index, df, scaler)
print(answer)

Delete metodu kullanılarak about.html adresine erişen kullanıcıların IP adresleri '172.16.0.1' adresine sahiptir.


In [38]:
query = "data framedeki 302 koduna sahip contact.html adresine erisen kullanıcı veya kullanicilar hangi islemleri yapmistir, IP Adresleri ve Respose size nedir?"
answer = rag_model(query, vectorizer, vectors, index, df, scaler)
print(answer)

Kod 302'ye sahip olan '/contact.html' adresine erişen kullanıcı veya kullanıcılar HTTP GET methodunu kullanmışlardır. IP Adresleri ve Response Size değerleri ise aşağıdaki gibidir:

1. IP Adresi: 10.0.0.2, Response Size: 2422.0
2. IP Adresi: 172.16.0.2, Response Size: 2923.0


In [39]:
# Sorgular bu tip fonksiyonlar üzerinden de yapılabilir.
''' 
def query_function():
    while True:
        query = input("Sorunuzu girin (çıkmak için 'exit' yazın): ")
        if query.lower() == 'exit':
            print("Çıkış Yapıldı.")
            break
        answer = rag_model(query, vectorizer, vectors, index, df, scaler)
        print(f"Cevap: {answer}")

query_function()

'''


' \ndef query_function():\n    while True:\n        query = input("Sorunuzu girin (çıkmak için \'exit\' yazın): ")\n        if query.lower() == \'exit\':\n            print("Çıkış Yapıldı.")\n            break\n        answer = rag_model(query, vectorizer, vectors, index, df, scaler)\n        print(f"Cevap: {answer}")\n\nquery_function()\n\n'