# PCA of Web Log

In [20]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("web_logs.csv")
df.head()

Unnamed: 0,timestamp,ip_address,user_agent,status_code,request_method,url,response_time,log_level
0,2025-02-13 06:58:04.072836,75.104.45.87,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7),401,DELETE,/admin,1522.63,INFO
1,2025-02-13 07:07:26.072836,168.5.11.122,Mozilla/5.0 (Windows NT 10.0; Win64; x64),403,GET,/admin,302.51,INFO
2,2025-02-13 07:09:44.072836,112.166.232.25,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7),502,POST,/login,1871.98,INFO
3,2025-02-13 07:10:13.072836,220.145.129.191,Mozilla/5.0 (Linux; Android 11),200,POST,/home,2746.75,INFO
4,2025-02-13 07:19:48.072836,41.150.116.145,Mozilla/5.0 (Windows NT 10.0; Win64; x64),201,DELETE,/search?q=test,1491.42,INFO


In [21]:
df['timestamp'] = pd.to_datetime(df['timestamp']).astype(int) / 10**9  # Convert to seconds

label_encoders = {}
categorical_features = ['ip_address', 'user_agent', 'request_method', 'url', 'log_level']

for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

Unnamed: 0,timestamp,ip_address,user_agent,status_code,request_method,url,response_time,log_level
0,1739430000.0,13440,1,401,0,0,1522.63,2
1,1739430000.0,4605,2,403,1,0,302.51,2
2,1739431000.0,1005,1,502,2,4,1871.98,2
3,1739431000.0,7947,0,200,2,3,2746.75,2
4,1739431000.0,11332,2,201,0,7,1491.42,2


In [22]:
# import pandas as pd
# from sklearn.preprocessing import OneHotEncoder

# # Convert timestamp to seconds
# df['timestamp'] = pd.to_datetime(df['timestamp']).astype(int) / 10**9

# # Define categorical features
# categorical_features = ['ip_address', 'user_agent', 'request_method', 'url', 'log_level']

# # Apply One-Hot Encoding
# encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid multicollinearity
# encoded_data = encoder.fit_transform(df[categorical_features])

# # Convert encoded array to DataFrame
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_features))

# # Drop original categorical columns and merge encoded data
# df = df.drop(columns=categorical_features)
# df = pd.concat([df, encoded_df], axis=1)

# df.head()


In [23]:
features = ['ip_address', 'user_agent', 'request_method', 'url', 'log_level', 'response_time', 'status_code']
X = df[features]

In [24]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()

# X_scaled = scaler.fit_transform(features)

In [25]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 3)

X_pca = pca.fit_transform(X)

In [26]:
X_pca = pd.DataFrame(pca.fit_transform(df[features]), columns=[f'PCA_{i+1}' for i in range(pca.n_components_)])
X_pca.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3
0,5940.515518,7.182028,17.984975
1,-2896.09278,-1201.234866,20.019394
2,-6494.006256,372.972567,119.653641
3,449.132783,1238.637119,-182.398828
4,3832.467098,-21.175581,-181.927397


In [27]:
X_pca.to_csv('WebLogReduced.csv', index = False)

In [28]:
pca.explained_variance_ratio_

array([9.61122963e-01, 3.82826884e-02, 5.93818521e-04])

In [29]:
from sklearn.ensemble import IsolationForest

X = X_pca[['PCA_1', 'PCA_2', 'PCA_3']]
iso_forest = IsolationForest(contamination=0.10, random_state=42)
X_pca['anomaly'] = iso_forest.fit_predict(X)

In [30]:
X_pca.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3,anomaly
0,5940.515518,7.182028,17.984975,1
1,-2896.09278,-1201.234866,20.019394,1
2,-6494.006256,372.972567,119.653641,1
3,449.132783,1238.637119,-182.398828,1
4,3832.467098,-21.175581,-181.927397,1


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Select features and target
X = X_pca[['PCA_1', 'PCA_2', 'PCA_3']]
y = X_pca['anomaly']

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = knn.predict(X_test_scaled)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.97
Classification Report:
               precision    recall  f1-score   support

          -1       0.88      0.87      0.87       304
           1       0.99      0.99      0.99      2696

    accuracy                           0.97      3000
   macro avg       0.93      0.93      0.93      3000
weighted avg       0.97      0.97      0.97      3000



In [32]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score


cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 264   40]
 [  37 2659]]


In [33]:
precision = precision_score(y_test, y_pred)

print(precision)

0.9851796961837718


In [34]:
recall = recall_score(y_test, y_pred)

print(recall)

0.9862759643916914


# PCA of Access Log

In [42]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


In [43]:
df1 = pd.read_csv('accesslog.csv')
df1.head()

Unnamed: 0,ip,ident,user,timestamp,request,status,size
0,1.202.218.8,-,-,2012-06-20T19:05:12+02:00,GET /robots.txt HTTP/1.0,404,492
1,208.115.113.91,-,-,2012-06-20T19:20:16+02:00,GET /logs/?C=M;O=D HTTP/1.1,200,1278
2,123.125.71.20,-,-,2012-06-20T19:30:40+02:00,GET / HTTP/1.1,200,912
3,220.181.108.101,-,-,2012-06-20T19:31:01+02:00,GET / HTTP/1.1,200,912
4,123.125.68.79,-,-,2012-06-20T19:53:24+02:00,GET / HTTP/1.1,200,625


In [44]:
df1 = df1.drop(['ident', 'user'], axis = 1)
df1.head()

Unnamed: 0,ip,timestamp,request,status,size
0,1.202.218.8,2012-06-20T19:05:12+02:00,GET /robots.txt HTTP/1.0,404,492
1,208.115.113.91,2012-06-20T19:20:16+02:00,GET /logs/?C=M;O=D HTTP/1.1,200,1278
2,123.125.71.20,2012-06-20T19:30:40+02:00,GET / HTTP/1.1,200,912
3,220.181.108.101,2012-06-20T19:31:01+02:00,GET / HTTP/1.1,200,912
4,123.125.68.79,2012-06-20T19:53:24+02:00,GET / HTTP/1.1,200,625


In [45]:
df1['timestamp'] = pd.to_datetime(df1['timestamp']).astype(int)/10**9

In [46]:
label_encoders = {}
categorical_features = ['ip','timestamp','request']

for col in categorical_features:
  le = LabelEncoder()
  df1[col] = le.fit_transform(df1[col])
  label_encoders[col] = le

df1.head()


Unnamed: 0,ip,timestamp,request,status,size
0,0,0,110,404,492
1,265,1,84,200,1278
2,59,2,1,200,912
3,297,3,1,200,912
4,42,4,1,200,625


In [51]:
features = ['ip','timestamp','request','status','size']
X = df1[features]

In [52]:
pca = PCA(n_components = 3)
X_pca = pca.fit_transform(X)

In [54]:
X_pca = pd.DataFrame(pca.fit_transform(X), columns=[f'PCA_{i+1}' for i in range(pca.n_components_)])
X_pca.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3
0,-12420.072303,-870.987114,-280.32579
1,-11634.015661,-874.202165,20.837294
2,-12000.023958,-872.643721,-175.736762
3,-12000.029827,-871.992279,57.128731
4,-12287.019499,-870.123014,-192.352824


In [55]:
X_pca.to_csv('AccessLogReduced.csv', index = False)

In [56]:
pca.explained_variance_ratio_

array([9.99877175e-01, 1.09547671e-04, 9.01541815e-06])

In [57]:
X = X_pca[['PCA_1', 'PCA_2', 'PCA_3']]
iso_forest = IsolationForest(contamination=0.10, random_state=42)
X_pca['anomaly'] = iso_forest.fit_predict(X)

In [58]:
X_pca.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3,anomaly
0,-12420.072303,-870.987114,-280.32579,-1
1,-11634.015661,-874.202165,20.837294,-1
2,-12000.023958,-872.643721,-175.736762,-1
3,-12000.029827,-871.992279,57.128731,1
4,-12287.019499,-870.123014,-192.352824,-1


In [59]:
# Select features and target
X = X_pca[['PCA_1', 'PCA_2', 'PCA_3']]
y = X_pca['anomaly']

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = knn.predict(X_test_scaled)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.99


In [60]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 31   3]
 [  1 386]]


In [61]:
precision = precision_score(y_test, y_pred)

print(precision)

0.9922879177377892


In [62]:
recall = recall_score(y_test, y_pred)

print(recall)

0.9974160206718347


# PCA for Window Log

In [81]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


In [82]:
df2 = pd.read_csv('Windows_2k.log_structured.csv')
df2.head()

Unnamed: 0,LineId,Date,Time,Level,Component,Content,EventId,EventTemplate
0,1,2016-09-28,04:30:30,Info,CBS,Loaded Servicing Stack v6.1.7601.23505 with Co...,E23,Loaded Servicing Stack <*> with Core: <*>\cbsc...
1,2,2016-09-28,04:30:31,Info,CSI,00000001@2016/9/27:20:30:31.455 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...
2,3,2016-09-28,04:30:31,Info,CSI,00000002@2016/9/27:20:30:31.458 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...
3,4,2016-09-28,04:30:31,Info,CSI,00000003@2016/9/27:20:30:31.458 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...
4,5,2016-09-28,04:30:31,Info,CBS,Ending TrustedInstaller initialization.,E17,Ending TrustedInstaller initialization.


In [83]:
df2['TimeStamp'] = pd.to_datetime(df2['Date']).astype(str)+' '+df2['Time'].astype(str)

df2['TimeStamp'] = pd.to_datetime(df2['TimeStamp']).astype('int64') // 10**9

In [84]:
df2.head()

Unnamed: 0,LineId,Date,Time,Level,Component,Content,EventId,EventTemplate,TimeStamp
0,1,2016-09-28,04:30:30,Info,CBS,Loaded Servicing Stack v6.1.7601.23505 with Co...,E23,Loaded Servicing Stack <*> with Core: <*>\cbsc...,1475037030
1,2,2016-09-28,04:30:31,Info,CSI,00000001@2016/9/27:20:30:31.455 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...,1475037031
2,3,2016-09-28,04:30:31,Info,CSI,00000002@2016/9/27:20:30:31.458 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...,1475037031
3,4,2016-09-28,04:30:31,Info,CSI,00000003@2016/9/27:20:30:31.458 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...,1475037031
4,5,2016-09-28,04:30:31,Info,CBS,Ending TrustedInstaller initialization.,E17,Ending TrustedInstaller initialization.,1475037031


In [85]:
df2 = df2.drop(['LineId','Date','Time'],axis = 1)
df2.head()

Unnamed: 0,Level,Component,Content,EventId,EventTemplate,TimeStamp
0,Info,CBS,Loaded Servicing Stack v6.1.7601.23505 with Co...,E23,Loaded Servicing Stack <*> with Core: <*>\cbsc...,1475037030
1,Info,CSI,00000001@2016/9/27:20:30:31.455 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...,1475037031
2,Info,CSI,00000002@2016/9/27:20:30:31.458 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...,1475037031
3,Info,CSI,00000003@2016/9/27:20:30:31.458 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...,1475037031
4,Info,CBS,Ending TrustedInstaller initialization.,E17,Ending TrustedInstaller initialization.,1475037031


In [86]:
label_encoders = {}
categorical_features = ['Level','Component','Content','EventId','EventTemplate']

for col in categorical_features:
  le = LabelEncoder()
  df2[col] = le.fit_transform(df2[col])
  label_encoders[col] = le

df2.head()

Unnamed: 0,Level,Component,Content,EventId,EventTemplate,TimeStamp
0,0,0,35,15,22,1475037030
1,0,1,0,4,12,1475037031
2,0,1,2,4,12,1475037031
3,0,1,4,4,12,1475037031
4,0,0,28,8,15,1475037031


In [87]:
features = ['Level','Component','Content','EventId','EventTemplate','TimeStamp']
X = df2[features]

In [88]:
pca = PCA(n_components = 3)
X_pca = pca.fit_transform(X)

In [89]:
X_pca = pd.DataFrame(pca.fit_transform(X), columns=[f'PCA_{i+1}' for i in range(pca.n_components_)])
X_pca.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3
0,-40092.195287,-315.958146,-0.946176
1,-40091.239646,-351.553033,-14.271696
2,-40091.237111,-349.554814,-14.354566
3,-40091.234576,-347.556595,-14.437436
4,-40091.204156,-323.370396,-10.434054


In [90]:
X_pca.to_csv('WindowLogReduced.csv', index = False)

In [91]:
pca.explained_variance_ratio_

array([9.99916665e-01, 8.33082308e-05, 2.00137307e-08])

In [92]:
X = X_pca[['PCA_1', 'PCA_2', 'PCA_3']]
iso_forest = IsolationForest(contamination=0.10, random_state=42)
X_pca['anomaly'] = iso_forest.fit_predict(X)

In [93]:
X_pca.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3,anomaly
0,-40092.195287,-315.958146,-0.946176,1
1,-40091.239646,-351.553033,-14.271696,-1
2,-40091.237111,-349.554814,-14.354566,-1
3,-40091.234576,-347.556595,-14.437436,-1
4,-40091.204156,-323.370396,-10.434054,1


In [94]:
# Select features and target
X = X_pca[['PCA_1', 'PCA_2', 'PCA_3']]
y = X_pca['anomaly']

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = knn.predict(X_test_scaled)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.98


In [95]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 27   6]
 [  2 365]]


In [96]:
precision = precision_score(y_test, y_pred)

print(precision)

0.9838274932614556


In [97]:
recall = recall_score(y_test, y_pred)

print(recall)

0.9945504087193461


# Merge Data and apply Isolation Forest

In [98]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


In [99]:
dataset1 = pd.read_csv('WebLogReduced.csv')
dataset2 = pd.read_csv('AccessLogReduced.csv')
dataset3 = pd.read_csv('WindowLogReduced.csv')

In [100]:
dataset1

Unnamed: 0,PCA_1,PCA_2,PCA_3
0,5940.515518,7.182028,17.984975
1,-2896.092780,-1201.234866,20.019394
2,-6494.006256,372.972567,119.653641
3,449.132783,1238.637119,-182.398828
4,3832.467098,-21.175581,-181.927397
...,...,...,...
14995,-3758.879156,-284.471553,21.333326
14996,-5287.812136,-987.675757,17.190592
14997,6782.050747,407.288217,117.067861
14998,56.316927,1377.917527,-182.339332


In [101]:
dataset2

Unnamed: 0,PCA_1,PCA_2,PCA_3
0,-12420.072303,-870.987114,-280.325790
1,-11634.015661,-874.202165,20.837294
2,-12000.023958,-872.643721,-175.736762
3,-12000.029827,-871.992279,57.128731
4,-12287.019499,-870.123014,-192.352824
...,...,...,...
2100,324024.613674,368.677899,6.965426
2101,-12406.932340,953.228779,-20.274081
2102,-12406.930609,954.228699,-20.270098
2103,-12011.862573,952.322559,-225.215222


In [102]:
dataset3

Unnamed: 0,PCA_1,PCA_2,PCA_3
0,-40092.195287,-315.958146,-0.946176
1,-40091.239646,-351.553033,-14.271696
2,-40091.237111,-349.554814,-14.354566
3,-40091.234576,-347.556595,-14.437436
4,-40091.204156,-323.370396,-10.434054
...,...,...,...
1995,37557.975559,-230.101566,3.927546
1996,37558.898339,497.912271,-11.510612
1997,37557.950208,-250.083756,4.756245
1998,37558.899606,498.911380,-11.552047


In [103]:
merge_dataset = pd.concat([dataset1,dataset2,dataset3], ignore_index = True)
merge_dataset

Unnamed: 0,PCA_1,PCA_2,PCA_3
0,5940.515518,7.182028,17.984975
1,-2896.092780,-1201.234866,20.019394
2,-6494.006256,372.972567,119.653641
3,449.132783,1238.637119,-182.398828
4,3832.467098,-21.175581,-181.927397
...,...,...,...
19100,37557.975559,-230.101566,3.927546
19101,37558.898339,497.912271,-11.510612
19102,37557.950208,-250.083756,4.756245
19103,37558.899606,498.911380,-11.552047


In [104]:
X = X_pca[['PCA_1', 'PCA_2', 'PCA_3']]
iso_forest = IsolationForest(contamination=0.20, random_state=42)
X_pca['anomaly'] = iso_forest.fit_predict(X)

In [105]:
X_pca.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3,anomaly
0,-40092.195287,-315.958146,-0.946176,1
1,-40091.239646,-351.553033,-14.271696,-1
2,-40091.237111,-349.554814,-14.354566,-1
3,-40091.234576,-347.556595,-14.437436,-1
4,-40091.204156,-323.370396,-10.434054,-1


In [106]:
# Select features and target
X = X_pca[['PCA_1', 'PCA_2', 'PCA_3']]
y = X_pca['anomaly']

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = knn.predict(X_test_scaled)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.96


In [107]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 66   7]
 [  7 320]]


In [108]:
precision = precision_score(y_test, y_pred)

print(precision)

0.9785932721712538


In [109]:
recall = recall_score(y_test, y_pred)

print(recall)

0.9785932721712538
