In [33]:
import os
import sys
import pandas as pd
import numpy as np
import re


sys.path.append("..")

from logparser import LogParser

In [34]:
def resolve_project_root(cwd):
    """Return the project root for a given working directory.

    When the notebook is run from the ``notebooks`` folder, the root is that
    folder's parent; otherwise ``cwd`` is returned unchanged. Only the final
    path component is inspected, so a directory elsewhere in the path whose
    name merely contains "notebooks" is left intact.
    """
    if os.path.basename(cwd) == "notebooks":
        return os.path.dirname(cwd)
    return cwd


# Sample access log, located relative to the project root.
logfile_path = os.path.join(resolve_project_root(os.getcwd()), "logfiles", "sample_access.log")

# Parser instance from the project's logparser module; the cells below call its methods.
parser = LogParser()

In [35]:
# Parse the log file, then convert the parsed result into `logs_df`, which the
# analysis cells below index by column ("status", "request", "ip", "user_agent").
parsed = parser.parse_log(logfile_path)
logs_df = parser.to_df(parsed)

In [36]:
# Regex for one access-log line: client address, two skipped fields, bracketed
# timestamp, quoted request, status, byte count, quoted referer, quoted user
# agent. Each named group captures one field.
# The referer group is deliberately NOT wrapped in a second quantifier:
# `([^"]+)+` matches the same text but backtracks exponentially whenever the
# user-agent part fails to match (e.g. it starts with "http").
log_format = (
    r'(?P<remote_addr>\d+\.\d+\.\d+\.\d+)\s+\S+\s+\S+\s+'
    r'\[(?P<time>[^\]]+)\]\s+'
    r'"(?P<request>[^"]+)"\s+'
    r'(?P<status>\d+)\s+(?P<bytes_sent>\d+)\s+'
    r'"(?P<referer>[^"]+)"\s+'
    r'"(?P<user_agent>(?!http)[^"]*)"'
)

In [37]:
# Read the raw log entries from the file; later cells index into them to inspect individual lines.
log_entries = parser.read_log_file(logfile_path)

In [38]:
# Display the first raw entry to check the line layout.
log_entries[0]

'172.20.0.100 - - [29/Sep/2022:07:50:34 +0530] "GET /robots.txt HTTP/1.1" 404 84 "-" "Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)"\n'

In [39]:
# Try the regex against one raw entry and print whatever the parser reports for the match.
log_string = log_entries[5]  # sample entry at index 5 — assumes the log has at least six entries
print(log_string)
match = parser.match(log_format, log_string)
parser.print_matches(match)

203.0.113.55 - - [29/Sep/2022:07:54:43 +0530] "GET /?page=12 HTTP/1.1" 200 4266 "-" "Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)"

remote_addr:203.0.113.55
time:29/Sep/2022:07:54:43 +0530
request:GET /?page=12 HTTP/1.1
status:200
bytes_sent:4266
referer:-
user_agent:Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)


In [40]:
# Number of log entries per HTTP status code, most frequent first.
status_count = (
    logs_df["status"]
    .value_counts()
    .rename_axis("status")
    .reset_index(name="count")
)
status_count

Unnamed: 0,status,count
0,404,11
1,200,5
2,301,4
3,400,2


In [41]:
# Number of log entries per request line (method, path and protocol), most frequent first.
requests_count = (
    logs_df["request"]
    .value_counts()
    .rename_axis("request")
    .reset_index(name="count")
)
requests_count

Unnamed: 0,request,count
0,GET /robots.txt HTTP/1.1,5
1,GET / HTTP/1.1,3
2,GET /wp-commentin.php HTTP/1.1,2
3,GET /?page=6 HTTP/1.1,1
4,GET /public/knowledge/products/electric-bikes/...,1
5,GET /public/knowledge/industrial-data/hydroele...,1
6,GET /?page=12 HTTP/1.1,1
7,\x16\x03\x01\x00{\x01\x00\x00w\x03\x03>\xF2*\x...,1
8,\x16\x03\x01\x00{\x01\x00\x00w\x03\x03\xF0\xDF...,1
9,GET /.well-known/traffic-advice HTTP/1.1,1


In [42]:
# Distinct client IPs, in order of first appearance in the log.
remote_ips = logs_df["ip"].unique().tolist()
# Build from a dict so the "ip" column exists even when the log has no rows;
# assigning `.columns` onto a frame built from an empty list raises ValueError.
remote_ips_df = pd.DataFrame({"ip": remote_ips})
remote_ips_df

Unnamed: 0,ip
0,172.20.0.100
1,177.20.0.100
2,203.0.113.55
3,192.168.1.254
4,192.0.2.123
5,192.168.100.1
6,89.249.93.82
7,90.239.58.10
8,21.207.82.15
9,55.67.188.47


In [43]:
# Entries that returned 404; the status column is compared as a string.
logs_df[logs_df["status"].eq("404")]

Unnamed: 0,ip,time,request,status,bytes_sent,referer,user_agent
0,172.20.0.100,29/Sep/2022:07:50:34 +0530,GET /robots.txt HTTP/1.1,404,84,-,Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http:...
4,203.0.113.55,29/Sep/2022:07:54:37 +0530,GET /robots.txt HTTP/1.1,404,84,-,Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http:...
7,203.0.113.55,29/Sep/2022:07:54:45 +0530,GET /robots.txt HTTP/1.1,404,84,-,Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http:...
9,192.168.1.254,29/Sep/2022:07:55:45 +0530,GET /wp-commentin.php HTTP/1.1,404,4390,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
10,192.0.2.123,29/Sep/2022:07:56:13 +0530,GET /robots.txt HTTP/1.1,404,1245,-,Mozilla/5.0 (compatible;PetalBot;+https://webm...
13,89.249.93.82,29/Sep/2022:09:15:22 +0530,GET /.well-known/traffic-advice HTTP/1.1,404,121,-,Chrome Privacy Preserving Prefetch Proxy
16,21.207.82.15,29/Sep/2022:09:42:01 +0530,GET /news/wp-includes/wlwmanifest.xml HTTP/1.1,404,580,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
17,21.207.82.15,29/Sep/2022:09:42:01 +0530,GET /2020/wp-includes/wlwmanifest.xml HTTP/1.1,404,580,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
18,21.207.82.15,29/Sep/2022:09:42:02 +0530,GET /2019/wp-includes/wlwmanifest.xml HTTP/1.1,404,580,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
19,21.207.82.15,29/Sep/2022:09:42:02 +0530,GET /shop/wp-includes/wlwmanifest.xml HTTP/1.1,404,580,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...


In [44]:
# Distinct user-agent strings, in order of first appearance in the log.
user_agent = logs_df["user_agent"].unique().tolist()
# Build from a dict so the "user_agent" column exists even when the log has no
# rows; assigning `.columns` onto a frame built from an empty list raises ValueError.
user_agent_df = pd.DataFrame({"user_agent": user_agent})
user_agent_df


Unnamed: 0,user_agent
0,Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http:...
1,Mozilla/5.0 (Linux; Android 9; Redmi 6 Pro) Ap...
2,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
3,Mozilla/5.0 (compatible;PetalBot;+https://webm...
4,-
5,Chrome Privacy Preserving Prefetch Proxy
6,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
7,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
8,IonCrawl (https://www.ionos.de/terms-gtc/faq-c...


In [45]:
# NOTE(review): presumably lists the entries the parser could not handle — confirm against LogParser.
parser.failures()

[]