In [1]:
%%writefile weblogs.txt
# Date, Time, IP, Method, URL, Status, ResponseSize
2025-10-10,12:01:32,192.168.1.2,GET,/index.html,200,1024
2025-10-10,12:01:33,192.168.1.3,GET,/products.html,200,850
2025-10-10,12:01:35,192.168.1.4,GET,/contact.html,404,512
2025-10-10,12:01:38,192.168.1.5,POST,/checkout,500,128
2025-10-10,12:01:41,192.168.1.6,GET,/index.html,200,1024
2025-10-10,12:01:45,192.168.1.7,GET,/images/logo.png,200,256
2025-10-10,12:01:48,192.168.1.8,GET,/about.html,404,512
2025-10-10,12:01:53,192.168.1.9,POST,/login,403,64
2025-10-10,12:02:01,192.168.1.10,GET,/index.html,200,1024
2025-10-10,12:02:07,192.168.1.11,POST,/checkout,500,128
2025-10-10,12:02:12,192.168.1.12,GET,/contact.html,404,512
2025-10-10,12:02:15,192.168.1.13,GET,/index.html,200,1024
2025-10-10,12:02:21,192.168.1.14,GET,/products.html,200,850
2025-10-10,12:02:23,192.168.1.15,GET,/about.html,404,512
2025-10-10,12:02:29,192.168.1.16,POST,/checkout,500,128
2025-10-10,12:02:31,192.168.1.17,GET,/images/logo.png,200,256
2025-10-10,12:02:34,192.168.1.18,GET,/contact.html,404,512
2025-10-10,12:02:38,192.168.1.19,POST,/login,403,64
2025-10-10,12:02:41,192.168.1.20,GET,/index.html,200,1024
2025-10-10,12:02:47,192.168.1.21,GET,/products.html,200,850


Writing weblogs.txt


**Implement the Mapper**

In [2]:
# Mapper: emit (StatusCode, 1) for every valid log record
def mapper(line):
    """Map one web-log line to a list holding a single ``(status, 1)`` pair.

    Args:
        line: One raw log line in the comma-separated layout
            ``Date, Time, IP, Method, URL, Status, ResponseSize``.

    Returns:
        ``[(status, 1)]`` where ``status`` is the sixth field as a string,
        or ``[]`` when the line is a ``#`` comment or does not contain
        exactly seven fields.
    """
    # Strip every field, not just the line ends, so that "200" and " 200"
    # (a space after the comma, as in the header's own layout) share a key.
    fields = [field.strip() for field in line.split(',')]
    if len(fields) != 7 or fields[0].startswith('#'):
        return []
    # Date, Time, IP, Method, URL, Status, ResponseSize
    status = fields[5]          # HTTP status code (200, 404, 500, ...)
    return [(status, 1)]


In [4]:
# Sanity check: a well-formed record should yield one (status, 1) pair.
print(
    mapper("2025-10-10,12:01:32,192.168.1.2,GET,/index.html,200,1024")
)

[('200', 1)]


**Shuffle Phase**

In [5]:
from collections import defaultdict

def shuffle(mapped_data):
    """Group mapper output by key.

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs.

    Returns:
        A ``defaultdict(list)`` mapping each key to the list of its values,
        in the order the pairs were seen.
    """
    buckets = defaultdict(list)
    for pair in mapped_data:
        k, v = pair
        # The key is used exactly as emitted; no normalization is applied.
        buckets[k].append(v)
    return buckets


**Reducer Phase**

In [6]:
from collections import defaultdict

def reducer(mapped_data):
    """Sum the values recorded for each key.

    Args:
        mapped_data: Either an iterable of ``(key, value)`` pairs (raw
            mapper output), or a dict mapping each key to an iterable of
            its values (the structure ``shuffle`` returns).

    Returns:
        A ``defaultdict(int)`` mapping each key to the sum of its values.
    """
    totals = defaultdict(int)
    if isinstance(mapped_data, dict):
        # Grouped input: iterating the dict directly would unpack the key
        # strings themselves, so walk its (key, values) items instead.
        for key, values in mapped_data.items():
            totals[key] += sum(values)
    else:
        for key, value in mapped_data:
            totals[key] += value    # accumulate the total for this key
    return totals


**Combine the Phases**

In [8]:
# Map phase: feed every log line to the mapper and collect the emitted pairs.
with open("weblogs.txt", "r") as log_file:
    mapped = [pair for raw_line in log_file for pair in mapper(raw_line)]

# Reduce phase: the reducer is applied directly to the mapped pairs.
reduced = reducer(mapped)

# Report one line per status code, sorted by the code's string value.
for code, count in sorted(reduced.items()):
    print(f"HTTP {code}: {count} requests")


HTTP 200: 10 requests
HTTP 403: 2 requests
HTTP 404: 5 requests
HTTP 500: 3 requests


### **Bonus 1: Requests per URL**

In [9]:
def mapper_url(line):
    """Map one web-log line to a list holding a single ``(url, 1)`` pair.

    Args:
        line: One raw log line in the comma-separated layout
            ``Date, Time, IP, Method, URL, Status, ResponseSize``.

    Returns:
        ``[(url, 1)]`` where ``url`` is the fifth field, or ``[]`` when the
        line is a ``#`` comment or does not contain exactly seven fields.
    """
    # Strip every field so " /index.html" (space after the comma) is counted
    # under the same key as "/index.html".
    fields = [field.strip() for field in line.split(',')]
    if len(fields) != 7 or fields[0].startswith('#'):
        return []

    # Date, Time, IP, Method, URL, Status, ResponseSize
    url = fields[4]
    return [(url, 1)]


In [10]:
# Sanity check: a well-formed record should yield one (url, 1) pair.
print(
    mapper_url("2025-10-10,12:01:32,192.168.1.2,GET,/index.html,200,1024")
)


[('/index.html', 1)]


**Map + Reduce:**

In [11]:
# Map phase: emit one (url, 1) pair per valid log line.
with open("weblogs.txt", "r") as log_file:
    mapped = [pair for raw_line in log_file for pair in mapper_url(raw_line)]

# Reduce phase: total the hits per URL.
reduced = reducer(mapped)

# Report one aligned row per URL, sorted by the URL string.
print("Requests per URL:")
for url, count in sorted(reduced.items()):
    print(f"{url:20s} -> {count:2d} requests")


Requests per URL:
/about.html          ->  2 requests
/checkout            ->  3 requests
/contact.html        ->  3 requests
/images/logo.png     ->  2 requests
/index.html          ->  5 requests
/login               ->  2 requests
/products.html       ->  3 requests


**Bonus 2: مجموع حجم الاستجابة (ResponseSize) لكل Status**

In [12]:
def mapper_status_bytes(line):
    """Map one web-log line to a list holding a single ``(status, size)`` pair.

    Args:
        line: One raw log line in the comma-separated layout
            ``Date, Time, IP, Method, URL, Status, ResponseSize``.

    Returns:
        ``[(status, size)]`` where ``status`` is the sixth field as a string
        and ``size`` is the seventh field parsed as an int (``0`` when it is
        not a valid integer), or ``[]`` when the line is a ``#`` comment or
        does not contain exactly seven fields.
    """
    # Strip every field so " 200" (space after the comma) is summed under
    # the same key as "200".
    fields = [field.strip() for field in line.split(',')]
    if len(fields) != 7 or fields[0].startswith('#'):
        return []

    status = fields[5]
    try:
        size = int(fields[6])
    except ValueError:
        # Best effort: an unparsable size contributes nothing to the total
        # instead of aborting the whole run.
        size = 0

    return [(status, size)]


In [13]:
# Sanity check: a well-formed record should yield one (status, size) pair.
print(
    mapper_status_bytes("2025-10-10,12:01:32,192.168.1.2,GET,/index.html,200,1024")
)


[('200', 1024)]


In [14]:
# Map phase: emit one (status, response_size) pair per valid log line.
with open("weblogs.txt", "r") as log_file:
    mapped = [
        pair
        for raw_line in log_file
        for pair in mapper_status_bytes(raw_line)
    ]

# Reduce phase: add up the response sizes per status code.
reduced = reducer(mapped)

# Report one line per status code, sorted by the code's string value.
print("Total response size per Status:")
for status, total_size in sorted(reduced.items()):
    print(f"HTTP {status}: {total_size} bytes")


Total response size per Status:
HTTP 200: 8182 bytes
HTTP 403: 128 bytes
HTTP 404: 2560 bytes
HTTP 500: 384 bytes


**Bonus 3: تحليل الأخطاء فقط (4xx و 5xx)**

In [15]:
def mapper_errors(line):
    """Map one web-log line to ``(status, 1)`` only when it is an error.

    Args:
        line: One raw log line in the comma-separated layout
            ``Date, Time, IP, Method, URL, Status, ResponseSize``.

    Returns:
        ``[(status, 1)]`` when the sixth field starts with ``4`` or ``5``;
        ``[]`` for every other status, for ``#`` comment lines, and for
        lines that do not contain exactly seven fields.
    """
    # Strip every field: without this, a record written with a space after
    # the comma has status " 404", which fails the prefix test below and is
    # silently dropped from the error counts.
    fields = [field.strip() for field in line.split(',')]
    if len(fields) != 7 or fields[0].startswith('#'):
        return []

    status = fields[5]

    # Keep only codes that start with 4 or 5 (client and server errors).
    if not status.startswith(('4', '5')):
        return []

    return [(status, 1)]


In [16]:
# Sanity check 1: a 404 record is an error, so it yields one (status, 1) pair.
print(
    mapper_errors("2025-10-10,12:01:35,192.168.1.4,GET,/contact.html,404,512")
)

# Sanity check 2: a 200 record is not an error, so nothing is emitted.
print(
    mapper_errors("2025-10-10,12:01:32,192.168.1.2,GET,/index.html,200,1024")
)


[('404', 1)]
[]


In [17]:
# Map phase: emit (status, 1) only for lines whose status is 4xx or 5xx.
with open("weblogs.txt", "r") as log_file:
    mapped = [pair for raw_line in log_file for pair in mapper_errors(raw_line)]

# Reduce phase: count the error requests per status code.
reduced = reducer(mapped)

# Report one line per error status, sorted by the code's string value.
print("Error requests only (4xx & 5xx):")
for status, count in sorted(reduced.items()):
    print(f"HTTP {status}: {count} error requests")


Error requests only (4xx & 5xx):
HTTP 403: 2 error requests
HTTP 404: 5 error requests
HTTP 500: 3 error requests
