# EX4-BATCH: Introduction to Spark programming with HDFS Logs

In [1]:
from pyspark.context import SparkContext

try:
    sc.stop()
except NameError:
    print("SparkContext not defined")

sc = SparkContext(appName="HDFS_Log_Analysis", master="local[*]")

SparkContext not defined


### Loading HDFS Log Data

In [2]:
hdfs_logs = [
    "081109 203432 154 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.10.6:40524 dest: /10.250.10.6:50010",
    "081109 203432 156 INFO dfs.DataNode$PacketResponder: Received block blk_-1608999687919862906 of size 67108864 from /10.250.10.6",
    "081109 203432 157 WARN dfs.DataNode$DataXceiver: Slow receiver detected",
    "081109 203433 158 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862907 src: /10.250.10.6:40524 dest: /10.250.10.6:50010",
    "081109 203433 159 ERROR dfs.DataNode$DataXceiver: Error receiving block blk_-1608999687919862907"
]

logs_rdd = sc.parallelize(hdfs_logs)
print("Total log lines:", logs_rdd.count())
print("First 3 lines:")
for line in logs_rdd.take(3):
    print(line)

Total log lines: 5
First 3 lines:
081109 203432 154 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.10.6:40524 dest: /10.250.10.6:50010
081109 203432 156 INFO dfs.DataNode$PacketResponder: Received block blk_-1608999687919862906 of size 67108864 from /10.250.10.6
081109 203432 157 WARN dfs.DataNode$DataXceiver: Slow receiver detected


### Advanced HDFS Log Analysis Examples

**1. Extract Block IDs from Logs**

In [3]:
import re

def extract_block_id(line):
    match = re.search(r'blk_[-0-9]+', line)
    return match.group(0) if match else None

block_ids = logs_rdd.map(extract_block_id).filter(lambda x: x is not None).distinct()
print("Block IDs in logs:")
for block_id in block_ids.collect():
    print(block_id)

Block IDs in logs:
blk_-1608999687919862906
blk_-1608999687919862907


**2. Count Log Messages by Type**

In [4]:
log_levels = logs_rdd.map(lambda line: line.split()[3])
level_counts = log_levels.countByValue()
print("Log level counts:")
for level, count in level_counts.items():
    print(f"{level}: {count}")

Log level counts:
INFO: 3
WARN: 1
ERROR: 1


**3. Find Error Messages**

In [5]:
error_logs = logs_rdd.filter(lambda line: "ERROR" in line)
print("Error messages:")
for log in error_logs.collect():
    print(log)

Error messages:
081109 203433 159 ERROR dfs.DataNode$DataXceiver: Error receiving block blk_-1608999687919862907


**4. Extract Timestamps and Create Time Series**

In [6]:
timestamps = logs_rdd.map(lambda line: ' '.join(line.split()[0:2]))
print("Log timestamps:")
for ts in timestamps.collect():
    print(ts)

Log timestamps:
081109 203432
081109 203432
081109 203432
081109 203433
081109 203433


**5. Analyze Block Transfer Sizes**

In [7]:
def extract_size(line):
    match = re.search(r'size (\d+)', line)
    return int(match.group(1)) if match else None

block_sizes = logs_rdd.map(extract_size).filter(lambda x: x is not None)
print("Block sizes mentioned:")
for size in block_sizes.collect():
    print(size)

Block sizes mentioned:
67108864


**6. Count Events by Source IP**

In [8]:
def extract_src_ip(line):
    match = re.search(r'src: (\S+)', line)
    return match.group(1) if match else None

src_ips = logs_rdd.map(extract_src_ip).filter(lambda x: x is not None)
ip_counts = src_ips.countByValue()
print("Events by source IP:")
for ip, count in ip_counts.items():
    print(f"{ip}: {count}")

Events by source IP:
/10.250.10.6: 5


**7. Find Slow Operations**

In [9]:
slow_ops = logs_rdd.filter(lambda line: "Slow" in line or "WARN" in line)
print("Slow operations detected:")
for op in slow_ops.collect():
    print(op)

Slow operations detected:
081109 203432 157 WARN dfs.DataNode$DataXceiver: Slow receiver detected


**8. Calculate Average Time Between Events**

In [10]:
from datetime import datetime

def parse_time(line):
    time_str = ' '.join(line.split()[0:2])
    return datetime.strptime(time_str, "%y%m%d %H%M%S")

times = logs_rdd.map(parse_time).collect()
diffs = [(times[i+1] - times[i]).seconds for i in range(len(times)-1)]
avg_diff = sum(diffs)/len(diffs) if diffs else 0
print(f"Average time between events (seconds): {avg_diff}")

Average time between events (seconds): 1.25


In [11]:
sc.stop()