# NTI-Graduation-Project
### This File For The python script to handel the logs presentation with pyspark.

#### Import PySpark

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.functions import explode, split

#### Create SparkSession

In [6]:
spark = SparkSession.builder \
    .appName("GraduationLogAnalysis") \
    .getOrCreate()

#### Read log data from HDFS

In [7]:
hdfs_path = "hdfs://localhost:9000/user/bigdata/graduation/"
logs_df = spark.read.text(hdfs_path)

#### Show sample data

In [8]:
logs_df.show(10, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                |
+-------------------------------------------------------------------------------------------------------------------------------------+
|2025-07-17 07:48:23,196 2762 log_generator.py 145 log-generator DEBUG Tarkin turns to the computer readout screen. Flames move       |
|2025-07-17 07:48:23,564 2762 log_generator.py 145 log-generator DEBUG She lied! She lied to us!                                      |
|2025-07-17 07:48:23,780 2762 log_generator.py 145 log-generator DEBUG feet from them and assumes a defensive position. The troops    |
|2025-07-17 07:48:24,135 2762 log_generator.py 145 log-generator DEBUG Look, I can take you as far as                                 |
|2025-07-17 07:48:24,448 2762 log_generator.py 1

#### Regular Expression Example: After DEBUG
#### Example log line: '2025-07-17 07:34:02,537 ... DEBUG message'

In [10]:
logs_cleaned = logs_df.withColumn("timestamp", regexp_extract(col("value"), r"^([\d\-]+\s[\d:,]+)", 1)) \
    .withColumn("log_level", regexp_extract(col("value"), r"\s(DEBUG|INFO|ERROR|WARN)\s", 1)) \
    .withColumn("message", regexp_extract(col("value"), r"DEBUG\s(.*)", 1))

logs_cleaned.select("timestamp", "log_level", "message").show(10, truncate=False)

+-----------------------+---------+---------------------------------------------------------------+
|timestamp              |log_level|message                                                        |
+-----------------------+---------+---------------------------------------------------------------+
|2025-07-17 07:48:23,196|DEBUG    |Tarkin turns to the computer readout screen. Flames move       |
|2025-07-17 07:48:23,564|DEBUG    |She lied! She lied to us!                                      |
|2025-07-17 07:48:23,780|DEBUG    |feet from them and assumes a defensive position. The troops    |
|2025-07-17 07:48:24,135|DEBUG    |Look, I can take you as far as                                 |
|2025-07-17 07:48:24,448|DEBUG    |his field of view. Before Luke or Threepio can react, a large, |
|2025-07-17 07:48:25,332|DEBUG    |The ship begins to rock violently as lasers hit it.            |
|2025-07-17 07:48:26,301|DEBUG    |Our passengers must be hotter than I                           |


#### The Most Word Repeated in the logs

In [13]:
words = logs_cleaned.select(
    explode(split(col("message"), "\s+")).alias("word")
)

words.groupBy("word").count().orderBy("count", ascending=False).show(10)

+----+-----+
|word|count|
+----+-----+
|    |  685|
| the|  499|
|  to|  208|
|   a|  190|
| and|  169|
|  of|  144|
|  in|  108|
|   -|  102|
|   I|  101|
| his|   91|
+----+-----+
only showing top 10 rows



#### Saving The result in csv dir on your path

In [None]:
logs_cleaned.select("timestamp", "log_level", "message") \
    .coalesce(1) \
    .write.mode("overwrite") \
    .option("header", True) \
    .csv("file:///home/bigdata/Desktop/output/cleaned_logs/") ##! replaced with your path
