In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, from_json, date_format, udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import json

# 1. Spark session for the streaming monitor (shuffle partitions set to 4)
spark = (
    SparkSession.builder
    .appName("Flagged Message Monitor")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

# 2. Marked words are read once from a local JSON file
with open("/home/hadoop/Downloads/bootcamp-project/data/marked_word.json") as words_file:
    marked_words = json.load(words_file)

# 3. Kafka message schema: a JSON object with three string fields
message_schema = StructType(
    [StructField(field_name, StringType()) for field_name in ("sender", "receiver", "message")]
)

# 4. Subscribe to the Kafka topic, starting from the latest offsets
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "employee-messages")
    .option("startingOffsets", "latest")
    .load()
)

# 5. Decode the Kafka value as a string, parse it as JSON and flatten the fields.
#    "timestamp" is the processing time taken from current_timestamp().
parsed_df = (
    kafka_df
    .select(from_json(col("value").cast("string"), message_schema).alias("data"))
    .select(
        col("data.sender").alias("sender"),
        col("data.receiver").alias("receiver"),
        col("data.message").alias("message_content"),
        current_timestamp().alias("timestamp"),
    )
)

# 6. Calculate strike_count (number of marked words in message)
def count_marked_words(content, words):
    """Count case-insensitive occurrences of marked words in a message.

    Matching is by substring: every non-overlapping occurrence of each word
    inside ``content`` adds one to the total.

    Args:
        content: Message text, or ``None`` when the message could not be parsed.
        words: Iterable of marked words. Empty and non-string entries are skipped.

    Returns:
        Total number of occurrences as an ``int``; ``0`` when ``content`` is
        ``None``/empty or when there are no usable words.
    """
    # A null message would otherwise raise AttributeError on .lower().
    if not content or not words:
        return 0

    # Lower-case the message once instead of once per marked word.
    lowered = content.lower()

    # str.count("") returns len(s) + 1, so empty words must not be counted.
    return sum(
        lowered.count(word.lower())
        for word in words
        if isinstance(word, str) and word
    )

# Wrap the Python counter as a Spark UDF bound to the loaded marked words
count_marked_words_udf = udf(
    lambda text: count_marked_words(text, marked_words),
    IntegerType(),
)

# Score every message, keep only those with at least one hit, and add the
# calendar date (formatted as a yyyy-MM-dd string) derived from "timestamp"
flagged_df = (
    parsed_df
    .withColumn("strike_count", count_marked_words_udf(col("message_content")))
    .filter(col("strike_count") > 0)
    .withColumn("date", date_format(col("timestamp"), "yyyy-MM-dd"))
)

# 7. Write flagged messages into the flagged_message_history table
def process_flagged_batch(batch_df, batch_id):
    """Append one micro-batch of flagged messages to ``flagged_message_history``.

    The columns written match the table definition in this notebook
    (sender, receiver, message_content, strike_flag, timestamp): the per-message
    ``strike_count`` is stored as ``strike_flag``, which is the column the batch
    cells read and sum.

    Args:
        batch_df: Micro-batch DataFrame handed over by ``foreachBatch``.
        batch_id: Micro-batch id supplied by ``foreachBatch``; not used here.

    Returns:
        None.
    """
    # take(1) checks for emptiness without converting the batch to an RDD.
    if not batch_df.take(1):
        return

    # NOTE(review): connection credentials are hard-coded; consider loading
    # them from the environment or a secrets store.
    (
        batch_df
        .withColumnRenamed("strike_count", "strike_flag")
        .select("sender", "receiver", "message_content", "strike_flag", "timestamp")
        .write
        .format("jdbc")
        .option("url", "jdbc:postgresql://localhost:5432/employee_management_system")
        .option("dbtable", "flagged_message_history")
        .option("user", "postgres")
        .option("password", "11223344")
        .mode("append")
        .save()
    )

# 8. Start the stream: every 10 seconds a micro-batch is handed to
#    process_flagged_batch, then block until the query terminates.
# NOTE(review): no checkpointLocation is configured here - confirm that the
# restart/recovery behaviour is acceptable.
query = (
    flagged_df.writeStream
    .trigger(processingTime="10 seconds")
    .outputMode("append")
    .foreachBatch(process_flagged_batch)
    .start()
)

query.awaitTermination()


In [None]:
#final morning
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime

# Spark session for the morning 30-day strike aggregation
spark = (
    SparkSession.builder
    .appName("Strike Count Last 30 Days")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

# Run date as a yyyy-mm-dd string; stamped onto every output row below
today_date_str = datetime.today().strftime('%Y-%m-%d')

# 1. Active employees only; just the id is needed for this job
employee_df = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://localhost:5432/employee_management_system",
        dbtable="employee_time",
        user="postgres",
        password="11223344",
    )
    .load()
    .where(col("status") == "ACTIVE")
    .select(col("emp_id").alias("employee_id"))
)

# 2. Flagged messages whose timestamp falls within the last 30 days
# NOTE(review): the window has no upper bound, so rows stamped today before
# this job runs are included; the daily cell also adds today's rows on top of
# this total - confirm the overlap is intended.
flagged_df = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://localhost:5432/employee_management_system",
        dbtable="flagged_message_history",
        user="postgres",
        password="11223344",
    )
    .load()
    .where(col("timestamp") >= date_sub(current_date(), 30))
    .select(col("sender").alias("employee_id"), col("strike_flag"))
)

# 3. Total strikes per employee (sum here is the Spark aggregate brought in by
#    the star import, not the Python builtin)
strike_count_df = (
    flagged_df
    .groupBy("employee_id")
    .agg(sum("strike_flag").alias("strike_count"))
)

# 4. Keep every active employee; those without flagged messages get 0 strikes
final_df = (
    employee_df
    .join(strike_count_df, "employee_id", "left")
    .na.fill(0, subset=["strike_count"])
    .withColumn("timestamp", lit(today_date_str))
)

# 5. Replace the contents of strike_count_last_30_days with this run's result
(
    final_df
    .select("employee_id", "strike_count", "timestamp")
    .write
    .format("jdbc")
    .options(
        url="jdbc:postgresql://localhost:5432/employee_management_system",
        dbtable="strike_count_last_30_days",
        user="postgres",
        password="11223344",
    )
    .mode("overwrite")
    .save()
)

print("✅ Strike count for last 30 days updated.")


In [None]:
#final daily

import psycopg2
from psycopg2 import sql
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.sql import DataFrame



# Spark session for the daily salary/strike job.
# Kryo stays enabled as the serializer, but two settings were dropped:
#   * spark.kryo.classesToRegister pointed at
#     "org.apache.spark.examples.JavaPairRDD", which is not a class shipped with
#     Spark, so Kryo could not be initialised.
#   * spark.kryo.registrationRequired=true makes Kryo raise on every class that
#     has not been registered explicitly.
# Note: if a session already exists in this kernel, getOrCreate() returns it.
spark = (
    SparkSession.builder
    .appName("Kryo Serialization Example")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Run date as a yyyy-mm-dd string; used to pick today's rows and to stamp output
today_date_str = datetime.today().strftime('%Y-%m-%d')

# 1. Active employees with the salary and designation needed for the report
employee_df = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://localhost:5432/employee_management_system",
        dbtable="employee_time",
        user="postgres",
        password="11223344",
    )
    .load()
    .where(col("status") == "ACTIVE")
    .select(
        col("emp_id").alias("employee_id"),
        col("salary").alias("original_salary"),
        "designation",
    )
)

# 2. 30-day strike totals; a null count is treated as 0
strike_30_df = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://localhost:5432/employee_management_system",
        dbtable="strike_count_last_30_days",
        user="postgres",
        password="11223344",
    )
    .load()
    .select("employee_id", "strike_count")
    .fillna(0, subset=["strike_count"])
)

# 3. Flagged messages whose timestamp falls on today's date
todays_flagged_df = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://localhost:5432/employee_management_system",
        dbtable="flagged_message_history",
        user="postgres",
        password="11223344",
    )
    .load()
    .where(to_date(col("timestamp")) == today_date_str)
    .select(col("sender").alias("employee_id"), col("strike_flag"))
)

# Sum today's strikes per employee (sum is the Spark aggregate from the star import)
todays_strike_count_df = (
    todays_flagged_df
    .groupBy("employee_id")
    .agg(sum("strike_flag").alias("todays_strike_count"))
)

# 4. Attach both strike totals to every active employee. Left joins keep
#    employees with no strikes; their null counts become 0 before being added.
combined_df = (
    employee_df
    .join(strike_30_df, "employee_id", "left")
    .join(todays_strike_count_df, "employee_id", "left")
    .withColumn("strike_count", coalesce(col("strike_count"), lit(0)))
    .withColumn("todays_strike_count", coalesce(col("todays_strike_count"), lit(0)))
    .withColumn(
        "updated_strike_count",
        col("strike_count") + col("todays_strike_count"),
    )
)

# 5. One column per strike level 1..10: the salary reduced by 10% of the
#    original per strike, filled only once the employee has reached that level
#    (null otherwise). round/when are the Spark functions from the star import.
for i in range(1, 11):
    deduction_factor = 1 - 0.1 * i
    combined_df = combined_df.withColumn(
        f"salary_after_{i}_strike",
        when(
            col("updated_strike_count") >= i,
            round(col("original_salary") * deduction_factor, 2),
        ).otherwise(lit(None)),
    )

# 6. Final salary: the value for the highest strike level reached, falling back
#    to the original salary when no strike column is populated
final_salary_expr = coalesce(
    *(col(f"salary_after_{i}_strike") for i in reversed(range(1, 11))),
    col("original_salary"),
)
combined_df = combined_df.withColumn("final_salary", final_salary_expr)

# 7-9. Stamp the run date and derive the status in a single pass: employees
#      with more than 9 strikes are INACTIVE, everyone else is ACTIVE
combined_df = (
    combined_df
    .withColumn("timestamp", lit(today_date_str))
    .withColumn(
        "status",
        when(col("updated_strike_count") > 9, lit("INACTIVE")).otherwise(lit("ACTIVE")),
    )
)
combined_df.show()
# 10. Persist the per-employee strike/salary breakdown to strike_salary_history.
#     The derived "status" column is not part of this output.
# NOTE(review): mode("overwrite") replaces the existing table contents on each
# run - confirm that is intended for a table named "history".
(
    combined_df
    .select(
        "employee_id",
        "updated_strike_count",
        *(f"salary_after_{i}_strike" for i in range(1, 11)),
        "final_salary",
        "original_salary",
        "designation",
        "timestamp",
    )
    .write
    .format("jdbc")
    .options(
        url="jdbc:postgresql://localhost:5432/employee_management_system",
        dbtable="strike_salary_history",
        user="postgres",
        password="11223344",
    )
    .mode("overwrite")
    .save()
)

# 11. Update status in employee_time table for employees with more than 9 strikes
import psycopg2
from psycopg2 import sql

# Database connection details
# NOTE(review): credentials are hard-coded; consider loading them from the
# environment or a secrets store.
db_params = {
    "host": "localhost",
    "port": "5432",
    "database": "employee_management_system",
    "user": "postgres",
    "password": "11223344"
}

# Step 1: Identify employees with more than 9 strikes. This runs before the
# connection is opened so a Spark failure cannot leave a connection dangling.
inactive_rows = (
    combined_df
    .filter(col("updated_strike_count") > 9)
    .select("employee_id")
    .collect()
)
inactive_employee_ids = [row["employee_id"] for row in inactive_rows]

print(inactive_employee_ids)

# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(**db_params)
cursor = conn.cursor()
try:
    if inactive_employee_ids:
        # Step 2: Update status for these employees to "INACTIVE".
        # end_date is bound as a query parameter: writing the Python variable
        # name inside the SQL text makes PostgreSQL look for a column called
        # today_date_str.
        update_query = sql.SQL("""
            UPDATE employee_time
            SET status = 'INACTIVE', end_date = %s
            WHERE emp_id = %s
        """)

        # Step 3: Execute the update query for each employee_id
        for emp_id in inactive_employee_ids:
            cursor.execute(update_query, (today_date_str, emp_id))
            print(update_query, (today_date_str, emp_id))

        # Commit the transaction to apply the changes
        conn.commit()
except Exception:
    # Do not leave a half-applied batch of updates behind.
    conn.rollback()
    raise
finally:
    # Always release the cursor and connection, even when an update fails.
    cursor.close()
    conn.close()

print(f"Status updated for employees with more than 9 strikes.")


print("✅ Employees with more than 9 strikes have been marked as INACTIVE in employee_time.")


In [None]:
-- Table holding the flagged messages produced by the streaming cell.
-- The batch cells in this notebook read sender, strike_flag and timestamp
-- from it and sum strike_flag per sender.
CREATE TABLE flagged_message_history (
    -- Auto-generated surrogate key
    id SERIAL PRIMARY KEY,
    sender VARCHAR,
    receiver VARCHAR,
    message_content TEXT,
    -- Integer strike value for the message; aggregated with SUM downstream
    strike_flag INT,
    timestamp TIMESTAMP

);

