**WORKING WITH CSV DATA**

In [0]:
# 1. Load the CSV Data
# Load the CSV data into a DataFrame
df_employee = spark.read.csv("file:/Workspace/Users/azuser2115_mml.local@techademy.com/employee_data.txt", header=True)

# Display the first 10 rows and inspect the schema
df_employee.show(10)
df_employee.printSchema()


+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoiningDate: string (nullable = true)
 |-- Salary: string (nullable = true)



In [0]:
# 2. Data Cleaning
# Remove rows where Salary is less than 55,000
from pyspark.sql.functions import year, col
df_cleaned_salary = df_employee.filter(col('Salary') >= 55000)

# Filter employees who joined after the year 2020
df_cleaned_joining = df_cleaned_salary.filter(year(col('JoiningDate')) > 2020)

# Show the cleaned DataFrame
df_cleaned_joining.show()


+----------+------------+----------+-----------+------+
|EmployeeID|        Name|Department|JoiningDate|Salary|
+----------+------------+----------+-----------+------+
|      1001|    John Doe|        HR| 2021-01-15| 55000|
|      1005|David Wilson|        IT| 2021-06-25| 58000|
+----------+------------+----------+-----------+------+



In [0]:
#3. Data Aggregation
# changing salary column into int
df_employee = df_employee.withColumn("Salary", col("Salary").cast("int"))

# Find the average salary by Department
df_avg_salary = df_employee.groupBy("Department").avg("Salary")
df_avg_salary.show()

# Count the number of employees in each Department
df_employee_count = df_employee.groupBy("Department").count()
df_employee_count.show()


+----------+------------------+
|Department|       avg(Salary)|
+----------+------------------+
|        HR|           54000.0|
|   Finance|           68500.0|
|        IT|61666.666666666664|
+----------+------------------+

+----------+-----+
|Department|count|
+----------+-----+
|        HR|    3|
|   Finance|    2|
|        IT|    3|
+----------+-----+



In [0]:
# 4. Write the Data to CSV
# Save the cleaned data to a new CSV file
df_cleaned_joining.write.csv("/Workspace/Users/azuser2115_mml.local@techademy.com/employee_data.txt", header=True)


WORKING WITH JSON DATA

In [0]:
# load data
data = "file:/Workspace/Users/azuser2115_mml.local@techademy.com/product_data.json"

from pyspark.sql.types import StructType,StructField,StringType,IntegerType
schema = StructType([
    StructField("ProductID", IntegerType(), True),
    StructField("ProductName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Price", IntegerType(), True),
    StructField("Stock", IntegerType(), True)
])
df_product = spark.read.schema(schema).json(data)

# Display the first 10 rows and inspect the schema
df_product.show(10)
df_product.printSchema()


+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      101|     Laptop|Electronics| 1200|   35|
|      102| Smartphone|Electronics|  800|   80|
|      103| Desk Chair|  Furniture|  150|   60|
|      104|    Monitor|Electronics|  300|   45|
|      105|       Desk|  Furniture|  350|   25|
+---------+-----------+-----------+-----+-----+

root
 |-- ProductID: integer (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Stock: integer (nullable = true)



In [0]:
# 2. Data Cleaning
# Remove rows where Stock is less than 30
df_cleaned_stock = df_product.filter(col('Stock') >= 30)

# Filter the products that belong to the "Electronics" category
df_electronics = df_cleaned_stock.filter(col('Category') == "Electronics")

# Show the cleaned DataFrame
df_electronics.show()


+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      101|     Laptop|Electronics| 1200|   35|
|      102| Smartphone|Electronics|  800|   80|
|      104|    Monitor|Electronics|  300|   45|
+---------+-----------+-----------+-----+-----+



In [0]:
# 3. Data Aggregation
# Calculate the total stock for products in the "Furniture" category
total_stock_furniture = df_product.filter(col('Category') == "Furniture").agg({"Stock": "sum"})
total_stock_furniture.show()

# Find the average price of all products
avg_price = df_product.agg({"Price": "avg"})
avg_price.show()


+----------+
|sum(Stock)|
+----------+
|        85|
+----------+

+----------+
|avg(Price)|
+----------+
|     560.0|
+----------+



In [0]:
# 4. Write the Data to JSON
# Save the cleaned and aggregated data to a new JSON file
df_electronics.write.json("file:/Workspace/Users/azuser2115_mml.local@techademy.com/data.txt",mode="overwrite")


**WORKING WITH DELTA TABLES**

In [0]:
# 1. Convert CSV and JSON Data to Delta Format
# Convert the employee_data CSV to Delta format
df_employee.write.format("delta").mode("overwrite").save("/Workspace/Users/azuser2115_mml.local@techademy.com/delta_employee_data")

# Convert the product_data JSON to Delta format
df_product.write.format("delta").mode("overwrite").save("/Workspace/Users/azuser2115_mml.local@techademy.com/delta_product_data")


In [0]:
# 2. Register Delta Tables

delta_employee = spark.read.format("delta").load("/Workspace/Users/azuser2115_mml.local@techademy.com/delta_employee_data")
delta_product = spark.read.format("delta").load("/Workspace/Users/azuser2115_mml.local@techademy.com/delta_product_data")

# register delta tables as sql tables
delta_employee.write.saveAsTable("employee_delta_table")
delta_product.write.saveAsTable("product_delta_table")

In [0]:
delta_employee.show()
delta_product.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      101|     Laptop|Electronics| 1200|   35|
|      102| Smartphone|Electronics|  800|   80|
|      103| Desk Chair|  Furniture|  150|   60|
|      104|    Monitor|Elec

In [0]:
# 3. Data Modifications with Delta Tables
# Update operation: Increase the salary by 5% for all employees in the IT department
spark.sql("""
    UPDATE employee_delta_table
    SET Salary = Salary * 1.05
    WHERE Department = 'IT'
""")

# Delete operation: Delete products where the stock is less than 40
spark.sql("""
    DELETE FROM product_delta_table
    WHERE Stock < 40
""")
spark.sql("select * from employee_delta_table").show()
spark.sql("select * from product_delta_table").show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1002|   Jane Smith|        IT| 2020-03-10| 68355|
|      1005| David Wilson|        IT| 2021-06-25| 63945|
|      1007| James Miller|        IT| 2019-08-14| 71662|
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      102| Smartphone|Electronics|  800|   80|
|      103| Desk Chair|  Furniture|  150|   60|
|      104|    Monitor|Electronics|  300|   45|
+---------+-----------+----

In [0]:
# Query the employee Delta table to find employees in the Finance department
df_finance_employees = spark.sql("SELECT * FROM employee_delta_table WHERE Department = 'Finance'")
df_finance_employees.show()

# Query the product Delta table to find all products in the Electronics category with a price greater than 500
df_electronics_expensive = spark.sql("SELECT * FROM product_delta_table WHERE Category = 'Electronics' AND Price > 500")
df_electronics_expensive.show()


+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
+----------+-------------+----------+-----------+------+

+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      102| Smartphone|Electronics|  800|   80|
+---------+-----------+-----------+-----+-----+

