In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=38c7c58d632a07825f810d5dd4e914867bbacf3a4916d4ea5092f004f7881162
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


# **ETL EXERCISE**

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round, mean

# Initialize Spark session
spark = SparkSession.builder \
    .appName("EmployeeSalaryETL") \
    .getOrCreate()

# Load the employee data from a CSV file
input_path = "/content/sample_data/people.txt"
df = spark.read.csv(input_path, header=True, inferSchema=True)

# Filter employees aged 30 and above
filtered_df = df.filter(col("age") >= 30)

# Calculate the salary with a 10% bonus
transformed_df = filtered_df.withColumn(
    "salary_with_bonus", round(col("salary") * 1.10, 2)
)

# Show the transformed dataframe
transformed_df.show()

# Calculate average salary by gender
average_salary_by_gender = transformed_df.groupBy("gender").agg(
    round(mean("salary"), 2).alias("average_salary")
)

# Show average salary by gender
average_salary_by_gender.show()

# Define output path for the Parquet file
output_path = "path/to/transformed_employee_data.parquet"

# Save the transformed DataFrame to a Parquet file
transformed_df.write.parquet(output_path, mode="overwrite")

# Convert average salary by gender to Pandas DataFrame for reporting
summary_report = average_salary_by_gender.toPandas()

# Display summary report
print("Average Salary by Gender:")
print(summary_report)

# save the summary report to a CSV file
summary_report.to_csv("path/to/average_salary_by_gender.csv", index=False)


+-----+---+------+--------+-----------------+
| name|age|gender|  salary|salary_with_bonus|
+-----+---+------+--------+-----------------+
| Jane| 32| Femal|72000   |          79200.0|
| Mike| 45|  Male|84000   |          92400.0|
| Alex| 36|  Male|  67000 |          73700.0|
+-----+---+------+--------+-----------------+

+------+--------------+
|gender|average_salary|
+------+--------------+
| Femal|       72000.0|
|  Male|       75500.0|
+------+--------------+

Average Salary by Gender:
  gender  average_salary
0  Femal         72000.0
1   Male         75500.0
