In [0]:
#Dataset is downloaded from kaggle:
#(https://www.kaggle.com/datasets/mohithsairamreddy/salary-data)

In [0]:
### Extract
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("ETL Pipeline").getOrCreate()
df = spark.read.table("salary_data_5_csv")

In [0]:
df.show()

+---+------+---------------+--------------------+-------------------+------+
|Age|Gender|Education Level|           Job Title|Years of Experience|Salary|
+---+------+---------------+--------------------+-------------------+------+
| 32|  Male|     Bachelor's|   Software Engineer|                  5| 90000|
| 28|Female|       Master's|        Data Analyst|                  3| 65000|
| 45|  Male|            PhD|      Senior Manager|                 15|150000|
| 36|Female|     Bachelor's|     Sales Associate|                  7| 60000|
| 52|  Male|       Master's|            Director|                 20|200000|
| 29|  Male|     Bachelor's|   Marketing Analyst|                  2| 55000|
| 42|Female|       Master's|     Product Manager|                 12|120000|
| 31|  Male|     Bachelor's|       Sales Manager|                  4| 80000|
| 26|Female|     Bachelor's|Marketing Coordin...|                  1| 45000|
| 38|  Male|            PhD|    Senior Scientist|                 10|110000|

In [0]:
### Transform

In [0]:
df.describe().show()

+-------+-----------------+------+---------------+---------------+-------------------+------------------+
|summary|              Age|Gender|Education Level|      Job Title|Years of Experience|            Salary|
+-------+-----------------+------+---------------+---------------+-------------------+------------------+
|  count|             6702|  6702|           6701|           6702|               6701|              6699|
|   mean|33.62085944494181|  null|           null|           null|  8.094687360095508|115326.96477086132|
| stddev|7.614632626251299|  null|           null|           null|  6.059003056634108| 52786.18391068295|
|    min|               21|Female|     Bachelor's|Account Manager|                  0|            100000|
|    max|               62| Other|            phD|  Web Developer|                  9|             99747|
+-------+-----------------+------+---------------+---------------+-------------------+------------------+



In [0]:
from pyspark.sql.functions import col, count, when
from pyspark.sql.functions import col
from pyspark.sql.functions import when


In [0]:

# count na value of each column
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+------+---------------+---------+-------------------+------+
|Age|Gender|Education Level|Job Title|Years of Experience|Salary|
+---+------+---------------+---------+-------------------+------+
|  2|     2|              3|        2|                  3|     5|
+---+------+---------------+---------+-------------------+------+



In [0]:
# drop all rows that have na value
df = df.dropna()

In [0]:
#check again

df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+------+---------------+---------+-------------------+------+
|Age|Gender|Education Level|Job Title|Years of Experience|Salary|
+---+------+---------------+---------+-------------------+------+
|  0|     0|              0|        0|                  0|     0|
+---+------+---------------+---------+-------------------+------+



In [0]:
# drop duplicate row

df = df.dropDuplicates()

In [0]:
#Check value of non-numeric column

df.groupBy('Education Level').count().orderBy(col('count').desc()).show()

+-----------------+-----+
|  Education Level|count|
+-----------------+-----+
|Bachelor's Degree|  506|
|  Master's Degree|  446|
|              PhD|  340|
|       Bachelor's|  262|
|         Master's|  122|
|      High School|  110|
|              phD|    1|
+-----------------+-----+



In [0]:
#There are some variables repeated again we the same values in a differernt spelling, now we will replace it with the matching values.
df = df.withColumn('Education Level', 
                             when(col('Education Level') == "Bachelor's Degree", "Bachelor")
                             .when(col('Education Level') == "Bachelor's", "Bachelor")
                             .when(col('Education Level') == "Master's Degree", "Master")
                             .when(col('Education Level') == "Master's", "Master")
                             .when(col('Education Level') == "phD", "PhD")
                             .otherwise(col('Education Level')))


In [0]:
df.groupBy('Education Level').count().orderBy(col('count').desc()).show()

+---------------+-----+
|Education Level|count|
+---------------+-----+
|       Bachelor|  768|
|         Master|  568|
|            PhD|  341|
|    High School|  110|
+---------------+-----+



In [0]:
df.groupBy('Gender').count().orderBy(col('count').desc()).show()

+------+-----+
|Gender|count|
+------+-----+
|  Male|  966|
|Female|  814|
| Other|    7|
+------+-----+



In [0]:
df.groupBy('Job Title').count().orderBy(col('count').desc()).show()

+--------------------+-----+
|           Job Title|count|
+--------------------+-----+
|Software Engineer...|  127|
| Full Stack Engineer|  122|
|Senior Software E...|   96|
|Senior Project En...|   95|
|  Back end Developer|   81|
|      Data Scientist|   80|
|   Software Engineer|   78|
| Front end Developer|   71|
|   Marketing Manager|   55|
|     Product Manager|   53|
|        Data Analyst|   51|
|       Web Developer|   34|
|   Financial Manager|   28|
|      Director of HR|   27|
|Director of Marke...|   27|
|Marketing Coordin...|   26|
|Junior Sales Asso...|   25|
|Content Marketing...|   24|
|  Software Developer|   22|
|  Operations Manager|   22|
+--------------------+-----+
only showing top 20 rows



In [0]:
df.describe().show()

+-------+------------------+------+---------------+---------------+-------------------+------------------+
|summary|               Age|Gender|Education Level|      Job Title|Years of Experience|            Salary|
+-------+------------------+------+---------------+---------------+-------------------+------------------+
|  count|              1787|  1787|           1787|           1787|               1787|              1787|
|   mean|35.139899272523785|  null|           null|           null|  9.156127588136542|113184.65976496923|
| stddev| 8.213044590504502|  null|           null|           null|  6.844924407705756| 51596.53676609346|
|    min|                21|Female|       Bachelor|Account Manager|                  0|            100000|
|    max|                62| Other|            PhD|  Web Developer|                  9|             99747|
+-------+------------------+------+---------------+---------------+-------------------+------------------+



In [0]:
### Load
import datetime

driver = "org.postgresql.Driver"
url = "jdbc:postgresql://dpg-cgcb7364dad7accgg5bg-a.frankfurt-postgres.render.com/kietitmo_db"
table = "practika"
user = "kietitmo_db_user"
password = "Ph1vwQz38o0LsO3yIRIIjdXzsHdsYodF"

# Get the current date
current_date = datetime.datetime.now().strftime("%Y%m%d")

# Construct the table name with the current date
table_name = f"practika_{current_date}"

# Write the DataFrame to the dynamically named table
df.write.format("jdbc") \
  .option("driver", driver) \
  .option("url", url) \
  .option("dbtable", table_name) \
  .mode("append") \
  .option("user", user) \
  .option("password", password) \
  .save()