ETL with Spark (Local)

In [28]:
from pyspark.sql import SparkSession
# from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, TimestampType

# import pyspark.sql.functions as F

In [29]:
import pandas as pd
import glob

In [30]:
# List the CSV files under the local `data` directory.
# glob.glob() gives no ordering guarantee (it depends on the filesystem), so
# the matches are sorted to make the listing reproducible across runs/machines.
p = sorted(glob.glob("data/*.csv"))

In [31]:
# Display the list of CSV paths matched by the glob in the previous cell.
p

['data/HRDataset_v14.csv']

In [32]:
# File name of the HR dataset (matches the file listed by the glob above).
# NOTE(review): this string is not read by any visible later cell, and the
# name `data` is rebound to a Spark DataFrame when the CSV folder is loaded —
# consider removing this cell or renaming the variable.
data = "HRDataset_v14.csv"

In [33]:
# Obtain the Spark session used by every later cell: reuse an existing session
# if one is active, otherwise create one with the application name "ETL".
spark = (
    SparkSession.builder
    .appName("ETL")
    .getOrCreate()
)

In [34]:
# Directory that holds the input CSV file(s); passed to the Spark CSV reader
# in the next cell.
data_folder = "data"

In [49]:
# Load every CSV under `data_folder` into a DataFrame.
#   header    -> take column names from the first line of the file
#   multiline -> allow a single record to span several physical lines
# No schema is supplied or inferred here.
data = (
    spark.read
    .options(header="true", multiline="true")
    .csv(data_folder)
)

In [50]:
# Print a text-table preview of the loaded DataFrame (all columns).
data.show()

+--------------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+-----+----------+--------------------+-----+-----+--------+---+-----------+-------------------+--------------+--------------------+----------+-----------------+-----------------+--------------------+--------------------+---------------+---------+--------------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+
|       Employee_Name|EmpID|MarriedID|MaritalStatusID|GenderID|EmpStatusID|DeptID|PerfScoreID|FromDiversityJobFairID|Salary|Termd|PositionID|            Position|State|  Zip|     DOB|Sex|MaritalDesc|        CitizenDesc|HispanicLatino|            RaceDesc|DateofHire|DateofTermination|       TermReason|    EmploymentStatus|          Department|    ManagerName|ManagerID|   RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Da

In [51]:
# Print the column names and types of the loaded DataFrame.
# In the recorded output every visible column is typed `string`.
data.printSchema()

root
 |-- Employee_Name: string (nullable = true)
 |-- EmpID: string (nullable = true)
 |-- MarriedID: string (nullable = true)
 |-- MaritalStatusID: string (nullable = true)
 |-- GenderID: string (nullable = true)
 |-- EmpStatusID: string (nullable = true)
 |-- DeptID: string (nullable = true)
 |-- PerfScoreID: string (nullable = true)
 |-- FromDiversityJobFairID: string (nullable = true)
 |-- Salary: string (nullable = true)
 |-- Termd: string (nullable = true)
 |-- PositionID: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- MaritalDesc: string (nullable = true)
 |-- CitizenDesc: string (nullable = true)
 |-- HispanicLatino: string (nullable = true)
 |-- RaceDesc: string (nullable = true)
 |-- DateofHire: string (nullable = true)
 |-- DateofTermination: string (nullable = true)
 |-- TermReason: string (nullable = true)
 |-

In [56]:
# Quick look at just the employee identifier and name columns.
preview = data.select("EmpID", "Employee_Name")
preview.show()

+-----+--------------------+
|EmpID|       Employee_Name|
+-----+--------------------+
|10026| Adinolfi, Wilson  K|
|10084|Ait Sidi, Karthik...|
|10196|   Akinkuolie, Sarah|
|10088|        Alagbe,Trina|
|10069|    Anderson, Carol |
|10002|   Anderson, Linda  |
|10194|     Andreola, Colby|
|10062|         Athwal, Sam|
|10114|    Bachiochi, Linda|
|10250|  Bacong, Alejandro |
|10252|Baczenski, Rachael  |
|10242|     Barbara, Thomas|
|10012|    Barbossa, Hector|
|10265|Barone, Francesco  A|
|10066|       Barton, Nader|
|10061|       Bates, Norman|
|10023|    Beak, Kimberly  |
|10055| Beatrice, Courtney |
|10245|       Becker, Renee|
|10277|       Becker, Scott|
+-----+--------------------+
only showing top 20 rows



In [53]:
# Register the loaded DataFrame as the temporary view `staging_events` so the
# following cells can query it with spark.sql(); re-running replaces the view.
data.createOrReplaceTempView("staging_events")

In [55]:
# Preview every column of the `staging_events` temp view.
# DataFrame.show() prints the rows and returns None, so the query result is
# bound to `table` first and displayed in a separate step; chaining `.show()`
# onto the assignment would leave `table` set to None instead of a DataFrame.
table = spark.sql("""
    select
        *        
    from
        staging_events
""")
table.show()

+--------------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+-----+----------+--------------------+-----+-----+--------+---+-----------+-------------------+--------------+--------------------+----------+-----------------+-----------------+--------------------+--------------------+---------------+---------+--------------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+
|       Employee_Name|EmpID|MarriedID|MaritalStatusID|GenderID|EmpStatusID|DeptID|PerfScoreID|FromDiversityJobFairID|Salary|Termd|PositionID|            Position|State|  Zip|     DOB|Sex|MaritalDesc|        CitizenDesc|HispanicLatino|            RaceDesc|DateofHire|DateofTermination|       TermReason|    EmploymentStatus|          Department|    ManagerName|ManagerID|   RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Da

In [83]:
# Employee-details extract: identifier, name, pay, role, demographic,
# employment, recruitment, performance and absence columns from the
# `staging_events` view. The SQL text is kept separate from the call so the
# projection can be read (or printed) on its own.
employee_details_query = """
    select
       EmpID
        , Employee_Name
        , Salary
        , Position
        , Sex
        , MaritalDesc
        , EmploymentStatus
        , Department
        , RecruitmentSource
        , PerformanceScore
        , Absences
        
    from
        staging_events
"""
table = spark.sql(employee_details_query)



In [84]:
# Preview the employee-details projection selected in the previous cell.
table.show()

+-----+--------------------+------+--------------------+---+-----------+--------------------+--------------------+--------------------+----------------+--------+
|EmpID|       Employee_Name|Salary|            Position|Sex|MaritalDesc|    EmploymentStatus|          Department|   RecruitmentSource|PerformanceScore|Absences|
+-----+--------------------+------+--------------------+---+-----------+--------------------+--------------------+--------------------+----------------+--------+
|10026| Adinolfi, Wilson  K| 62506|Production Techni...| M |     Single|              Active|   Production       |            LinkedIn|         Exceeds|       1|
|10084|Ait Sidi, Karthik...|104437|             Sr. DBA| M |    Married|Voluntarily Termi...|               IT/IS|              Indeed|     Fully Meets|      17|
|10196|   Akinkuolie, Sarah| 64955|Production Techni...|  F|    Married|Voluntarily Termi...|   Production       |            LinkedIn|     Fully Meets|       3|
|10088|        Alagbe,Trina|

In [85]:
# Output location for the employee-details extract (relative path starting
# with `../`); consumed by the CSV write in the next cell.
destination = "../employee_details"

In [86]:
# Persist the current `table` as CSV at `destination`, emitting a header row
# and replacing whatever output already exists at that location.
table.write.csv(destination, mode="overwrite", header="true")

In [92]:
# Salary extract: employee identifier and name with salary and department,
# read from the `staging_events` view.
salary_query = """
    select
        EmpID
        , Employee_Name
        , Salary
        , Department
    from
        staging_events
"""
table = spark.sql(salary_query)


In [93]:
# Preview the salary projection selected in the previous cell.
table.show()

+-----+--------------------+------+--------------------+
|EmpID|       Employee_Name|Salary|          Department|
+-----+--------------------+------+--------------------+
|10026| Adinolfi, Wilson  K| 62506|   Production       |
|10084|Ait Sidi, Karthik...|104437|               IT/IS|
|10196|   Akinkuolie, Sarah| 64955|   Production       |
|10088|        Alagbe,Trina| 64991|   Production       |
|10069|    Anderson, Carol | 50825|   Production       |
|10002|   Anderson, Linda  | 57568|   Production       |
|10194|     Andreola, Colby| 95660|Software Engineering|
|10062|         Athwal, Sam| 59365|   Production       |
|10114|    Bachiochi, Linda| 47837|   Production       |
|10250|  Bacong, Alejandro | 50178|               IT/IS|
|10252|Baczenski, Rachael  | 54670|   Production       |
|10242|     Barbara, Thomas| 47211|   Production       |
|10012|    Barbossa, Hector| 92328|               IT/IS|
|10265|Barone, Francesco  A| 58709|   Production       |
|10066|       Barton, Nader| 52

In [94]:
# Output location for the salary extract (relative path starting with `../`);
# consumed by the CSV write in the next cell.
destination = "../salary"

In [95]:
# Persist the current `table` as CSV at `destination`, emitting a header row
# and replacing whatever output already exists at that location.
table.write.csv(destination, mode="overwrite", header="true")

In [96]:
# Sex extract: employee identifier and name with sex and department,
# read from the `staging_events` view.
sex_query = """
    select
        EmpID
        , Employee_Name
        , Sex
        , Department
    from
        staging_events
"""
table = spark.sql(sex_query)


In [97]:
# Preview the sex/department projection selected in the previous cell.
table.show()

+-----+--------------------+---+--------------------+
|EmpID|       Employee_Name|Sex|          Department|
+-----+--------------------+---+--------------------+
|10026| Adinolfi, Wilson  K| M |   Production       |
|10084|Ait Sidi, Karthik...| M |               IT/IS|
|10196|   Akinkuolie, Sarah|  F|   Production       |
|10088|        Alagbe,Trina|  F|   Production       |
|10069|    Anderson, Carol |  F|   Production       |
|10002|   Anderson, Linda  |  F|   Production       |
|10194|     Andreola, Colby|  F|Software Engineering|
|10062|         Athwal, Sam| M |   Production       |
|10114|    Bachiochi, Linda|  F|   Production       |
|10250|  Bacong, Alejandro | M |               IT/IS|
|10252|Baczenski, Rachael  |  F|   Production       |
|10242|     Barbara, Thomas| M |   Production       |
|10012|    Barbossa, Hector| M |               IT/IS|
|10265|Barone, Francesco  A| M |   Production       |
|10066|       Barton, Nader| M |   Production       |
|10061|       Bates, Norman|

In [98]:
# Output location for the sex extract (relative path starting with `../`);
# consumed by the CSV write in the next cell.
destination = "../sex"

In [99]:
# Persist the current `table` as CSV at `destination`, emitting a header row
# and replacing whatever output already exists at that location.
table.write.csv(destination, mode="overwrite", header="true")

In [100]:
# Recruitment extract: employee identifier and name with the recruitment
# source, read from the `staging_events` view and written out in one cell.
recruitment_query = """
    select
        EmpID
        , Employee_Name
        , RecruitmentSource

    from
        staging_events
"""
table = spark.sql(recruitment_query)

# Write the extract as CSV with a header row, replacing any existing output.
destination = "../recruitment"
table.write.csv(destination, mode="overwrite", header="true")

In [101]:
# Preview the recruitment projection that the previous cell wrote to CSV.
table.show()

+-----+--------------------+--------------------+
|EmpID|       Employee_Name|   RecruitmentSource|
+-----+--------------------+--------------------+
|10026| Adinolfi, Wilson  K|            LinkedIn|
|10084|Ait Sidi, Karthik...|              Indeed|
|10196|   Akinkuolie, Sarah|            LinkedIn|
|10088|        Alagbe,Trina|              Indeed|
|10069|    Anderson, Carol |       Google Search|
|10002|   Anderson, Linda  |            LinkedIn|
|10194|     Andreola, Colby|            LinkedIn|
|10062|         Athwal, Sam|   Employee Referral|
|10114|    Bachiochi, Linda|  Diversity Job Fair|
|10250|  Bacong, Alejandro |              Indeed|
|10252|Baczenski, Rachael  |  Diversity Job Fair|
|10242|     Barbara, Thomas|  Diversity Job Fair|
|10012|    Barbossa, Hector|  Diversity Job Fair|
|10265|Barone, Francesco  A|       Google Search|
|10066|       Barton, Nader|On-line Web appli...|
|10061|       Bates, Norman|       Google Search|
|10023|    Beak, Kimberly  |   Employee Referral|
