# Read Data

In [0]:
path = 'dbfs:/FileStore/employee_data.csv'

In [0]:
df = spark.read.format('csv').option('header','true').option('inferSchema','true').load(path)

In [0]:
display(df.limit(5))

10,295847284,adventure-works\ken0,NULL3,NULL4,Chief Executive Officer,1969-01-29,S,M,2009-01-14,110,99,69,113,F01251E5-96A3-448D-981E-0F99D789110D,2014-06-30 00:00:00.000
2,245797967,adventure-works\terri0,0x58,1,Vice President of Engineering,1971-08-01,S,F,2008-01-31,1,1,20,1,45E8F437-670D-4409-93CB-F9424A40D6EE,2014-06-30T00:00:00.000+0000
3,509647174,adventure-works\roberto0,0x5AC0,2,Engineering Manager,1974-11-12,M,M,2007-11-11,1,2,21,1,9BBBFB2C-EFBB-4217-9AB7-F97689328841,2014-06-30T00:00:00.000+0000
4,112457891,adventure-works\rob0,0x5AD6,3,Senior Tool Designer,1974-12-23,S,M,2007-12-05,0,48,80,1,59747955-87B8-443F-8ED4-F8AD3AFDF3A9,2014-06-30T00:00:00.000+0000
5,695256908,adventure-works\gail0,0x5ADA,3,Design Engineer,1952-09-27,M,F,2008-01-06,1,5,22,1,EC84AE09-F9B8-4A15-B4A9-6CCBAB919B08,2014-06-30T00:00:00.000+0000
6,998320692,adventure-works\jossef0,0x5ADE,3,Design Engineer,1959-03-11,M,M,2008-01-24,1,6,23,1,E39056F1-9CD5-478D-8945-14ACA7FBDCDD,2014-06-30T00:00:00.000+0000


# Read Data by adding Headers

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:

columns=StructType((
                StructField('BusinessEntityID',IntegerType(),True),
                StructField('NationalIDNumber',IntegerType(),True),
                StructField('LoginID',StringType(),True),
                StructField('OrganizationNode',StringType(),True),
                StructField('OrganizationLevel',IntegerType(),True),
                StructField('JobTitle',StringType(),True),
                StructField('BirthDate',DateType(),True),
                StructField('MaritalStatus',StringType(),True),
                StructField('Gender',StringType(),True),
                StructField('HireDate',TimestampType(),True),
                StructField('SalariedFlag',IntegerType(),True),
                StructField('VacationHours',IntegerType(),True),
                StructField('SickLeaveHours',IntegerType(),True),
                StructField('CurrentFlag',IntegerType(),True),
                StructField('rowguid',StringType(),True),
                StructField('ModifiedDate',TimestampType(),True)
            ))


In [0]:
df1 = spark.read.csv(path,schema=columns)
display(df1.limit(5))

BusinessEntityID,NationalIDNumber,LoginID,OrganizationNode,OrganizationLevel,JobTitle,BirthDate,MaritalStatus,Gender,HireDate,SalariedFlag,VacationHours,SickLeaveHours,CurrentFlag,rowguid,ModifiedDate
1,295847284,adventure-works\ken0,,,Chief Executive Officer,1969-01-29,S,M,2009-01-14T00:00:00.000+0000,1,99,69,1,F01251E5-96A3-448D-981E-0F99D789110D,2014-06-30T00:00:00.000+0000
2,245797967,adventure-works\terri0,0x58,1.0,Vice President of Engineering,1971-08-01,S,F,2008-01-31T00:00:00.000+0000,1,1,20,1,45E8F437-670D-4409-93CB-F9424A40D6EE,2014-06-30T00:00:00.000+0000
3,509647174,adventure-works\roberto0,0x5AC0,2.0,Engineering Manager,1974-11-12,M,M,2007-11-11T00:00:00.000+0000,1,2,21,1,9BBBFB2C-EFBB-4217-9AB7-F97689328841,2014-06-30T00:00:00.000+0000
4,112457891,adventure-works\rob0,0x5AD6,3.0,Senior Tool Designer,1974-12-23,S,M,2007-12-05T00:00:00.000+0000,0,48,80,1,59747955-87B8-443F-8ED4-F8AD3AFDF3A9,2014-06-30T00:00:00.000+0000
5,695256908,adventure-works\gail0,0x5ADA,3.0,Design Engineer,1952-09-27,M,F,2008-01-06T00:00:00.000+0000,1,5,22,1,EC84AE09-F9B8-4A15-B4A9-6CCBAB919B08,2014-06-30T00:00:00.000+0000


In [0]:
df1.printSchema()

root
 |-- BusinessEntityID: integer (nullable = true)
 |-- NationalIDNumber: integer (nullable = true)
 |-- LoginID: string (nullable = true)
 |-- OrganizationNode: string (nullable = true)
 |-- OrganizationLevel: integer (nullable = true)
 |-- JobTitle: string (nullable = true)
 |-- BirthDate: date (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HireDate: timestamp (nullable = true)
 |-- SalariedFlag: integer (nullable = true)
 |-- VacationHours: integer (nullable = true)
 |-- SickLeaveHours: integer (nullable = true)
 |-- CurrentFlag: integer (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Save Dataframe as a Table and use SQL to query the data

### How many male and female employees under each Job

In [0]:
df1.createTempView('emp')

In [0]:
%sql
select JobTitle, count(BusinessEntityID) as Total_Employees,
count(case Gender When 'M' then 1 end) as Males,
count(case Gender When 'F' then 1 end) as Females
from emp
group by JobTitle 

JobTitle,Total_Employees,Males,Females
Research and Development Manager,2,2,0
Finance Manager,1,0,1
Document Control Manager,1,1,0
Production Supervisor - WC45,3,2,1
Senior Design Engineer,1,1,0
Production Control Manager,1,1,0
Vice President of Production,1,1,0
Production Supervisor - WC30,3,2,1
Marketing Manager,1,1,0
Quality Assurance Manager,1,1,0


In [0]:
gender_dist = df1.groupBy('JobTitle').agg(
    count('BusinessEntityID').alias('Total_Employees'),
    count(when(col('Gender') == 'M', 1)).alias('Male_employees'),
    count(when(col('Gender') == 'F', 1)).alias('Female_employees')
)
display(gender_dist.limit(5))

JobTitle,Total_Employees,Male_employees,Female_employees
Research and Development Manager,2,2,0
Finance Manager,1,0,1
Document Control Manager,1,1,0
Production Supervisor - WC45,3,2,1
Senior Design Engineer,1,1,0


### How many employees were hired in each year ?

In [0]:
%sql
select year(HireDate) as YOJ, count(BusinessEntityID) as total_employees from emp
group by year(HireDate)
order by total_employees DESC

YOJ,total_employees
2009,148
2008,74
2010,38
2011,16
2007,6
2012,4
2013,3
2006,1


In [0]:
Yearly_Hiring_Dist = df1.withColumn('YOJ', year('HireDate'))\
                        .groupBy('YOJ')\
                        .agg(count('BusinessEntityID').alias('Total_Employees'))\
                        .orderBy('Total_Employees',ascending=False)

display(Yearly_Hiring_Dist)

YOJ,Total_Employees
2009,148
2008,74
2010,38
2011,16
2007,6
2012,4
2013,3
2006,1


Databricks visualization. Run in Databricks to view.

### Display top 5 employees with High SickLeaveHours

In [0]:
%sql
select * from
(select BusinessEntityID, SickLeaveHours, dense_rank() over(order by (SickLeaveHours) desc) as rn from emp) x
where x.rn <= 5

BusinessEntityID,SickLeaveHours,rn
4,80,1
1,69,2
88,69,2
92,69,2
116,69,2
117,69,2
124,69,2
89,68,3
90,68,3
113,68,3


In [0]:
wind_fn = Window.orderBy(col('SickLeaveHours').desc())
max_SLH = df1.select('BusinessEntityID', 'SickLeaveHours', dense_rank().over(wind_fn).alias('rnk'))
max_SLH.display()

BusinessEntityID,SickLeaveHours,rnk
4,80,1
1,69,2
88,69,2
92,69,2
116,69,2
117,69,2
124,69,2
89,68,3
90,68,3
113,68,3


In [0]:
%sql
select SickLeaveHours,count(BusinessEntityID) as total_employees from emp
group by SickLeaveHours
having total_employees = 5

SickLeaveHours,total_employees
34,5
27,5
35,5
25,5
32,5
36,5


### Filter Job Titles with some pattern

In [0]:
display(spark.sql("select JobTitle from emp").limit(5))

JobTitle
Chief Executive Officer
Vice President of Engineering
Engineering Manager
Senior Tool Designer
Design Engineer


### Display Maximum Vacation Hours

In [0]:
%sql
select JobTitle,year(HireDate) as YOJ,VacationHours from emp
where vacationHours > 90

JobTitle,YOJ,VacationHours
Chief Executive Officer,2009,99
Production Technician - WC10,2009,99
Production Technician - WC10,2010,96
Production Technician - WC10,2010,97
Production Technician - WC10,2010,95
Production Technician - WC10,2010,98
Production Technician - WC10,2009,93
Production Technician - WC10,2009,94
Production Technician - WC10,2010,92
Production Technician - WC10,2010,91


In [0]:
Max_Vac_hrs = df1.select('JobTitle',year('HireDate').alias('YOJ'),'vacationHours')\
    .filter( col('VacationHourS') > 90)
Max_Vac_hrs.display()

JobTitle,YOJ,vacationHours
Chief Executive Officer,2009,99
Production Technician - WC10,2009,99
Production Technician - WC10,2010,96
Production Technician - WC10,2010,97
Production Technician - WC10,2010,95
Production Technician - WC10,2010,98
Production Technician - WC10,2009,93
Production Technician - WC10,2009,94
Production Technician - WC10,2010,92
Production Technician - WC10,2010,91


### Where and Having in SQL

In [0]:
spark.sql("""select JobTitle, count(BusinessEntityID) as Total_Employees from emp
where HireDate > '2009-12-31'
group by JobTitle
having count(BusinessEntityID) > 5""").display()

JobTitle,Total_Employees
Buyer,6
Sales Representative,14
Production Technician - WC10,7


In [0]:
emp_count = df1.filter( col('HireDate') > '2009-12-31')\
    .groupBy("JobTitle")\
    .agg( count(col('BusinessEntityID') ).alias("TotalEmployees"))\
    .filter(col('TotalEmployees') > 5)
emp_count.display()

JobTitle,TotalEmployees
Buyer,6
Sales Representative,14
Production Technician - WC10,7


### SaveAsTable

In [0]:
df2 = df1.filter( col('HireDate') > '2009-12-31')
df2.count()

Out[122]: 61

In [0]:
display(df2.limit(5))

BusinessEntityID,NationalIDNumber,LoginID,OrganizationNode,OrganizationLevel,JobTitle,BirthDate,MaritalStatus,Gender,HireDate,SalariedFlag,VacationHours,SickLeaveHours,CurrentFlag,rowguid,ModifiedDate
11,974026903,adventure-works\ovidiu0,0x5AE3,3,Senior Tool Designer,1978-01-17,S,M,2010-12-05T00:00:00.000+0000,0,7,23,1,F68C7C19-FAC1-438C-9BB7-AC33FCC341C3,2014-06-30T00:00:00.000+0000
13,486228782,adventure-works\janice0,0x5AE368,4,Tool Designer,1989-05-28,M,F,2010-12-23T00:00:00.000+0000,0,8,24,1,954B91B6-5AA7-48C2-8685-6E11C6E5C49A,2014-06-30T00:00:00.000+0000
14,42487730,adventure-works\michael8,0x5AE5,3,Senior Design Engineer,1979-06-16,S,M,2010-12-30T00:00:00.000+0000,1,3,21,1,46286CA4-46DD-4DDB-9128-85B67E98D1A9,2014-06-30T00:00:00.000+0000
15,56920285,adventure-works\sharon0,0x5AE7,3,Design Engineer,1961-05-02,M,F,2011-01-18T00:00:00.000+0000,1,4,22,1,54F2FDC0-87C4-4065-A7A8-9AC8EA624235,2014-06-30T00:00:00.000+0000
18,222969461,adventure-works\john5,0x6B40,2,Marketing Specialist,1978-03-06,S,M,2011-02-07T00:00:00.000+0000,0,48,44,1,64730415-1F58-4E5B-8FA8-5E4DAEBA53B4,2014-06-30T00:00:00.000+0000


In [0]:
df2.write.option('header','true').option('inferSchema','true').mode('overwrite').saveAsTable('tbl_employee')

In [0]:
%sql
select count(*) as total_records from tbl_employee

total_records
61
