In [0]:
# Create the DBFS directory that will act as the source for the streaming data.
# Data files are then uploaded into it through the UI.
dbutils.fs.mkdirs(
    "dbfs:/FileStore/datasets/attr_source_stream"
)

In [0]:
# Read the source directory as a stream into a Spark DataFrame, using the
# "cloudFiles" format with CSV input files.
# NOTE(review): the schema location below is the same directory as the load
# path -- confirm that sharing one directory for both is intended.
attr_stream_data = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option(
        "cloudFiles.schemaLocation",
        "dbfs:/FileStore/datasets/attr_source_stream",
    )
    # Type hints: the listed columns are declared as int.
    .option(
        "cloudFiles.schemaHints",
        """Age int, DailyRate int, DistanceFromHome int, HourlyRate int, JobLevel int, JobSatisfaction int, 
                               MonthlyIncome int, PercentSalaryHike int, PerformanceRating int, YearsSinceLastPromotion int, 
                               YearsWithCurrManager int""",
    )
    .load("dbfs:/FileStore/datasets/attr_source_stream")
)

# Show the contents of the streaming DataFrame in the notebook.
attr_stream_data.display()

Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,EducationField,Gender,HourlyRate,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,OverTime,PercentSalaryHike,PerformanceRating,YearsSinceLastPromotion,YearsWithCurrManager,_rescued_data
41,Yes,Travel_Rarely,1102,Sales,1,Life Sciences,Female,94,2,Sales Executive,4,Single,5993,Yes,11,3,0,5,
49,No,Travel_Frequently,279,Research & Development,8,Life Sciences,Male,61,2,Research Scientist,2,Married,5130,No,23,4,1,7,
37,Yes,Travel_Rarely,1373,Research & Development,2,Other,Male,92,1,Laboratory Technician,3,Single,2090,Yes,15,3,0,0,
33,No,Travel_Frequently,1392,Research & Development,3,Life Sciences,Female,56,1,Research Scientist,3,Married,2909,Yes,11,3,3,0,
27,No,Travel_Rarely,591,Research & Development,2,Medical,Male,40,1,Laboratory Technician,2,Married,3468,No,12,3,2,2,
32,No,Travel_Frequently,1005,Research & Development,2,Life Sciences,Male,79,1,Laboratory Technician,4,Single,3068,No,13,3,3,6,
59,No,Travel_Rarely,1324,Research & Development,3,Medical,Female,81,1,Laboratory Technician,1,Married,2670,Yes,20,4,0,0,
30,No,Travel_Rarely,1358,Research & Development,24,Life Sciences,Male,67,1,Laboratory Technician,3,Divorced,2693,No,22,4,0,0,
38,No,Travel_Frequently,216,Research & Development,23,Life Sciences,Male,44,3,Manufacturing Director,3,Single,9526,No,21,4,1,8,
36,No,Travel_Rarely,1299,Research & Development,27,Medical,Male,94,2,Healthcare Representative,3,Married,5237,No,13,3,7,7,


Before we can run SQL queries on our streaming DataFrame, we need to create a table from the data: either a global one (through the UI, by default) or a local one (a TempView), which is available only within the current session and only for the current notebook.

In [0]:
# Register the streaming DataFrame as a temporary view named "attrition_table"
# so that it can be referenced from SQL queries.
attr_stream_data.createOrReplaceTempView("attrition_table")

In [0]:
# The view can now be queried with SQL just like a regular table:
# selected columns for rows whose MonthlyIncome is above 3000.
(
    spark
    .sql("""SELECT BusinessTravel, Department, Gender, DistanceFromHome, MonthlyIncome 
          FROM attrition_table WHERE MonthlyIncome > 3000  """)
    .display()
)

BusinessTravel,Department,Gender,DistanceFromHome,MonthlyIncome
Travel_Rarely,Sales,Female,1,5993
Travel_Frequently,Research & Development,Male,8,5130
Travel_Rarely,Research & Development,Male,2,3468
Travel_Frequently,Research & Development,Male,2,3068
Travel_Frequently,Research & Development,Male,23,9526
Travel_Rarely,Research & Development,Male,27,5237
Travel_Rarely,Research & Development,Female,15,4193
Travel_Rarely,Research & Development,Female,21,9980
Travel_Rarely,Research & Development,Male,5,3298
Travel_Rarely,Sales,Female,2,15427


In [0]:
# Rows with DistanceFromHome above 10 and JobSatisfaction below 3.
(
    spark
    .sql("SELECT Gender, DistanceFromHome, JobSatisfaction FROM attrition_table WHERE DistanceFromHome >10 and JobSatisfaction < 3")
    .display()
)

Gender,DistanceFromHome,JobSatisfaction
Male,16,2
Female,21,1
Female,16,1
Male,19,2
Male,11,1
Male,18,1
Female,23,1
Female,25,1
Male,11,2
Male,23,2


In [0]:
# Average YearsSinceLastPromotion and JobSatisfaction, grouped by Gender.
(
    spark
    .sql("SELECT Gender, avg(YearsSinceLastPromotion), avg(JobSatisfaction) FROM attrition_table GROUP BY Gender")
    .display()
)

Gender,avg(YearsSinceLastPromotion),avg(JobSatisfaction)
Female,2.739130434782609,2.608695652173913
Male,2.0486486486486486,2.913513513513513


Use the `%sql` magic command to write SQL queries directly in a cell (available in the Databricks environment; not sure whether it works locally).

In [0]:
%sql
-- Travel category, salary hike and rating for rows with PerformanceRating above 3
SELECT
  BusinessTravel,
  PercentSalaryHike,
  PerformanceRating
FROM attrition_table
WHERE PerformanceRating > 3

BusinessTravel,PercentSalaryHike,PerformanceRating
Travel_Frequently,23,4
Travel_Rarely,20,4
Travel_Rarely,22,4
Travel_Frequently,21,4
Travel_Rarely,23,4
Travel_Frequently,22,4
Travel_Frequently,23,4
Non-Travel,20,4
Travel_Rarely,21,4
Travel_Frequently,21,4


We have just uploaded new data into the source directory.

In [0]:
%sql
-- Average PercentSalaryHike and PerformanceRating, grouped by JobRole
SELECT
  JobRole,
  avg(PercentSalaryHike),
  avg(PerformanceRating)
FROM attrition_table
GROUP BY JobRole

JobRole,avg(PercentSalaryHike),avg(PerformanceRating)
Sales Executive,15.56896551724138,3.189655172413793
Manufacturing Director,15.966666666666669,3.2
Laboratory Technician,15.14516129032258,3.2096774193548385
Sales Representative,16.615384615384617,3.3076923076923075
Healthcare Representative,15.818181818181818,3.1818181818181817
Research Scientist,15.753623188405795,3.188405797101449
Manager,14.608695652173912,3.130434782608696
Research Director,13.833333333333334,3.0
Human Resources,20.6,3.6
