In [0]:
# Load every row of the global table "loan_data", addressed through the
# `default` database (within the Databricks Hive metastore).
loan_df = spark.sql("select * from default.loan_data")

In [0]:
# Render the loaded DataFrame as a table.
display(loan_df)

ID,Default,Loan_type,Gender,Age,Degree,Income,Credit_score,Loan_length,Signers,Citizenship
1,0,Car,Female,30,HS,114885,641,0,2,Citizen
2,0,Home,Female,43,HS,95770,534,7,2,Citizen
3,0,Home,Male,39,HS,94220,558,5,2,Citizen
4,0,Home,Male,39,College,58946,622,7,2,Citizen
5,0,Car,Female,42,HS,79754,702,2,1,Citizen
6,0,Car,Female,57,Graduate,119627,624,3,2,Non-citizen
7,0,Car,Male,41,HS,78765,594,3,2,Citizen
8,0,Car,Female,53,HS,123167,652,2,2,Non-citizen
9,0,Car,Female,44,HS,116175,694,3,2,Citizen
10,0,Car,Female,57,College,111085,586,3,2,Citizen


There are two ways to run a SQL query:
    - spark.sql()      # through the SparkSession object `spark`
    - sqlContext.sql() # through the SQLContext object `sqlContext`

In [0]:
# A table in the default database can be referenced without the
# database-name prefix.
display(sqlContext.sql('SELECT * FROM loan_data'))

ID,Default,Loan_type,Gender,Age,Degree,Income,Credit_score,Loan_length,Signers,Citizenship
1,0,Car,Female,30,HS,114885,641,0,2,Citizen
2,0,Home,Female,43,HS,95770,534,7,2,Citizen
3,0,Home,Male,39,HS,94220,558,5,2,Citizen
4,0,Home,Male,39,College,58946,622,7,2,Citizen
5,0,Car,Female,42,HS,79754,702,2,1,Citizen
6,0,Car,Female,57,Graduate,119627,624,3,2,Non-citizen
7,0,Car,Male,41,HS,78765,594,3,2,Citizen
8,0,Car,Female,53,HS,123167,652,2,2,Non-citizen
9,0,Car,Female,44,HS,116175,694,3,2,Citizen
10,0,Car,Female,57,College,111085,586,3,2,Citizen


In [0]:
# Continue with `spark.sql` for the rest of the notebook.
# Select a subset of columns. The whole statement must be a single string
# literal, and the column list must not end with a comma before FROM.
spark.sql('select Loan_type, Gender, Income, Credit_score from loan_data').display()

In [0]:
# A triple-quoted string lets one query span several lines.
# .display() renders the rows, as in the other query cells; without it
# no rows are shown for this cell.
spark.sql("""select Loan_type, Gender, Income, Credit_score
             from loan_data 
             limit 3""").display()

In [0]:
# Perform filtering in SQL: the WHERE clause keeps only rows whose Income
# exceeds 100000, and LIMIT caps the result at 10 rows.
display(spark.sql("""select Loan_type, Gender, Income, Credit_score
             from loan_data 
             where Income > 100000 
             limit 10"""))

Loan_type,Gender,Income,Credit_score
Car,Female,114885,641
Car,Female,119627,624
Car,Female,123167,652
Car,Female,116175,694
Car,Female,111085,586
Car,Male,118613,622
Car,Female,113281,577
Car,Male,126260,638
Car,Male,118155,730
Car,Female,132630,625


In [0]:
# Combine two conditions with AND: high income and loan type "Home".
display(spark.sql("""select Loan_type, Gender, Income, Credit_score
             from loan_data 
             where Income > 100000 and Loan_type = "Home"
             """))

Loan_type,Gender,Income,Credit_score
Home,Female,118848,701
Home,Female,109327,682
Home,Male,104234,586
Home,Male,109333,586
Home,Male,113200,573
Home,Female,127891,596
Home,Male,120250,590
Home,Male,116594,650
Home,Male,104788,535
Home,Male,113556,613


In [0]:
# Column alias: Credit_score is returned under the name Score.
display(spark.sql("""select Loan_type, Gender, Income, Credit_score as Score
             from loan_data 
             where Income < 100000 and Loan_type ="Home" 
             """))

Loan_type,Gender,Income,Score
Home,Female,95770,534
Home,Male,94220,558
Home,Male,58946,622
Home,Female,93253,580
Home,Male,99880,551
Home,Male,90710,667
Home,Female,75956,592
Home,Male,80480,689
Home,Female,86997,571
Home,Male,98343,686


In [0]:
# Aggregate functions: count the rows whose Credit_score is above 600.
display(spark.sql("""select count(*) as count 
            from loan_data 
            where Credit_score > 600
            """))

count
2429


In [0]:
# Average income per degree. Every non-aggregated column in the SELECT list
# (here Degree) must also appear in GROUP BY.
display(spark.sql("""select Degree, avg(Income) 
             from loan_data 
             group by Degree
             """))

Degree,avg(Income)
HS,94267.68294292311
Graduate,103051.82710280374
College,100442.07071622848


In [0]:
# Several aggregates at once: row count and average income per Default value.
display(spark.sql("""select Default, count(*), avg(Income) 
            from loan_data 
            group by Default
            """))

Default,count(1),avg(Income)
0,4054,95354.14183522448
1,294,108844.731292517


In [0]:
# Sort the grouped result by the aliased aggregate (ascending by default).
display(spark.sql("""select Degree, avg(Income) as Avg_income 
            from loan_data 
            group by Degree 
            order by Avg_income 
            """))

Degree,Avg_income
HS,94267.68294292311
College,100442.07071622848
Graduate,103051.82710280374


In [0]:
# Same aggregation, sorted from the highest average income to the lowest.
display(spark.sql("""select Degree, avg(Income) as Avg_income 
            from loan_data 
            group by Degree 
            order by Avg_income desc
            """))

Degree,Avg_income
Graduate,103051.82710280374
College,100442.07071622848
HS,94267.68294292311


In [0]:
# GROUP BY ... HAVING: HAVING filters the groups after aggregation, here
# keeping only degrees whose average income is above 100000.
display(spark.sql("""select Degree, avg(Income) as Avg_income 
            from loan_data 
            group by Degree 
            having Avg_income > 100000 
            order by Avg_income desc 
            """))

Degree,Avg_income
Graduate,103051.82710280374
College,100442.07071622848


In [0]:
# The DataFrame returned by spark.sql() accepts ordinary DataFrame
# operations, such as filter(), on top of the SQL result.
loan_data_subset_df = spark.sql(
    'select trim(Loan_type) as Type, Gender, Age, Income from loan_data'
)
loan_data_subset_df.filter('Type = "Car"').display()

Type,Gender,Age,Income
Car,Female,30,114885
Car,Female,42,79754
Car,Female,57,119627
Car,Male,41,78765
Car,Female,53,123167
Car,Female,44,116175
Car,Female,57,111085
Car,Male,50,54387
Car,Male,39,89468
Car,Male,52,118613


In [0]:
# The steps above can be chained: a SQL query followed directly by
# DataFrame grouping and aggregation (average Age per loan Type).
(
    spark.sql('select trim(Loan_type) as Type, Gender, Age, Income from loan_data')
    .groupby('Type')
    .agg({'Age':'avg'})
    .display()
)

Type,avg(Age)
Home,40.46546762589928
Car,43.2866801893171


In [0]:
# A local table (temporary view) is accessible only from the current
# notebook in the current session; it is not stored in the Hive database.
loan_data_subset = spark.sql(
    "select trim(Loan_type) as Type, trim(Gender) as Gender, "
    "Age, Income, Credit_score from loan_data"
)
loan_data_subset.createOrReplaceTempView('loan_data_subset')

In [0]:
# Once the view exists it can be queried by name like any other table.
display(spark.sql('select * from loan_data_subset'))

Type,Gender,Age,Income,Credit_score
Car,Female,30,114885,641
Home,Female,43,95770,534
Home,Male,39,94220,558
Home,Male,39,58946,622
Car,Female,42,79754,702
Car,Female,57,119627,624
Car,Male,41,78765,594
Car,Female,53,123167,652
Car,Female,44,116175,694
Car,Female,57,111085,586


In [0]:
# Filter the temp view: female applicants younger than 30.
display(spark.sql('select * from loan_data_subset where Gender = "Female" and Age <30'))

Type,Gender,Age,Income,Credit_score
Home,Female,26,95496,649
Car,Female,29,45103,585
Car,Female,29,84191,588
Car,Female,28,95128,586
Home,Female,27,108001,626
Car,Female,29,114142,554
Home,Female,28,81821,671
Home,Female,29,95390,563
Home,Female,29,96724,631
Home,Female,23,89663,562


In [0]:
# Aggregate over the temp view: average income per gender.
display(spark.sql('select Gender, avg(Income) from loan_data_subset group by Gender'))

Gender,avg(Income)
Female,100946.5860306644
Male,93080.44221105528


In [0]:
# Write rows selected from the temp view out to a global table.
# mode("overwrite") keeps the cell re-runnable: with the writer's default
# save mode, saveAsTable raises an error when the table already exists.
spark.sql('select * from loan_data_subset where Gender = "Female" and Age <30')\
     .write.mode("overwrite")\
     .saveAsTable("loan_data_females_under_30")

In [0]:
# Query the newly created global table.
display(spark.sql('select * from loan_data_females_under_30'))

Type,Gender,Age,Income,Credit_score
Home,Female,26,95496,649
Home,Female,27,108001,626
Home,Female,28,81821,671
Home,Female,29,95390,563
Home,Female,29,96724,631
Home,Female,23,89663,562
Home,Female,28,73110,542
Home,Female,29,91882,514
Home,Female,29,65475,608
Home,Female,29,100888,583


In [0]:
# LIKE matches a value against a SQL wildcard pattern (% stands for any run
# of characters); it is simpler than a regular expression.
# The selected rows overwrite the existing global table.
(
    spark.sql('select * from loan_data_subset where Gender = "Female" and Age < 30 and Type like "%om%"')
    .write
    .mode("overwrite")
    .saveAsTable("loan_data_females_under_30")
)

In [0]:
# Check the table contents after the overwrite above.
display(spark.sql('select * from loan_data_females_under_30'))

Type,Gender,Age,Income,Credit_score
Home,Female,26,95496,649
Home,Female,27,108001,626
Home,Female,28,81821,671
Home,Female,29,95390,563
Home,Female,29,96724,631
Home,Female,23,89663,562
Home,Female,28,73110,542
Home,Female,29,91882,514
Home,Female,29,65475,608
Home,Female,29,100888,583


In [0]:
# Perform partitioning (the last query was performed as 1 job in a single
# partition): the table is written with one partition per Type value.
# mode("overwrite") keeps the cell re-runnable: with the writer's default
# save mode, saveAsTable raises an error when the table already exists.
spark.sql('select * from loan_data_subset where Gender = "Female" and Age <30')\
     .write\
     .mode("overwrite")\
     .partitionBy("Type")\
     .saveAsTable("partitioned_loan_data_females_under_30")

In [0]:
# Take a closer look at how partitioning works. The table is partitioned by
# Type, so there are 2 partitions (Home and Car loans); when this notebook
# was run, Spark processed each partition in a separate parallel job and
# 2 jobs were created.
display(spark.sql('select * from partitioned_loan_data_females_under_30'))

Type,Gender,Age,Income,Credit_score
Home,Female,26,95496,649
Home,Female,27,108001,626
Home,Female,28,81821,671
Home,Female,29,95390,563
Home,Female,29,96724,631
Home,Female,23,89663,562
Home,Female,28,73110,542
Home,Female,29,91882,514
Home,Female,29,65475,608
Home,Female,29,100888,583
