In [0]:
# `loan_data` already exists as a global table in the default database;
# read it into a DataFrame and preview it.
loan_data = spark.sql(
    'SELECT * FROM loan_data'
)
loan_data.display()

ID,Default,Loan_type,Gender,Age,Degree,Income,Credit_score,Loan_length,Signers,Citizenship
1,0,Car,Female,30,HS,114885,641,0,2,Citizen
2,0,Home,Female,43,HS,95770,534,7,2,Citizen
3,0,Home,Male,39,HS,94220,558,5,2,Citizen
4,0,Home,Male,39,College,58946,622,7,2,Citizen
5,0,Car,Female,42,HS,79754,702,2,1,Citizen
6,0,Car,Female,57,Graduate,119627,624,3,2,Non-citizen
7,0,Car,Male,41,HS,78765,594,3,2,Citizen
8,0,Car,Female,53,HS,123167,652,2,2,Non-citizen
9,0,Car,Female,44,HS,116175,694,3,2,Citizen
10,0,Car,Female,57,College,111085,586,3,2,Citizen


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# One window per Degree, rows ordered from the highest Income downwards.
window_spec = (
    Window
    .partitionBy(loan_data.Degree)
    .orderBy(loan_data.Income.desc())
)

# Like every window function, rank() is evaluated separately inside each
# partition, so the numbering restarts for every Degree.
income_rank = rank().over(window_spec)

In [0]:
# income_rank is a reusable column expression (it works basically like a
# function): nothing is computed here, it is applied to a DataFrame further on.
# Evaluating it on its own only shows the expression itself.
income_rank

In [0]:
# Keep only the columns of interest and attach the window rank to them.
# The window partitions by Degree, which gives 3 partitions here:
# College, Graduate and HS.
income_rank_df = (
    loan_data
    .select('ID', 'Degree', 'Income')
    .withColumn('Rank', income_rank)
)

# Each partition carries its own, separate ranking.
income_rank_df.display()

ID,Degree,Income,Rank
1752,College,151256,1
853,College,150094,2
1376,College,149116,3
1803,College,148406,4
808,College,147798,5
3375,College,147138,6
3601,College,144433,7
3548,College,142906,8
173,College,142083,9
1008,College,141178,10


In [0]:
# Keep the records whose rank within their Degree partition is 3 or better.
# rank() gives tied incomes the same rank, so a partition may contribute
# more than three rows.
(
    income_rank_df
    .filter(income_rank_df['Rank'] <= 3)
    .display()
)

ID,Degree,Income,Rank
1752,College,151256,1
853,College,150094,2
1376,College,149116,3
3311,Graduate,144703,1
1038,Graduate,142959,2
4241,Graduate,139008,3
4267,HS,155953,1
3278,HS,153485,2
2793,HS,144859,3


In [0]:
# Import Spark's aggregate under an alias so that Python's builtin max() is
# not shadowed for the rest of the notebook.
from pyspark.sql.functions import max as spark_max

# Per Gender, oldest first. The ROWS frame spans the previous row and the
# current row (-1 .. currentRow). Rows with an equal Age have no guaranteed
# relative order, so which of them directly follows an older row may vary.
window_spec = (
    Window.partitionBy(loan_data['Gender'])
          .orderBy(loan_data['Age'].desc())
          .rowsBetween(-1, Window.currentRow)
)

# Larger Age of the current row and the row just before it. With the
# descending order this is normally the previous row's Age (the row's own Age
# for the first row of a partition).
compare_age = spark_max(loan_data['Age']).over(window_spec)

In [0]:
# Show every record's Age side by side with the windowed compare_age value.
(
    loan_data
    .select('ID', 'Gender', 'Age')
    .withColumn('compare_age', compare_age)
    .display()
)

ID,Gender,Age,compare_age
106,Female,62,62
3227,Female,62,62
2550,Female,61,62
1757,Female,60,61
2105,Female,60,60
1900,Female,59,60
3933,Female,59,59
978,Female,58,59
1302,Female,58,58
1471,Female,58,58


In [0]:
# Running average of Age per Degree, youngest first.
# The RANGE frame (unboundedPreceding .. currentRow) is defined on the Age
# value, not on the row position: it covers every row of the partition whose
# Age is <= the current row's Age. Rows sharing the same Age are therefore all
# included and receive the same average.
import sys  # not used by this cell; kept in case other cells rely on it
from pyspark.sql.functions import avg

# Window.unboundedPreceding / Window.currentRow are the documented way to
# express the frame boundaries, instead of raw integer sentinels.
window_spec = (
    Window.partitionBy(loan_data['Degree'])
          .orderBy(loan_data['Age'].asc())
          .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)

avg_age_so_far = avg(loan_data['Age']).over(window_spec)

In [0]:
# Show every record's Age next to the cumulative average for its Degree.
(
    loan_data
    .select('ID', 'Degree', 'Age')
    .withColumn('avg_age_so_far', avg_age_so_far)
    .display()
)

ID,Degree,Age,avg_age_so_far
3889,College,23,23.0
2178,College,25,24.0
113,College,26,24.666666666666668
411,College,29,27.375
903,College,29,27.375
2766,College,29,27.375
3105,College,29,27.375
3258,College,29,27.375
1943,College,30,28.5
2023,College,30,28.5
