# DataFrame OrderBy

In [2]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Sample employee records; every tuple follows the order of `columns` below.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

# Column names passed to createDataFrame as the schema.
columns = [
    "employee_name",
    "department",
    "state",
    "salary",
    "age",
    "bonus",
]

# `spark` is not defined in the visible cells -- presumably the kernel
# provides the SparkSession; confirm before running elsewhere.
df = spark.createDataFrame(simpleData, columns)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Print the schema of the DataFrame built from the sample records.
df.printSchema()

In [None]:
# Display the rows with truncation of cell values turned off.
df.show(truncate=False)

`orderBy` is an alias of `sort`, so the two can be used interchangeably. Each example below is shown with both spellings.

In [None]:
# Sort keys given as column names; ascending order is the default.
(df
   .sort("department", "state")
   .show(truncate=False)
)

In [None]:
# Same query spelled with orderBy instead of sort.
(df
   .orderBy("department", "state")
   .show(truncate=False)
)

In [None]:
# Reference the sort keys as Column objects through F.col().
(df
   .sort(F.col("department"), F.col("state"))
   .show(truncate=False)
)

In [None]:
# Column-object sort keys, spelled with orderBy instead of sort.
(df
   .orderBy(F.col("department"), F.col("state"))
   .show(truncate=False)
)

## Order by ascending

In [None]:
# Explicit ascending order, using attribute-style column access on df.
(df
   .sort(df.department.asc(), df.state.asc())
   .show(truncate=False)
)

In [None]:
# Explicit ascending order on Column objects built with F.col().
(df
   .sort(F.col("department").asc(), F.col("state").asc())
   .show(truncate=False)
)

In [None]:
# Explicit ascending order, spelled with orderBy instead of sort.
(df
   .orderBy(F.col("department").asc(), F.col("state").asc())
   .show(truncate=False)
)

## Order by mixing ascending and descending

In [None]:
# department ascending, then state descending within each department.
(df
   .sort(df.department.asc(), df.state.desc())
   .show(truncate=False)
)

In [None]:
# Mixed directions on Column objects: department ascending, state descending.
(df
   .sort(F.col("department").asc(), F.col("state").desc())
   .show(truncate=False)
)

In [None]:
# Mixed directions, spelled with orderBy instead of sort.
(df
   .orderBy(F.col("department").asc(), F.col("state").desc())
   .show(truncate=False)
)

# DataFrame GroupBy

`groupBy` has an alias, `groupby`. Use whichever you prefer.

In [None]:
# Show the input rows again before grouping.
# Note: this cell passes truncate=True, unlike the other cells in the notebook.
df.show(truncate=True)

Notice that the return type of `groupBy` is `GroupedData`, not `DataFrame`.

In [None]:
# Inspect the class of the object that groupBy returns.
type(df.groupBy("department"))

We can call the aggregate methods of a `GroupedData` instance directly.

In [None]:
# Total salary per department, using the sum() shortcut on the grouped data.
(df
   .groupBy("department")
   .sum("salary")
   .show(truncate=False)
)

In [None]:
# Number of rows in each department.
(df
   .groupBy("department")
   .count()
   .show(truncate=False)
)

The aggregate methods of `GroupedData` accept multiple column names, so several columns can be aggregated in one call.

In [None]:
# Sum salary and bonus for every (department, state) pair.
(df.groupBy("department", "state")
   .sum("salary", "bonus")
   .show(truncate=False)
)

To alias an aggregate column, use the aggregate functions in `pyspark.sql.functions` and chain them with `alias()`.

In [None]:
# Per-department totals and averages, each given an explicit output column name.
(df
   .groupBy("department")
   .agg(
       F.sum("salary").alias("sum_salary"),
       F.avg("salary").alias("avg_salary"),
       F.sum("bonus").alias("sum_bonus"),
       F.avg("bonus").alias("avg_bonus"),
   )
   .show(truncate=False)
)

## GroupBy and Where

Calling `where` after `groupBy(...).agg(...)` filters the aggregated rows, i.e. the filter is applied after aggregation (like `HAVING` in SQL).

In [None]:
# Aggregate per department, then keep only the departments whose total bonus
# is at least 50,000. The filter runs on the aggregated column `sum_bonus`.
(df
   .groupBy("department")
   .agg(
       F.sum("salary").alias("sum_salary"),
       F.avg("salary").alias("avg_salary"),
       F.sum("bonus").alias("sum_bonus"),
       F.max("bonus").alias("max_bonus"),
   )
   .where(F.col("sum_bonus") >= 50_000)
   .show(truncate=False)
)

## GroupBy and Sort

The result of `agg()` is an ordinary DataFrame, so we can call `sort()` on it to order the rows by an aggregated column.

In [None]:
# Total salary per state, kept in a variable so the following cells can reuse it.
dfGroup = df.groupBy("state").agg(F.sum("salary").alias("sum_salary"))

In [None]:
# Display the per-state salary totals.
dfGroup.show(truncate=False)

In [None]:
# Keep the states whose total salary exceeds 100,000.
# With the sample data both NY (429,000) and CA (350,000) pass, so no row is dropped.
dfFilter = dfGroup.filter(dfGroup["sum_salary"] > 100_000)
dfFilter.show(truncate=False)

In [None]:
# Order the filtered totals by sum_salary (ascending is the default).
(dfFilter
   .sort("sum_salary")
   .show(truncate=False)
)

In [None]:
# Order the filtered totals by sum_salary, largest first.
(dfFilter
   .sort(F.desc("sum_salary"))
   .show(truncate=False)
)

In summary, grouping, aggregating, filtering and sorting can all be chained in a single statement, as below.

In [None]:
# Group, aggregate, filter and sort in one chained expression.
(df
   .groupBy("state")
   .agg(F.sum("salary").alias("sum_salary"))
   .filter(F.col("sum_salary") > 100_000)
   .sort(F.desc("sum_salary"))
   .show(truncate=False)
)

## More about aggregate functions

### approx_count_distinct

In [4]:
# Approximate count of distinct salary values over the whole DataFrame.
(df
   .select(F.approx_count_distinct("salary"))
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------------+
|approx_count_distinct(salary)|
+-----------------------------+
|8                            |
+-----------------------------+

### collect_list

In [5]:
# Gather every salary value into a single array; duplicates are kept.
# The recorded output lists the values in a different order than the input rows.
(df
   .select(F.collect_list("salary"))
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------------------------------------------------------+
|collect_list(salary)                                           |
+---------------------------------------------------------------+
|[99000, 83000, 79000, 80000, 91000, 90000, 86000, 81000, 90000]|
+---------------------------------------------------------------+

### collect_set

In [6]:
# Gather the salary values into a single array with duplicates removed.
(df
   .select(F.collect_set("salary"))
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------------------------------------------+
|collect_set(salary)                                     |
+--------------------------------------------------------+
|[79000, 83000, 91000, 99000, 90000, 80000, 86000, 81000]|
+--------------------------------------------------------+

### distinct

In [13]:
# Unique (department, salary) combinations.
(df
   .select("department", "salary")
   .distinct()
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------+
|department|salary|
+----------+------+
|Finance   |83000 |
|Marketing |80000 |
|Finance   |79000 |
|Marketing |91000 |
|Finance   |99000 |
|Sales     |90000 |
|Sales     |81000 |
|Sales     |86000 |
|Finance   |90000 |
+----------+------+

### countDistinct

In [17]:
# Number of distinct (department, salary) combinations.
(df
   .select(F.countDistinct("department", "salary"))
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|9                                 |
+----------------------------------+

### count

In [19]:
# Count of salary values over the whole DataFrame.
(df
   .select(F.count("salary"))
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+
|count(salary)|
+-------------+
|9            |
+-------------+

### first

In [23]:
# Show the rows, then pick the first salary value.
# NOTE(review): per the PySpark docs first() depends on row order, which is not
# guaranteed without an explicit sort -- the result may vary between runs.
df.show(truncate=False)
(df
   .select(F.first("salary"))
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+

+-------------+
|first(salary)|
+-------------+
|90000        |
+-------------+

### last

In [22]:
# Show the rows, then pick the last salary value.
# NOTE(review): per the PySpark docs last() depends on row order, which is not
# guaranteed without an explicit sort -- the result may vary between runs.
df.show(truncate=False)
(df
   .select(F.last("salary"))
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+

+------------+
|last(salary)|
+------------+
|91000       |
+------------+

### kurtosis, skewness, stddev, stddev_pop, variance

In [33]:
# Shape and spread statistics of salary, one aliased column per statistic.
(df
   .select(
       F.kurtosis("salary").alias("kurtosis_salary"),
       F.skewness("salary").alias("skewness_salary"),
       F.stddev("salary").alias("stddev_salary"),
       F.stddev_pop("salary").alias("stddev_pop_salary"),
       F.variance("salary").alias("variance_salary"),
   )
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+------------------+-----------------+-----------------+--------------------+
|kurtosis_salary    |skewness_salary   |stddev_salary    |stddev_pop_salary|variance_salary     |
+-------------------+------------------+-----------------+-----------------+--------------------+
|-0.6275168662506321|0.5530468967432596|6540.472290116194|6166.416411338492|4.2777777777777776E7|
+-------------------+------------------+-----------------+-----------------+--------------------+

### max, min, avg

In [36]:
# Largest, average and smallest salary over the whole DataFrame.
(df
   .select(
       F.max("salary").alias("max_salary"),
       F.avg("salary").alias("avg_salary"),
       F.min("salary").alias("min_salary"),
   )
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-----------------+----------+
|max_salary|avg_salary       |min_salary|
+----------+-----------------+----------+
|99000     |86555.55555555556|79000     |
+----------+-----------------+----------+

### sum, sumDistinct

In [37]:
# Sum of all salaries next to the sum over distinct salary values only.
# NOTE(review): sumDistinct is deprecated since Spark 3.2 in favour of
# F.sum_distinct -- switch once the cluster version is confirmed to be 3.2+.
(df
   .select(
       F.sum("salary"),
       F.sumDistinct("salary"),
   )
   .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+--------------------+
|sum(salary)|sum(DISTINCT salary)|
+-----------+--------------------+
|779000     |689000              |
+-----------+--------------------+