In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 37 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 49.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=912b0c9e17cb07762303e2d523936a6944d6dca8215101538abcba78e6b4dc06
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [2]:
!curl https://raw.githubusercontent.com/markumreed/colab_pyspark/main/sales_data.csv >> sales_data.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   202  100   202    0     0    975      0 --:--:-- --:--:-- --:--:--   975


In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain a Spark session named 'agg_groupby' (getOrCreate returns the
# existing session if one is already active).
spark = (
    SparkSession.builder
    .appName('agg_groupby')
    .getOrCreate()
)

In [5]:
# Load the sales CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('sales_data.csv')
)

In [6]:
# Print the column names and inferred types to check what inferSchema produced.
df.printSchema()

root
 |-- company: string (nullable = true)
 |-- representative: string (nullable = true)
 |-- num_sales: double (nullable = true)



In [7]:
# Preview the first five rows of the sales data.
df.show(n=5)

+-------+--------------+---------+
|company|representative|num_sales|
+-------+--------------+---------+
|    XYZ|           Bob|    200.0|
|    XYZ|           Tom|    120.0|
|    XYZ|         Frank|    340.0|
|   ABCD|         Jerry|    600.0|
|   ABCD|           Amy|    124.0|
+-------+--------------+---------+
only showing top 5 rows



In [8]:
# Group the rows by company and average every numeric column
# (num_sales is the only numeric column in this frame).
(
    df.groupBy('company')
    .mean()
    .show()
)

+-------+-----------------+
|company|   avg(num_sales)|
+-------+-----------------+
|   BLAH|            370.0|
|    XYZ|            220.0|
|     OK|            610.0|
|   ABCD|322.3333333333333|
+-------+-----------------+



In [9]:
# Number of rows recorded for each company.
(
    df.groupBy('company')
    .count()
    .show()
)

+-------+-----+
|company|count|
+-------+-----+
|   BLAH|    4|
|    XYZ|    3|
|     OK|    2|
|   ABCD|    3|
+-------+-----+



In [10]:
# Smallest num_sales value within each company.
(
    df.groupBy('company')
    .min()
    .show()
)

+-------+--------------+
|company|min(num_sales)|
+-------+--------------+
|   BLAH|         130.0|
|    XYZ|         120.0|
|     OK|         350.0|
|   ABCD|         124.0|
+-------+--------------+



In [11]:
# Largest num_sales value within each company.
(
    df.groupBy('company')
    .max()
    .show()
)

+-------+--------------+
|company|max(num_sales)|
+-------+--------------+
|   BLAH|         750.0|
|    XYZ|         340.0|
|     OK|         870.0|
|   ABCD|         600.0|
+-------+--------------+



In [12]:
# Total num_sales for each company.
(
    df.groupBy('company')
    .sum()
    .show()
)

+-------+--------------+
|company|sum(num_sales)|
+-------+--------------+
|   BLAH|        1480.0|
|    XYZ|         660.0|
|     OK|        1220.0|
|   ABCD|         967.0|
+-------+--------------+



In [14]:
# Aggregation over the whole frame (no grouping key): the dict maps a column
# name to the aggregate to apply, here the maximum num_sales value.
# df.agg(...) is shorthand for this ungrouped groupBy().agg(...) form.
df.groupBy().agg({'num_sales': 'max'}).show()

+--------------+
|max(num_sales)|
+--------------+
|         870.0|
+--------------+



In [15]:
# Same dict-style aggregation, applied per company: mean of num_sales.
(
    df.groupBy('company')
    .agg({'num_sales': 'mean'})
    .show()
)

+-------+-----------------+
|company|   avg(num_sales)|
+-------+-----------------+
|   BLAH|            370.0|
|    XYZ|            220.0|
|     OK|            610.0|
|   ABCD|322.3333333333333|
+-------+-----------------+



In [16]:
# Keep the grouped-by-company object in a variable so later cells can
# call aggregations on it directly.
company_groups = df.groupBy('company')

In [18]:
# Aggregate the stored grouping: minimum num_sales per company.
company_groups.min().show()

+-------+--------------+
|company|min(num_sales)|
+-------+--------------+
|   BLAH|         130.0|
|    XYZ|         120.0|
|     OK|         350.0|
|   ABCD|         124.0|
+-------+--------------+



In [24]:
# Aggregate functions imported from pyspark.sql.functions

from pyspark.sql.functions import countDistinct, avg, stddev

In [27]:
# Average num_sales across all rows, using the avg() column function.
df.select(
    avg('num_sales'),
).show()

+-----------------+
|   avg(num_sales)|
+-----------------+
|360.5833333333333|
+-----------------+



In [29]:
# Standard deviation of num_sales across all rows; the result column is
# labelled stddev_samp(num_sales) in the output.
df.select(
    stddev('num_sales'),
).show()

+----------------------+
|stddev_samp(num_sales)|
+----------------------+
|    250.08742410799007|
+----------------------+



In [26]:
# alias(): give the aggregate column a readable name instead of the
# generated one. Here: the number of distinct num_sales values.
distinct_sales = countDistinct('num_sales').alias('Distinct Sales')
df.select(distinct_sales).show()

+--------------+
|Distinct Sales|
+--------------+
|            11|
+--------------+



In [31]:
from pyspark.sql.functions import format_number

In [32]:
# One-row frame holding the standard deviation of num_sales; the column is
# aliased to 'stddev' so it can be referenced by that name afterwards.
stddev_col = stddev('num_sales').alias('stddev')
sales_std = df.select(stddev_col)

In [33]:
# Display the aliased standard-deviation frame at full precision.
sales_std.show()

+------------------+
|            stddev|
+------------------+
|250.08742410799007|
+------------------+



In [35]:
# Format the 'stddev' column to 2 decimal places for display.
sales_std.select(
    format_number('stddev', 2),
).show()

+------------------------+
|format_number(stddev, 2)|
+------------------------+
|                  250.09|
+------------------------+



In [37]:
# orderBy: sort the rows by num_sales. Ascending order is the default;
# it is spelled out here for clarity.
df.orderBy('num_sales', ascending=True).show()

+-------+--------------+---------+
|company|representative|num_sales|
+-------+--------------+---------+
|    XYZ|           Tom|    120.0|
|   ABCD|           Amy|    124.0|
|   BLAH|         Linda|    130.0|
|    XYZ|           Bob|    200.0|
|   ABCD|       Vanessa|    243.0|
|   BLAH|          John|    250.0|
|    XYZ|         Frank|    340.0|
|     OK|         Sarah|    350.0|
|   BLAH|         Chris|    350.0|
|   ABCD|         Jerry|    600.0|
|   BLAH|          Mike|    750.0|
|     OK|          Carl|    870.0|
+-------+--------------+---------+

