In [26]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, DoubleType, StructType
from pyspark.sql.functions import countDistinct, avg, stddev, format_number

In [8]:
## Create Spark Session
## getOrCreate() returns the existing session when one is already running,
## otherwise it starts a new one under the given application name
spark = (
    SparkSession.builder
    .appName('BasicGroupAgg')
    .getOrCreate()
)

In [9]:
## Setup Schema
## Explicit column definitions: two string columns and one double column, all nullable
sales_fields = [
    StructField('company', StringType(), nullable=True),
    StructField('person', StringType(), nullable=True),
    StructField('sales', DoubleType(), nullable=True),
]
schema = StructType(sales_fields)

In [10]:
## Read in Data
## The schema is passed explicitly, so type inference stays off;
## header=True treats the first CSV row as column names rather than data
sales_csv_path = 'gs://spark-training-data/datasets/sales_info.csv'
df = spark.read.csv(
    sales_csv_path,
    schema=schema,
    header=True,
    inferSchema=False,
)

In [11]:
## Show Data
## show() prints the rows as a text table;
## printSchema() prints each column's name, type and nullability as a tree
df.show()
df.printSchema()

+-------+-------+-----+
|company| person|sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+

root
 |-- company: string (nullable = true)
 |-- person: string (nullable = true)
 |-- sales: double (nullable = true)



In [12]:
## Show column names (attribute)
## `columns` is an attribute, not a method call; it evaluates to a list of column-name strings
df.columns

['company', 'person', 'sales']

In [13]:
## Statistical Summary of df
## describe() builds a new DataFrame holding count, mean, stddev, min and max per column
summary_stats = df.describe()
summary_stats.show()

+-------+-------+-------+------------------+
|summary|company| person|             sales|
+-------+-------+-------+------------------+
|  count|     12|     12|                12|
|   mean|   null|   null| 360.5833333333333|
| stddev|   null|   null|250.08742410799007|
|    min|   APPL|  Chris|             120.0|
|    max|   MSFT|Vanessa|             870.0|
+-------+-------+-------+------------------+



In [16]:
## Basic GroupBy - groupBy() returns a pyspark.sql.group.GroupedData object
## The same grouped object exposes many aggregation methods, so build it once and reuse it
by_company = df.groupBy('company')
by_company.mean().show()
by_company.min().show()
by_company.max().show()
by_company.count().show()

+-------+-----------------+
|company|       avg(sales)|
+-------+-----------------+
|   APPL|            370.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+

+-------+----------+
|company|min(sales)|
+-------+----------+
|   APPL|     130.0|
|   GOOG|     120.0|
|     FB|     350.0|
|   MSFT|     124.0|
+-------+----------+

+-------+----------+
|company|max(sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+

+-------+-----+
|company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



In [19]:
## Agg - aggregate across all rows (no grouping)
## The dict maps a column name to the name of the aggregate function to apply to it
total_sales = df.agg({'sales': 'sum'})  # Sum of all sales
total_sales.show()

+----------+
|sum(sales)|
+----------+
|    4327.0|
+----------+



In [21]:
## Using agg on grouped data object
## The {column: function} dict is evaluated once per company group
grouped_data = df.groupBy('company')
sales_per_company = grouped_data.agg({'sales': 'sum'})
sales_per_company.show()

+-------+----------+
|company|sum(sales)|
+-------+----------+
|   APPL|    1480.0|
|   GOOG|     660.0|
|     FB|    1220.0|
|   MSFT|     967.0|
+-------+----------+



In [31]:
## Using pyspark.sql.functions (imported above)
## Each function call builds a Column expression; select() evaluates it over the whole frame
mean_sales = avg('sales')
distinct_people = countDistinct('person')
sales_std_col = stddev('sales').alias('std')  # alias() gives the output column a cleaner name

df.select(mean_sales).show()
df.select(distinct_people).show()
df.select(sales_std_col).show()

+-----------------+
|       avg(sales)|
+-----------------+
|360.5833333333333|
+-----------------+

+----------------------+
|count(DISTINCT person)|
+----------------------+
|                    12|
+----------------------+

+------------------+
|               std|
+------------------+
|250.08742410799007|
+------------------+



In [36]:
## Formatting pyspark.sql.functions using format_number
## First compute the raw standard deviation under the column name 'std'
raw_std_col = stddev('sales').alias('std')
sales_std = df.select(raw_std_col)
sales_std.show()

## format_number rounds to 2 decimal places here; re-alias so the column is still called 'std'
std_two_decimals = format_number('std', 2).alias('std')
sales_std.select(std_two_decimals).show()

+------------------+
|               std|
+------------------+
|250.08742410799007|
+------------------+

+------+
|   std|
+------+
|250.09|
+------+



In [38]:
## Basic Ordering
## orderBy sorts ascending by default
df.orderBy('sales').show()

## Descending order: call desc() on the column being sorted
sales_descending = df['sales'].desc()
df.orderBy(sales_descending).show()

+-------+-------+-----+
|company| person|sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+

+-------+-------+-----+
|company| person|sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+

