In [None]:
!pip install -q pyspark

[K     |████████████████████████████████| 281.4 MB 54 kB/s 
[K     |████████████████████████████████| 199 kB 68.5 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create a Spark session, or reuse the one already running, with a local master
# ("local[*]" = run in-process with one worker thread per available core).
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

In [None]:
# Generate a single-column DataFrame of 10,000,000 consecutive integers
# and name that column "number".
df = spark.range(10_000_000).toDF("number")

In [None]:
# Row count of the generated DataFrame; it should equal the value passed to spark.range.
df.count()

10000000

# Load the CSV Dataset

In [None]:
# Read the semicolon-separated file without a schema: every column comes back
# as a string with a positional name (_c0, _c1, ...).
df = spark.read.csv("retail-data.csv", sep=";")
df.printSchema()  # inspect the column names and types Spark assigned

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)



In [None]:
# Number of records read from retail-data.csv.
df.count()

3108

In [None]:
# Preview the first five records of the raw, string-typed load.
df.show(n=5)

+------+--------------------+----+-------+----+
|   _c0|                 _c1| _c2|    _c3| _c4|
+------+--------------------+----+-------+----+
|536365|WHITE HANGING HEA...|2.55|17850.0|null|
|536365| WHITE METAL LANTERN|3.39|17850.0|null|
|536365|CREAM CUPID HEART...|2.75|17850.0|null|
|536365|KNITTED UNION FLA...|3.39|17850.0|null|
|536365|RED WOOLLY HOTTIE...|3.39|17850.0|null|
+------+--------------------+----+-------+----+
only showing top 5 rows



In [None]:
# Drop the fifth column (_c4), then replace the positional column names
# with descriptive ones.
readable_names = {
    "_c0": "InvoiceNo",
    "_c1": "Description",
    "_c2": "Amount",
    "_c3": "CustomerID",
}

df = df.drop("_c4")
for positional_name, readable_name in readable_names.items():
    # withColumnRenamed returns a new DataFrame, so rebind df on every step.
    df = df.withColumnRenamed(positional_name, readable_name)

df.show(n=5)

+---------+--------------------+------+----------+
|InvoiceNo|         Description|Amount|CustomerID|
+---------+--------------------+------+----------+
|   536365|WHITE HANGING HEA...|  2.55|   17850.0|
|   536365| WHITE METAL LANTERN|  3.39|   17850.0|
|   536365|CREAM CUPID HEART...|  2.75|   17850.0|
|   536365|KNITTED UNION FLA...|  3.39|   17850.0|
|   536365|RED WOOLLY HOTTIE...|  3.39|   17850.0|
+---------+--------------------+------+----------+
only showing top 5 rows



In [None]:
from pyspark.sql.types import *

# Declare the column names and types up front so the CSV is parsed straight into
# named, typed columns instead of the string-typed _c0.._cN of a schema-less read.
# Only four fields are declared; the schema-less read of the same file showed a
# fifth column (_c4), which is not mapped here.
# NOTE(review): FloatType is single precision -- consider DoubleType/DecimalType
# if Amount is ever summed or compared exactly; confirm before changing.
manualSchema = (
    StructType()
    .add("InvoiceNo", IntegerType(), True)
    .add("Description", StringType(), True)
    .add("Amount", FloatType(), True)
    .add("CustomerID", StringType(), True)
)

df = spark.read.schema(manualSchema).csv("retail-data.csv", sep=";")
df.printSchema()  # verify that the declared names and types were applied

root
 |-- InvoiceNo: integer (nullable = true)
 |-- Description: string (nullable = true)
 |-- Amount: float (nullable = true)
 |-- CustomerID: string (nullable = true)



In [None]:
# Preview the first five records of the schema-typed load.
df.show(n=5)

+---------+--------------------+------+----------+
|InvoiceNo|         Description|Amount|CustomerID|
+---------+--------------------+------+----------+
|   536365|WHITE HANGING HEA...|  2.55|   17850.0|
|   536365| WHITE METAL LANTERN|  3.39|   17850.0|
|   536365|CREAM CUPID HEART...|  2.75|   17850.0|
|   536365|KNITTED UNION FLA...|  3.39|   17850.0|
|   536365|RED WOOLLY HOTTIE...|  3.39|   17850.0|
+---------+--------------------+------+----------+
only showing top 5 rows



# Example Essay: Counting Amount Records per Customer

In [None]:
import pyspark.sql.functions as func

# For each CustomerID, count the rows that have a non-null Amount and expose
# that count under the column name "Total" (a count, not a sum of amounts).
amount_count = func.count("Amount").alias("Total")
df_total = df.groupBy("CustomerID").agg(amount_count)

In [None]:
# Preview five of the per-customer counts (row order is not guaranteed after a group-by).
df_total.show(n=5)

+----------+-----+
|CustomerID|Total|
+----------+-----+
|   18085.0|    9|
|   17905.0|   23|
|   17377.0|   15|
|   17850.0|   84|
|   17181.0|    2|
+----------+-----+
only showing top 5 rows

