In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower
from pyspark.sql.types import StringType

In [2]:
# Create the Spark session for this notebook, or reuse one that is already
# active (getOrCreate). Every DataFrame below is read through this session.
spark = (
    SparkSession.builder
    .appName("Fetch Rewards Exercise")
    .getOrCreate()
)

25/05/14 22:29:00 WARN Utils: Your hostname, Johns-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.130 instead (on interface en0)
25/05/14 22:29:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/14 22:29:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the Brands file with the JSON reader. No schema is passed, so Spark
# infers one from the data.
brandsDF = spark.read.format("json").load("brands.jsonl")

In [4]:
# Print the column tree (names, types, nullability) of the loaded Brands data.
brandsDF.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- barcode: string (nullable = true)
 |-- brandCode: string (nullable = true)
 |-- category: string (nullable = true)
 |-- categoryCode: string (nullable = true)
 |-- cpg: struct (nullable = true)
 |    |-- $id: struct (nullable = true)
 |    |    |-- $oid: string (nullable = true)
 |    |-- $ref: string (nullable = true)
 |-- name: string (nullable = true)
 |-- topBrand: boolean (nullable = true)



#### Print the number of lines of JSON objects, i.e. number of Brands in the file

In [5]:
# Total number of rows in brandsDF; used as the denominator for the
# missing-field counts that follow.
brandsDF.count()

1167

#### Roughly half the Brand JSON objects do not have a field for `topBrand`
* This means that when generating metrics from Receipts you can't group by this field on all brands
* Many Brands do have a `topBrand` field explicitly set to `false`, so we can't assume that a Brand without the field is not a top brand

For reference, prefixing a column condition with the `~` character negates it

In [6]:
# Count Brands whose `topBrand` is null, i.e. the field was absent (or
# explicitly null) in the source JSON.
(
    brandsDF
    .where(col("topBrand").isNull())
    .count()
)

612

Print the number of Brands that have a `topBrand` field which is set to `false`

In [7]:
# Count Brands whose `topBrand` is explicitly false. `~` negates the boolean
# column; negating null still yields null, so rows with no `topBrand` value
# are not matched here.
(
    brandsDF
    .where(~col("topBrand"))
    .count()
)

524

Print the number of Brands that have a `topBrand` field which is set to `true`

In [8]:
# Count Brands whose `topBrand` is explicitly true. The boolean column is
# used directly as the predicate, so null rows are excluded.
(
    brandsDF
    .where(col("topBrand"))
    .count()
)

31

#### Many Brands do not have a field for `category`
* While this is not necessary based on the data model, it is still a lack of useful information


In [9]:
# Count Brands with no `category` value.
(
    brandsDF
    .where(col("category").isNull())
    .count()
)

155

#### More than half of Brands do not have a field for `categoryCode`
* This field likely is generated from a foreign key and therefore is very valuable for calculating metrics
* Comparing Receipt Items by Category for Brands that are in the same category could easily be done if all Brands had a corresponding code


In [10]:
# Count Brands with no `categoryCode` value.
(
    brandsDF
    .where(col("categoryCode").isNull())
    .count()
)

650

#### Many Brands have a `brandCode` that is only a numerical value identical to the `barcode` field
* This calls into question if the `brandCode` field has any value


In [11]:
# Count Brands whose `brandCode` is exactly the same string as `barcode`
# (both columns are strings in the inferred schema). Rows where either side
# is null do not satisfy the equality and are not counted.
(
    brandsDF
    .where(col("brandCode") == col("barcode"))
    .count()
)

54

#### Many of the Brands in the file appear to be test data
* Their `name` contains the word "test" (case-insensitive), which could be an indication that test data is mixed in with production data

In [12]:
# Replace `name` with its lower-cased form so the substring search on it is
# case-insensitive. brandsDF itself is left untouched.
lowercaseDF = brandsDF.withColumn(
    "name",
    lower(col("name")),
)

In [13]:
# Count Brands whose lower-cased `name` contains the substring "test".
# NOTE(review): this is a plain substring match, so a name that merely embeds
# "test" (e.g. "latest") would also be counted — confirm against the data.
(
    lowercaseDF
    .where(col("name").contains("test"))
    .count()
)

432