In [1]:
import os

# Coordinates of the Spark build used by this notebook. These stay as plain
# Python names because later cells reference them (e.g. `$SPARK_FILE`).
SPARK_VERSION = "spark-3.2.0"
HADOOP_VERSION = "hadoop2.7"
_SPARK_BUILD = f"{SPARK_VERSION}-bin-{HADOOP_VERSION}"
SPARK_FILE = f"{_SPARK_BUILD}.tgz"

# Export everything the shell commands and Spark tooling read from the
# process environment in a single update.
os.environ.update(
    {
        # Download URL of the Spark tarball on the Apache archive.
        "SPARK_DISTRO": f"https://archive.apache.org/dist/spark/{SPARK_VERSION}/{SPARK_FILE}",
        # Directory the tarball unpacks into (the notebook runs from /content).
        "SPARK_HOME": f"/content/{_SPARK_BUILD}",
        # JVM location; presumably matches where the JAVA_DISTRO package installs - confirm.
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        # apt package name installed by the setup cell.
        "JAVA_DISTRO": "openjdk-8-jdk-headless",
    }
)

In [2]:
# INSTALL SPARK

!SPARK_VERSION=spark-3.2.0
!HADOOP_VERSION=hadoop2.7
!apt-get install $JAVA_DISTRO -qq > /dev/null
!wget -q $SPARK_DISTRO
!tar xf $SPARK_FILE
!pip -q install -q findspark
!pip -q install pyspark

# GATHER DATA

!wget -q https://github.com/lexerdev/pairing-sessions/raw/main/data/orders.csv
!wget -q https://github.com/lexerdev/pairing-sessions/raw/main/data/customers.csv

In [3]:
# Locate the Spark installation before `pyspark` is imported in the next cell.
import findspark
# NOTE(review): presumably picks up SPARK_HOME exported in the first cell and
# makes the bundled pyspark importable - confirm against the findspark docs.
findspark.init()
# Last expression of the cell: displays the Spark home path findspark located,
# which should equal the SPARK_HOME value configured above.
findspark.find()

'/content/spark-3.2.0-bin-hadoop2.7'

In [4]:
from pyspark.sql import SparkSession

# Create the session for this notebook, or reuse one that is already running.
spark = SparkSession.builder.appName("Spark Session").getOrCreate()

# Eager evaluation lets a DataFrame that ends a cell render as a table
# without an explicit `.show()`.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Only the final expression of a cell is displayed, so the session summary
# has to come last (it was previously a no-op in the middle of the cell).
spark

## Customer Data

This is a Brazilian ecommerce public dataset of orders made at [Olist Store](http://www.olist.com/). The dataset has information on 100k orders from 2016 to 2018 made at multiple marketplaces in Brazil.

Its features allow viewing an order from multiple dimensions: from **order status**, price, payment and freight performance to customer location, product attributes and finally reviews written by customers. We also released a geolocation dataset that relates Brazilian zip codes to lat/lng coordinates.

This is real commercial data that has been anonymised.

**Entity Relationship Diagram**

Below is an Entity Relationship Diagram of the available data:

![ERD](https://github.com/lexerdev/pairing-sessions/raw/main/images/erd.png)


**Data Dictionary**

**customers**

* `customer_id`:  Customer ID number
* `customer_unique_id`: Unique ID of the Customer from resolution
* `customer_zip_code_prefix`: Zip code of customer
* `customer_city`: City of customer
* `customer_state`: State of customer

**orders**


* `order_id`: Order ID number
* `customer_id`:  Customer ID number
* `order_status`: Status of order process
* `order_purchase_timestamp`: Time the product was ordered
* `order_approved_at`: Time of payment for the order
* `order_delivered_carrier_date`: Time the logistics carrier was notified during order processing
* `order_delivered_customer_date`: Time the product arrived at the customer
* `order_estimated_delivery_date`: Estimated arrival time of the order

**order_items**

* `order_item_id`: The number of the product in the order (e.g. if there are three items in the order, the order_item_id is 1, 2, 3)
* `product_id`: Product ID number
* `seller_id`: Seller ID number
* `shipping_limit_date`: The deadline for the seller to deliver the goods to the logistics carrier
* `price`: Product price
* `freight_value`: Delivery fee

In [5]:
def _load_csv_as_view(csv_path, view_name):
    """Read a CSV file into a DataFrame and expose it to Spark SQL.

    The first row is used as the header and column types are inferred from
    the data. The DataFrame is registered (or re-registered) as the temp
    view `view_name` and returned.
    """
    frame = spark.read.csv(csv_path, header=True, inferSchema=True)
    frame.createOrReplaceTempView(view_name)
    return frame


orders = _load_csv_as_view('/content/orders.csv', 'orders')
customers = _load_csv_as_view('/content/customers.csv', 'customers')

In [6]:
# Preview the `orders` temp view; the resulting DataFrame is the cell's output.
orders_preview_sql = '''
    select *
    from orders
    '''
spark.sql(orders_preview_sql)

order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
e481f51cbdc54678b...,9ef432eb625129730...,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
53cdb2fc8bc7dce0b...,b0830fb4747a6c6d2...,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
47770eb9100c2d0c4...,41ce2a54c0b03bf34...,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
949d5b44dbf5de918...,f88197465ea7920ad...,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
ad21c59c0840e6cb8...,8ab97904e6daea886...,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00
a4591c265e18cb1dc...,503740e9ca751ccdd...,delivered,2017-07-09 21:57:05,2017-07-09 22:10:13,2017-07-11 14:58:04,2017-07-26 10:57:55,2017-08-01 00:00:00
136cce7faa42fdb2c...,ed0271e0b7da060a3...,invoiced,2017-04-11 12:22:08,2017-04-13 13:25:17,,,2017-05-09 00:00:00
6514b8ad8028c9f2c...,9bdf08b4b3b52b552...,delivered,2017-05-16 13:10:30,2017-05-16 13:22:11,2017-05-22 10:07:46,2017-05-26 12:55:51,2017-06-07 00:00:00
76c6e866289321a7c...,f54a9f0e6b351c431...,delivered,2017-01-23 18:29:09,2017-01-25 02:50:47,2017-01-26 14:16:31,2017-02-02 14:08:10,2017-03-06 00:00:00
e69bfb5eb88e0ed6a...,31ad1d1b63eb99624...,delivered,2017-07-29 11:55:02,2017-07-29 12:05:32,2017-08-10 19:45:24,2017-08-16 17:14:30,2017-08-23 00:00:00


In [7]:
# Preview the `customers` temp view; the resulting DataFrame is the cell's output.
customers_preview_sql = '''
    select *
    from customers
    '''
spark.sql(customers_preview_sql)

customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
06b8999e2fba1a1fb...,861eff4711a542e4b...,14409,franca,SP
18955e83d337fd6b2...,290c77bc529b7ac93...,9790,sao bernardo do c...,SP
4e7b3e00288586ebd...,060e732b5b29e8181...,1151,sao paulo,SP
b2b6027bc5c5109e5...,259dac757896d24d7...,8775,mogi das cruzes,SP
4f2d8ab171c80ec83...,345ecd01c38d18a90...,13056,campinas,SP
879864dab9bc30475...,4c93744516667ad3b...,89254,jaragua do sul,SC
fd826e7cf63160e53...,addec96d2e059c80c...,4534,sao paulo,SP
5e274e7a0c3809e14...,57b2a98a409812fe9...,35182,timoteo,MG
5adf08e34b2e99398...,1175e95fb47ddff9d...,81560,curitiba,PR
4b7139f34592b3a31...,9afe194fb833f79e3...,30575,belo horizonte,MG


## Questions