## Import the PySpark libraries

In [29]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import DateType

## Set up a Spark session
The SparkSession is the entry point for high-level Spark functionality.
It is created using `SparkSession.builder`.

In [22]:
# Build a local session using 4 worker threads, or reuse one that already exists.
spk = (
    SparkSession.builder
    .master("local[4]")
    .appName("globalSales")
    .getOrCreate()
)

In [16]:
# Keep a handle to the session's SparkContext (read from its `sparkContext` attribute).
spk_context = spk.sparkContext


The SparkContext is the entry point for low-level Spark functionality. It represents the connection to a Spark cluster and allows you to create RDDs (Resilient Distributed Datasets).

## Read our dataset

In [59]:
# Load the sales CSV. header=True takes column names from the first line and
# inferSchema=True lets Spark work out each column's type from the data.
# The path uses a forward slash: '\g' is not a valid Python escape sequence
# (it triggers an invalid-escape warning) and a backslash separator only
# resolves on Windows.
global_df = spk.read.csv('data/global_sales_records.csv', header=True, inferSchema=True)

# Preview the first five rows.
global_df.show(5)

+--------------------+--------------------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|              Region|             Country|    Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|
+--------------------+--------------------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|Middle East and N...|          Azerbaijan|       Snacks|       Online|             C| 10/8/2014|535113847|10/23/2014|       934|    152.58|    97.44|    142509.72|  91008.96|    51500.76|
|Central America a...|              Panama|    Cosmetics|      Offline|             L| 2/22/2015|874708545| 2/27/2015|      4551|     437.2|   263.33|    1989697.2|1198414.83|   791282.37|
|  Sub-Saharan Africa|Sao Tome and Prin...|       Fruit

### Data Information

Get to know your data: its size, schema, and column types.

In [53]:
# Number of rows and columns
print(
    "Data Shape",
    f"Rows: {global_df.count()}",
    f"Columns: {len(global_df.columns)}",
    sep="\n",
)

Data Shape
Rows: 100000
Columns: 14


In [54]:
# Print the schema as a tree: one line per column with its type and nullability.
global_df.printSchema()

root
 |-- Region: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Item Type: string (nullable = true)
 |-- Sales Channel: string (nullable = true)
 |-- Order Priority: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Order ID: integer (nullable = true)
 |-- Ship Date: string (nullable = true)
 |-- Units Sold: integer (nullable = true)
 |-- Unit Price: double (nullable = true)
 |-- Unit Cost: double (nullable = true)
 |-- Total Revenue: double (nullable = true)
 |-- Total Cost: double (nullable = true)
 |-- Total Profit: double (nullable = true)



In [25]:
# The same type information as a plain list of (column name, type name) pairs.
print(global_df.dtypes)

[('Region', 'string'), ('Country', 'string'), ('Item Type', 'string'), ('Sales Channel', 'string'), ('Order Priority', 'string'), ('Order Date', 'string'), ('Order ID', 'int'), ('Ship Date', 'string'), ('Units Sold', 'int'), ('Unit Price', 'double'), ('Unit Cost', 'double'), ('Total Revenue', 'double'), ('Total Cost', 'double'), ('Total Profit', 'double')]


## Perform some transformations

In [60]:
# Rename the columns, to remove the spaces
def rename_columns_remove_spaces(df: DataFrame) -> DataFrame:
    """Return ``df`` with every space in its column names replaced by an underscore.

    Args:
        df: DataFrame whose column names may contain spaces.

    Returns:
        The DataFrame obtained by renaming each of ``df``'s columns in turn;
        names without spaces are renamed to themselves.
    """
    renamed = df
    # Iterate over the original names; each rename is applied to the running result.
    for original_name in df.columns:
        renamed = renamed.withColumnRenamed(
            original_name, original_name.replace(" ", "_")
        )
    return renamed

# Rebind global_df to the renamed frame so later cells can use names such as Order_Date.
global_df = rename_columns_remove_spaces(global_df)

In [56]:
# Convert Order_Date and Ship_Date from string to DateType.
# The CSV stores dates month-first without zero padding (e.g. 10/8/2014,
# 10/23/2014), so the pattern must be "M/d/yyyy". A day-first "dd/MM/yyyy"
# pattern cannot represent 10/23/2014 and would fail on or corrupt these columns.
global_df = global_df.withColumn("Order_Date", F.to_date(F.col("Order_Date"), "M/d/yyyy"))
global_df = global_df.withColumn("Ship_Date", F.to_date(F.col("Ship_Date"), "M/d/yyyy"))

### Transforming the `Order_Date` and `Ship_Date`

In [66]:
# Parse both date columns with the month/day/year pattern used in the CSV.
global_df = (
    global_df
    .withColumn("Order_Date", F.to_date(F.col("Order_Date"), "M/d/yyyy"))
    .withColumn("Ship_Date", F.to_date(F.col("Ship_Date"), "M/d/yyyy"))
)

Extract the year, month name, weekday, and quarter from `Order_Date`, and compute the difference in **days** between the order date and the ship date.

In [76]:
# Derive calendar features from Order_Date, plus the order-to-ship lag.
global_df = (
    global_df
    # Calendar year as a number, e.g. 2010
    .withColumn("Order_Year", F.year("Order_Date"))
    # Full month name ("MMMM"), e.g. January
    .withColumn("Order_Month", F.date_format("Order_Date", "MMMM"))
    # Abbreviated weekday name ("E"), e.g. Fri
    .withColumn("Order_Day", F.date_format("Order_Date", "E"))
    # Quarter of the year as a number
    .withColumn("Order_Quarter", F.quarter("Order_Date"))
    # Number of days from Order_Date to Ship_Date
    .withColumn("Shipping_Time_days", F.datediff("Ship_Date", "Order_Date"))
)

In [78]:
# Show the five earliest orders: sort by Order_Date (ascending) and display 5 rows.
# orderBy returns a new DataFrame; global_df itself is not reordered.
global_df.orderBy("Order_Date").show(5)

+--------------------+--------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+-----------+----------+---------+-------------+------------------+
|              Region| Country|    Item_Type|Sales_Channel|Order_Priority|Order_Date| Order_ID| Ship_Date|Units_Sold|Unit_Price|Unit_Cost|Total_Revenue|Total_Cost|Total_Profit|Order_Month|Order_Year|Order_Day|Order_Quarter|Shipping_Time_days|
+--------------------+--------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+-----------+----------+---------+-------------+------------------+
|              Europe| Finland|    Baby Food|       Online|             H|2010-01-01|431350123|2010-01-09|      4942|    255.28|   159.42|   1261593.76| 787853.64|   473740.12|    January|      2010|      Fri|            1|                 8|
|Central America a...|   Hai

In [77]:
# Display the first two rows of the transformed frame, including the derived columns.
global_df.show(2)

+--------------------+----------+---------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+-----------+----------+---------+-------------+------------------+
|              Region|   Country|Item_Type|Sales_Channel|Order_Priority|Order_Date| Order_ID| Ship_Date|Units_Sold|Unit_Price|Unit_Cost|Total_Revenue|Total_Cost|Total_Profit|Order_Month|Order_Year|Order_Day|Order_Quarter|Shipping_Time_days|
+--------------------+----------+---------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+-----------+----------+---------+-------------+------------------+
|Middle East and N...|Azerbaijan|   Snacks|       Online|             C|2014-10-08|535113847|2014-10-23|       934|    152.58|    97.44|    142509.72|  91008.96|    51500.76|    October|      2014|      Wed|            4|                15|
|Central America a...|    Panama|Cos