# Analyze data in a data lake with Spark

In [1]:
%%pyspark

# explore csv data files using synapse serverless spark pool

df = spark.read.load('abfss://root@azuredatalakelabs.dfs.core.windows.net/DP500/csv/2019/2019.csv', format='csv'
##, header=True
##, inferschema=True
)

display(df.limit(5))

## Read data from all of the CSV files in the folder
- Define an explicit schema so that column names and data types are applied when the files are loaded.

In [2]:
%%pyspark

from pyspark.sql.types import *
from pyspark.sql.functions import *

orderSchema = StructType([
    StructField("SalesOrderNumber", StringType()),
    StructField("SalesOrderLineNumber", IntegerType()),
    StructField("OrderDate", DateType()),
    StructField("CustomerName", StringType()),
    StructField("Email", StringType()),
    StructField("Item", StringType()),
    StructField("Quantity", IntegerType()),
    StructField("UnitPrice", FloatType()),
    StructField("Tax", FloatType())
    ])

df = spark.read.load('abfss://root@azuredatalakelabs.dfs.core.windows.net/DP500/csv/*/*.csv', format='csv', schema=orderSchema)
display(df.limit(5))

In [3]:
# Print the DataFrame's schema to confirm the column names and data types in use
df.printSchema()

In [4]:
%%pyspark

# Selecting fields
customers = df['CustomerName', 'Email']

# display number of customers
print(str(customers.count()) + " of customers")

# display distinct number of customers
print(str(customers.distinct().count()) + " of distinct customers")

# display 5 distinct number of customers
display(customers.distinct().limit(5))

## Aggregate and group data in a dataframe

In [5]:
# Reduce every row to the calendar year of its OrderDate ...
order_years = df.select(year("OrderDate").alias("Year"))

# ... then count the rows per year, sorted chronologically
yearlySales = order_years.groupBy("Year").count().orderBy("Year")

display(yearlySales)

## Query data using Spark SQL
**createOrReplaceTempView**
- Spark SQL temporary views are lazily evaluated: a view's data is not persisted in memory unless you cache the dataset by using the `cache()` method.

In [6]:
# Expose the DataFrame to Spark SQL under the name "salesorders"
df.createOrReplaceTempView("salesorders")

# Query the view through the SQL API; the result is again a Spark DataFrame
preview_query = "SELECT * FROM salesorders"
spark_df = spark.sql(preview_query)

# Show only the first five rows
display(spark_df.limit(5))

In [7]:
%%sql
-- Gross revenue per order year: (unit price x quantity) + tax, summed over all rows of the salesorders view
SELECT YEAR(OrderDate) AS OrderYear,
       SUM((UnitPrice * Quantity) + Tax) AS GrossRevenue
FROM salesorders
GROUP BY YEAR(OrderDate)
ORDER BY OrderYear;

## View results as a built-in chart

In [8]:
%%sql
-- Return every column of the salesorders view so the result can be viewed as a built-in chart
SELECT * FROM salesorders

## You can leverage matplotlib
- Matplotlib is an amazing visualization library in Python for 2D plots of arrays

In [9]:
# Gross revenue per order year from the salesorders view.
# The year is cast to text (presumably so the charts below treat it as a
# category label rather than a number).
# A triple-quoted string is used instead of backslash continuations inside the
# literal: a stray space after a trailing backslash would break the statement.
sqlQuery = """
    SELECT CAST(YEAR(OrderDate) AS CHAR(4)) AS OrderYear,
           SUM((UnitPrice * Quantity) + Tax) AS GrossRevenue
    FROM salesorders
    GROUP BY CAST(YEAR(OrderDate) AS CHAR(4))
    ORDER BY OrderYear
"""
df_spark = spark.sql(sqlQuery)

# Print the aggregated result as a plain-text table
df_spark.show()

In [10]:
from matplotlib import pyplot as plt

# matplotlib works on pandas/NumPy data, not on Spark DataFrames, so convert
# the per-year aggregate to a pandas DataFrame first
df_sales = df_spark.toPandas()

# Create a bar plot of revenue by year
plt.bar(x=df_sales['OrderYear'], height=df_sales['GrossRevenue'])

# Title and axis labels so the figure can be understood on its own
plt.title('Revenue by Year')
plt.xlabel('Year')
plt.ylabel('Gross Revenue')

# Display the plot
plt.show()

In [11]:
# Clear the plot area
plt.clf()

# Create a figure for 2 subplots (1 row, 2 columns)
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

# Create a bar plot of revenue by year on the first axis
ax[0].bar(x=df_sales['OrderYear'], height=df_sales['GrossRevenue'], color='orange')
ax[0].set_title('Revenue by Year')

# Create a pie chart of yearly order counts on the second axis.
# df_sales holds exactly one row per year, so value_counts() on its OrderYear
# column is 1 for every year and the pie would always show equal slices.
# Count the distinct orders per year from the salesorders view instead.
orders_by_year = spark.sql("""
    SELECT CAST(YEAR(OrderDate) AS CHAR(4)) AS OrderYear,
           COUNT(DISTINCT SalesOrderNumber) AS OrderCount
    FROM salesorders
    GROUP BY CAST(YEAR(OrderDate) AS CHAR(4))
    ORDER BY OrderYear
""").toPandas()

# Series of order counts indexed by year, used for both the slices and the legend
yearly_counts = orders_by_year.set_index('OrderYear')['OrderCount']
ax[1].pie(yearly_counts)
ax[1].set_title('Orders per Year')
ax[1].legend(yearly_counts.keys().tolist())

# Add a title to the Figure
fig.suptitle('Sales Data')

# Show the figure
plt.show()

## Seaborn is an amazing visualization library for statistical graphics plotting in Python.

In [12]:
import seaborn as sns

# Start from an empty plot area
plt.clf()

# Bar chart of gross revenue per order year, drawn by seaborn from the pandas frame
ax = sns.barplot(data=df_sales, x="OrderYear", y="GrossRevenue")

# Render the chart
plt.show()

In [13]:
# Start from an empty plot area
plt.clf()

# Line chart of gross revenue per order year, drawn by seaborn from the pandas frame
ax = sns.lineplot(data=df_sales, x="OrderYear", y="GrossRevenue")

# Render the chart
plt.show()