In [None]:
# Evaluate `spark` so the notebook displays it. The name is not defined in
# this notebook -- presumably the SparkSession injected by the kernel; confirm.
spark

# Extract data from source to datalake

In this exercise, we will extract data from the data source, a MySQL server database called `classicmodels`, and load the extracted data into the data lake. We will process this data in later exercises.

![Data Model](https://www.mysqltutorial.org/wp-content/uploads/2009/12/MySQL-Sample-Database-Schema.png)

The extracted data should be located at `s3a://datalake/exercises/raw/classicmodels/<table_name>.parquet`.

In [None]:
from pyspark.sql import DataFrame

def load_table_df(
    table_name: str,
    *,
    url: str = "jdbc:mysql://localhost/classicmodels",
    user: str = "root",
    password: str = "",
    driver: str = "com.mysql.jdbc.Driver",
) -> DataFrame:
    """Read one table from the source database through Spark's JDBC reader.

    The connection settings are keyword-only and default to the values this
    notebook has always used, so ``load_table_df("orders")`` behaves exactly
    as before while other hosts or credentials can be supplied explicitly.

    Args:
        table_name: Value passed as the JDBC ``dbtable`` option.
        url: JDBC connection URL.
        user: Database user name.
        password: Database password.
        driver: Fully qualified class name of the JDBC driver.

    Returns:
        The DataFrame returned by the JDBC reader's ``load()``.
    """
    # `spark` is not defined in this notebook; it is assumed to be the
    # session object provided by the kernel.
    return (
        spark
            .read
            .format("jdbc")
            .option("driver", driver)
            .option("url", url)
            .option("dbtable", table_name)
            .option("user", user)
            .option("password", password)
            .load()
    )

## productlines table

In [None]:
# Build the DataFrame for the `productlines` source table.
productlines = load_table_df(table_name="productlines")

In [None]:
# Write `productlines` as Parquet, overwriting whatever is already at this path.
productlines.write.mode("overwrite").parquet(
    "s3a://datalake/exercises/raw/classicmodels/productlines.parquet"
)

In [None]:
# Read the Parquet output back and count its rows.
row_count = (
    spark
        .read
        .format("parquet")
        .load("s3a://datalake/exercises/raw/classicmodels/productlines.parquet")
        .count()
)

In [None]:
# The row count read back from the lake must be exactly 7.
assert row_count == 7, (
    "Productlines was not loaded complete."
)
print("Productlines was loaded complete.")

## products table

In [None]:
# Build the DataFrame for the `products` source table.
products = load_table_df(table_name="products")

In [None]:
# Write `products` as Parquet, overwriting whatever is already at this path.
products.write.mode("overwrite").parquet(
    "s3a://datalake/exercises/raw/classicmodels/products.parquet"
)

In [None]:
# Read the Parquet output back and count its rows.
row_count = (
    spark
        .read
        .format("parquet")
        .load("s3a://datalake/exercises/raw/classicmodels/products.parquet")
        .count()
)

In [None]:
# The row count read back from the lake must be exactly 110.
assert row_count == 110, (
    "Products was not loaded complete."
)
print("Products was loaded complete.")

## offices table

In [None]:
# Build the DataFrame for the `offices` source table.
offices = load_table_df(table_name="offices")

In [None]:
# Write `offices` as Parquet, overwriting whatever is already at this path.
offices.write.mode("overwrite").parquet(
    "s3a://datalake/exercises/raw/classicmodels/offices.parquet"
)

In [None]:
# Read the Parquet output back and count its rows.
row_count = (
    spark
        .read
        .format("parquet")
        .load("s3a://datalake/exercises/raw/classicmodels/offices.parquet")
        .count()
)

In [None]:
# The row count read back from the lake must be exactly 7.
assert row_count == 7, (
    "Offices was not loaded complete."
)
print("Offices was loaded complete.")

## employees table

In [None]:
# Build the DataFrame for the `employees` source table.
employees = load_table_df(table_name="employees")

In [None]:
# Write `employees` as Parquet, overwriting whatever is already at this path.
employees.write.mode("overwrite").parquet(
    "s3a://datalake/exercises/raw/classicmodels/employees.parquet"
)

In [None]:
# Read the Parquet output back and count its rows.
row_count = (
    spark
        .read
        .format("parquet")
        .load("s3a://datalake/exercises/raw/classicmodels/employees.parquet")
        .count()
)

In [None]:
# The row count read back from the lake must be exactly 23.
assert row_count == 23, (
    "Employees was not loaded complete."
)
print("Employees was loaded complete.")

## customers table

In [None]:
# Build the DataFrame for the `customers` source table.
customers = load_table_df(table_name="customers")

In [None]:
# Write `customers` as Parquet, overwriting whatever is already at this path.
customers.write.mode("overwrite").parquet(
    "s3a://datalake/exercises/raw/classicmodels/customers.parquet"
)

In [None]:
# Read the Parquet output back and count its rows.
row_count = (
    spark
        .read
        .format("parquet")
        .load("s3a://datalake/exercises/raw/classicmodels/customers.parquet")
        .count()
)

In [None]:
# The row count read back from the lake must be exactly 122.
assert row_count == 122, (
    "Customers was not loaded complete."
)
print("Customers was loaded complete.")

## payments table

In [None]:
# Build the DataFrame for the `payments` source table.
payments = load_table_df(table_name="payments")

In [None]:
# Write `payments` as Parquet, overwriting whatever is already at this path.
payments.write.mode("overwrite").parquet(
    "s3a://datalake/exercises/raw/classicmodels/payments.parquet"
)

In [None]:
# Read the Parquet output back and count its rows.
row_count = (
    spark
        .read
        .format("parquet")
        .load("s3a://datalake/exercises/raw/classicmodels/payments.parquet")
        .count()
)

In [None]:
# The row count read back from the lake must be exactly 273.
assert row_count == 273, (
    "Payments was not loaded complete."
)
print("Payments was loaded complete.")

## orders table

In [None]:
# Build the DataFrame for the `orders` source table.
orders = load_table_df(table_name="orders")

In [None]:
# Write `orders` as Parquet, overwriting whatever is already at this path.
orders.write.mode("overwrite").parquet(
    "s3a://datalake/exercises/raw/classicmodels/orders.parquet"
)

In [None]:
# Read the Parquet output back and count its rows.
row_count = (
    spark
        .read
        .format("parquet")
        .load("s3a://datalake/exercises/raw/classicmodels/orders.parquet")
        .count()
)

In [None]:
# The row count read back from the lake must be exactly 326.
assert row_count == 326, (
    "Orders was not loaded complete."
)
print("Orders was loaded complete.")

## orderdetails table

In [None]:
# Build the DataFrame for the `orderdetails` source table.
orderdetails = load_table_df(table_name="orderdetails")

In [None]:
# Write `orderdetails` as Parquet, overwriting whatever is already at this path.
orderdetails.write.mode("overwrite").parquet(
    "s3a://datalake/exercises/raw/classicmodels/orderdetails.parquet"
)

In [None]:
# Read the Parquet output back and count its rows.
row_count = (
    spark
        .read
        .format("parquet")
        .load("s3a://datalake/exercises/raw/classicmodels/orderdetails.parquet")
        .count()
)

In [None]:
# The row count read back from the lake must be exactly 2996.
assert row_count == 2996, (
    "Orderdetails was not loaded complete"
)
print("Orderdetails was loaded complete.")