In [None]:
# Display the Spark session object as the cell output. `spark` is not created in the
# visible cells, so it is presumably injected by the kernel/startup script — TODO confirm.
spark

## Convert column name to convention

Raw data may use a variety of naming conventions for columns or fields; however, we should have a common convention for the data in our datalake so that data users can work with it intuitively and without confusion.

Our tables in raw have columns in the `camel-casing` convention, but we need the `snake-casing` convention in our datalake.

![Data Models](https://www.mysqltutorial.org/wp-content/uploads/2009/12/MySQL-Sample-Database-Schema.png)

For example, the columns of `productlines` need to change as follows:

- `productLine` changed to `product_line`
- `textDescription` changed to `text_description`
- `htmlDescription` changed to `html_description`
- `image` nothing to change

We need to transform the column names and store all transformed models in: `s3a://datalake/exercises/bronze/classicmodels/<table>.parquet`

## Productlines

In [None]:
%%sql

-- Expose the raw productlines Parquet data as a catalog table (schema inferred from the files).
-- EXTERNAL is omitted: open-source Spark does not allow it together with USING, and the
-- explicit LOCATION already makes the table unmanaged. IF NOT EXISTS keeps re-runs from failing.
create table if not exists raw_productlines
using parquet
location 's3a://datalake/exercises/raw/classicmodels/productlines.parquet'

In [None]:
%%sql

-- Bronze productlines table with snake_case column names, stored at the bronze location.
-- EXTERNAL is omitted (not allowed with USING in open-source Spark; LOCATION already makes
-- the table unmanaged) and IF NOT EXISTS keeps the cell re-runnable.
create table if not exists bronze_productlines (
    product_line string,
    text_description string,
    html_description string,
    image binary
)
using parquet
location 's3a://datalake/exercises/bronze/classicmodels/productlines.parquet'

In [None]:
%%sql

-- Replace the contents of bronze_productlines with every row of raw_productlines.
-- INSERT ... SELECT maps columns by position, not by name, so this relies on the raw
-- table having the same column order as the bronze table definition.
INSERT OVERWRITE TABLE bronze_productlines
SELECT * FROM raw_productlines

In [None]:
# Read the bronze output directly from its storage path and verify the column names.
df = spark.read.parquet("s3a://datalake/exercises/bronze/classicmodels/productlines.parquet")

expected_columns = ['html_description', 'image', 'product_line', 'text_description']
# The message lists the actual columns so a failure is diagnosable.
assert sorted(df.columns) == expected_columns, (
    f"unexpected columns in bronze productlines: {sorted(df.columns)}"
)

## Products

In [None]:
%%sql

-- Expose the raw products Parquet data as a catalog table (schema inferred from the files).
-- EXTERNAL is omitted: open-source Spark does not allow it together with USING, and the
-- explicit LOCATION already makes the table unmanaged. IF NOT EXISTS keeps re-runs from failing.
create table if not exists raw_products
using parquet
location 's3a://datalake/exercises/raw/classicmodels/products.parquet'

In [None]:
%%sql

-- Bronze products table with snake_case column names, stored at the bronze location.
-- EXTERNAL is omitted (not allowed with USING in open-source Spark; LOCATION already makes
-- the table unmanaged) and IF NOT EXISTS keeps the cell re-runnable.
create table if not exists bronze_products (
    product_code string,
    product_name string,
    product_line string,
    product_scale string,
    product_vendor string,
    product_description string,
    quantity_in_stock integer,
    buy_price decimal(10, 2),
    msrp decimal(10, 2)
)
using parquet
location 's3a://datalake/exercises/bronze/classicmodels/products.parquet'


In [None]:
%%sql

-- Replace the contents of bronze_products with every row of raw_products.
-- INSERT ... SELECT maps columns by position, not by name, so this relies on the raw
-- table having the same column order as the bronze table definition.
INSERT OVERWRITE TABLE bronze_products
SELECT * FROM raw_products
    

In [None]:
# Read the bronze output directly from its storage path and verify the column names.
df = spark.read.parquet("s3a://datalake/exercises/bronze/classicmodels/products.parquet")

expected_columns = [
    'buy_price', 'msrp', 'product_code',
    'product_description', 'product_line', 'product_name',
    'product_scale', 'product_vendor', 'quantity_in_stock',
]
# The message lists the actual columns so a failure is diagnosable.
assert sorted(df.columns) == expected_columns, (
    f"unexpected columns in bronze products: {sorted(df.columns)}"
)

## Employees

In [None]:
%%sql

-- Expose the raw employees Parquet data as a catalog table (schema inferred from the files).
-- EXTERNAL is omitted: open-source Spark does not allow it together with USING, and the
-- explicit LOCATION already makes the table unmanaged. IF NOT EXISTS keeps re-runs from failing.
create table if not exists raw_employees
using parquet
location 's3a://datalake/exercises/raw/classicmodels/employees.parquet'


In [None]:
%%sql

-- Bronze employees table with snake_case column names, stored at the bronze location.
-- EXTERNAL is omitted (not allowed with USING in open-source Spark; LOCATION already makes
-- the table unmanaged) and IF NOT EXISTS keeps the cell re-runnable.
create table if not exists bronze_employees (
    employee_number integer,
    last_name string,
    first_name string,
    extension string,
    email string,
    office_code string,
    reports_to integer,
    job_title string
)
using parquet
location 's3a://datalake/exercises/bronze/classicmodels/employees.parquet'


In [None]:
%%sql

-- Replace the contents of bronze_employees with every row of raw_employees.
-- INSERT ... SELECT maps columns by position, not by name, so this relies on the raw
-- table having the same column order as the bronze table definition.
INSERT OVERWRITE TABLE bronze_employees
SELECT * FROM raw_employees

In [None]:
# Read the bronze output directly from its storage path and verify the column names.
df = spark.read.parquet("s3a://datalake/exercises/bronze/classicmodels/employees.parquet")

expected_columns = [
    'email', 'employee_number', 'extension',
    'first_name', 'job_title', 'last_name',
    'office_code', 'reports_to',
]
# The message lists the actual columns so a failure is diagnosable.
assert sorted(df.columns) == expected_columns, (
    f"unexpected columns in bronze employees: {sorted(df.columns)}"
)

## Offices

In [None]:
%%sql

-- Expose the raw offices Parquet data as a catalog table (schema inferred from the files).
-- EXTERNAL is omitted: open-source Spark does not allow it together with USING, and the
-- explicit LOCATION already makes the table unmanaged. IF NOT EXISTS keeps re-runs from failing.
create table if not exists raw_offices
using parquet
location 's3a://datalake/exercises/raw/classicmodels/offices.parquet'


In [None]:
%%sql

-- Bronze offices table with snake_case column names, stored at the bronze location.
-- EXTERNAL is omitted (not allowed with USING in open-source Spark; LOCATION already makes
-- the table unmanaged) and IF NOT EXISTS keeps the cell re-runnable.
create table if not exists bronze_offices (
    office_code string,
    city string,
    phone string,
    address_line_1 string,
    address_line_2 string,
    state string,
    country string,
    postal_code string,
    territory string
)
using parquet
location 's3a://datalake/exercises/bronze/classicmodels/offices.parquet'


In [None]:
%%sql

-- Replace the contents of bronze_offices with every row of raw_offices.
-- INSERT ... SELECT maps columns by position, not by name, so this relies on the raw
-- table having the same column order as the bronze table definition.
INSERT OVERWRITE TABLE bronze_offices
SELECT * FROM raw_offices


In [None]:
# Read the bronze output directly from its storage path and verify the column names.
df = spark.read.parquet("s3a://datalake/exercises/bronze/classicmodels/offices.parquet")

expected_columns = [
    'address_line_1', 'address_line_2', 'city',
    'country', 'office_code', 'phone',
    'postal_code', 'state', 'territory',
]
# The message lists the actual columns so a failure is diagnosable.
assert sorted(df.columns) == expected_columns, (
    f"unexpected columns in bronze offices: {sorted(df.columns)}"
)

## Customers

In [None]:
%%sql

-- Expose the raw customers Parquet data as a catalog table (schema inferred from the files).
-- EXTERNAL is omitted: open-source Spark does not allow it together with USING, and the
-- explicit LOCATION already makes the table unmanaged. IF NOT EXISTS keeps re-runs from failing.
create table if not exists raw_customers
using parquet
location 's3a://datalake/exercises/raw/classicmodels/customers.parquet'


In [None]:
%%sql

-- Bronze customers table with snake_case column names, stored at the bronze location.
-- EXTERNAL is omitted (not allowed with USING in open-source Spark; LOCATION already makes
-- the table unmanaged) and IF NOT EXISTS keeps the cell re-runnable.
create table if not exists bronze_customers (
    customer_number integer,
    customer_name string,
    contact_last_name string,
    contact_first_name string,
    phone string,
    address_line_1 string,
    address_line_2 string,
    city string,
    state string,
    postal_code string,
    country string,
    sales_rep_employee_number integer,
    credit_limit decimal(10, 2)
)
using parquet
location 's3a://datalake/exercises/bronze/classicmodels/customers.parquet'


In [None]:
%%sql

-- Replace the contents of bronze_customers with every row of raw_customers.
-- INSERT ... SELECT maps columns by position, not by name, so this relies on the raw
-- table having the same column order as the bronze table definition.
INSERT OVERWRITE TABLE bronze_customers
SELECT * FROM raw_customers


In [None]:
# Read the bronze output directly from its storage path and verify the column names.
df = spark.read.parquet("s3a://datalake/exercises/bronze/classicmodels/customers.parquet")

expected_columns = [
    'address_line_1', 'address_line_2', 'city',
    'contact_first_name', 'contact_last_name', 'country',
    'credit_limit', 'customer_name', 'customer_number',
    'phone', 'postal_code', 'sales_rep_employee_number',
    'state',
]
# The message lists the actual columns so a failure is diagnosable.
assert sorted(df.columns) == expected_columns, (
    f"unexpected columns in bronze customers: {sorted(df.columns)}"
)

## Payments

In [None]:
%%sql

-- Expose the raw payments Parquet data as a catalog table (schema inferred from the files).
-- EXTERNAL is omitted: open-source Spark does not allow it together with USING, and the
-- explicit LOCATION already makes the table unmanaged. IF NOT EXISTS keeps re-runs from failing.
create table if not exists raw_payments
using parquet
location 's3a://datalake/exercises/raw/classicmodels/payments.parquet'


In [None]:
%%sql

-- Bronze payments table with snake_case column names, stored at the bronze location.
-- EXTERNAL is omitted (not allowed with USING in open-source Spark; LOCATION already makes
-- the table unmanaged) and IF NOT EXISTS keeps the cell re-runnable.
create table if not exists bronze_payments (
    customer_number integer,
    check_number string,
    payment_date date,
    amount decimal(10, 2)
)
using parquet
location 's3a://datalake/exercises/bronze/classicmodels/payments.parquet'


In [None]:
%%sql

-- Replace the contents of bronze_payments with every row of raw_payments.
-- INSERT ... SELECT maps columns by position, not by name, so this relies on the raw
-- table having the same column order as the bronze table definition.
INSERT OVERWRITE TABLE bronze_payments
SELECT * FROM raw_payments

In [None]:
# Read the bronze output directly from its storage path and verify the column names.
df = spark.read.parquet("s3a://datalake/exercises/bronze/classicmodels/payments.parquet")

expected_columns = ['amount', 'check_number', 'customer_number', 'payment_date']
# The message lists the actual columns so a failure is diagnosable.
assert sorted(df.columns) == expected_columns, (
    f"unexpected columns in bronze payments: {sorted(df.columns)}"
)

## Orders

In [None]:
%%sql

-- Expose the raw orders Parquet data as a catalog table (schema inferred from the files).
-- EXTERNAL is omitted: open-source Spark does not allow it together with USING, and the
-- explicit LOCATION already makes the table unmanaged. IF NOT EXISTS keeps re-runs from failing.
create table if not exists raw_orders
using parquet
location 's3a://datalake/exercises/raw/classicmodels/orders.parquet'


In [None]:
%%sql

-- Bronze orders table with snake_case column names, stored at the bronze location.
-- EXTERNAL is omitted (not allowed with USING in open-source Spark; LOCATION already makes
-- the table unmanaged) and IF NOT EXISTS keeps the cell re-runnable.
create table if not exists bronze_orders (
    order_number integer,
    order_date date,
    required_date date,
    shipped_date date,
    status string,
    comments string,
    customer_number integer
)
using parquet
location 's3a://datalake/exercises/bronze/classicmodels/orders.parquet'


In [None]:
%%sql

-- Replace the contents of bronze_orders with every row of raw_orders.
-- INSERT ... SELECT maps columns by position, not by name, so this relies on the raw
-- table having the same column order as the bronze table definition.
INSERT OVERWRITE TABLE bronze_orders
SELECT * FROM raw_orders


In [None]:
# Read the bronze output directly from its storage path and verify the column names.
df = spark.read.parquet("s3a://datalake/exercises/bronze/classicmodels/orders.parquet")

expected_columns = [
    'comments', 'customer_number', 'order_date',
    'order_number', 'required_date', 'shipped_date',
    'status',
]
# The message lists the actual columns so a failure is diagnosable.
assert sorted(df.columns) == expected_columns, (
    f"unexpected columns in bronze orders: {sorted(df.columns)}"
)

## Orderdetails

In [None]:
%%sql

-- Expose the raw orderdetails Parquet data as a catalog table (schema inferred from the files).
-- EXTERNAL is omitted: open-source Spark does not allow it together with USING, and the
-- explicit LOCATION already makes the table unmanaged. IF NOT EXISTS keeps re-runs from failing.
create table if not exists raw_orderdetails
using parquet
location 's3a://datalake/exercises/raw/classicmodels/orderdetails.parquet'


In [None]:
%%sql

-- Show the column names and types of the raw orderdetails table.
DESCRIBE TABLE raw_orderdetails

In [None]:
%%sql

-- Bronze orderdetails table with snake_case column names, stored at the bronze location.
-- EXTERNAL is omitted (not allowed with USING in open-source Spark; LOCATION already makes
-- the table unmanaged) and IF NOT EXISTS keeps the cell re-runnable.
-- quantity_ordered uses `integer` (a synonym of `int`) to match the other bronze tables.
create table if not exists bronze_orderdetails (
    order_number integer,
    product_code string,
    quantity_ordered integer,
    price_each decimal(10, 2),
    order_line_number integer
)
using parquet
location 's3a://datalake/exercises/bronze/classicmodels/orderdetails.parquet'


In [None]:
%%sql

-- Replace the contents of bronze_orderdetails with every row of raw_orderdetails.
-- INSERT ... SELECT maps columns by position, not by name, so this relies on the raw
-- table having the same column order as the bronze table definition.
INSERT OVERWRITE TABLE bronze_orderdetails
SELECT * FROM raw_orderdetails


In [None]:
# Read the bronze output directly from its storage path and verify the column names.
df = spark.read.parquet("s3a://datalake/exercises/bronze/classicmodels/orderdetails.parquet")

expected_columns = ['order_line_number', 'order_number', 'price_each', 'product_code', 'quantity_ordered']
# The message lists the actual columns so a failure is diagnosable.
assert sorted(df.columns) == expected_columns, (
    f"unexpected columns in bronze orderdetails: {sorted(df.columns)}"
)