In [None]:
import pyspark.sql.functions as f

So far, you've learned about visualizing PySpark DataFrame properties, data cleaning using PySpark, and how to create new features.

Now it's time to apply this knowledge to a new dataset.

In this notebook, you'll work with the products dataset that we've downloaded together with the orders dataset. 

1. Load the dataset into a Spark DataFrame and display it.

In [None]:
# Read the raw product/supplier CSV. The first row holds the column names,
# fields are comma-separated, and Spark infers each column's type.
df_products = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', sep=',')
    .load('/FileStore/lp-big-data/orders-data/product-supplier.csv')
)

df_products.display()

2. Print the schema

In [None]:
# Print the schema of the loaded DataFrame (column names and types) as a tree.
df_products.printSchema()

3. How many observations does the dataset contain?

In [None]:
# count() returns the number of rows; as the last expression of the cell,
# the value is shown as the cell's output.
df_products.count()

4. Rename the columns to standardize them.

- Lower case all column names
- Replace the spaces with underscores `_`

In [None]:
# Replace every column name positionally with its snake_case equivalent.
# toDF expects exactly one new name per existing column, in column order.
df_products_renamed = df_products.toDF(*[
    'product_id',
    'product_line',
    'product_category',
    'product_group',
    'product_name',
    'supplier_country',
    'supplier_name',
    'supplier_id',
    'announcement_date',
    'launch_date',
])

df_products_renamed.display()

5. Check if there are missing values

In [None]:
# Summary statistics: the `count` row shows the number of non-null values,
# but describe() only covers numeric and string columns.
df_products_renamed.describe().display()

# Explicit null count for every column, so that columns skipped by
# describe() (e.g. the date columns, if they were inferred as timestamps)
# are checked as well.
(
    df_products_renamed
    .select([
        f.count(f.when(f.col(column_name).isNull(), 1)).alias(column_name)
        for column_name in df_products_renamed.columns
    ])
    .display()
)

Replace the missing values in `product_category` with 'Unknown'.

In [None]:
# Fill nulls in `product_category` only; every other column is left untouched.
df_products_filled = df_products_renamed.fillna({'product_category': 'Unknown'})

# Re-run the summary to confirm the category column no longer has gaps.
df_products_filled.describe().display()

6. What are the names of all possible product lines?

In [None]:
# One row per distinct product line, together with its number of products.
(
    df_products_filled
    .groupBy('product_line')
    .count()
    .display()
)

7. Create a new column called `lead_time` with the amount of time in days between each product's announcement and launch dates

Hint: Check out the [DateTime functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#datetime-functions) from the PySpark SQL library

In [None]:
# lead_time = launch_date - announcement_date, in whole days.
# datediff takes the end date first and the start date second.
df_products_lead = df_products_filled.withColumn(
    'lead_time',
    f.datediff('launch_date', 'announcement_date'),
)

df_products_lead.display()

8. The announcement and launch dates are in UTC. Convert them to New York's time zone.

In [None]:
# Treat both date columns as UTC and shift them to New York local time.
# Each column is overwritten in place, so the column names do not change.
df_products_timezone = (
    df_products_lead
    .withColumn(
        'announcement_date',
        f.from_utc_timestamp('announcement_date', 'America/New_York'),
    )
    .withColumn(
        'launch_date',
        f.from_utc_timestamp('launch_date', 'America/New_York'),
    )
)

df_products_timezone.display()

9. Create a new column with the suppliers' continent

In [None]:
# List the distinct supplier country codes (with row counts) to see which
# values the continent mapping has to cover.
(
    df_products_timezone
    .groupBy('supplier_country')
    .count()
    .display()
)

In [None]:
# Map each supplier's two-letter country code to a continent. Codes that are
# not listed fall back to 'Unknown', the same placeholder used when filling
# missing product categories.
df_products_preprocessed = (
    df_products_timezone
    .withColumn(
        'supplier_continent',
        f.when(
            f.col('supplier_country').isin(
                ['NL', 'PT', 'GB', 'DE', 'ES', 'FR', 'NO', 'DK', 'BE', 'SE']
            ),
            'Europe'
        )
        .when(f.col('supplier_country').isin(['CA', 'US']), 'America')
        .when(f.col('supplier_country').isin(['AU']), 'Oceania')
        # Typo fix: the fallback label was previously spelled 'Unkown'.
        .otherwise(f.lit('Unknown'))
    )
)

df_products_preprocessed.display()

10. Save the data to DBFS in the folder `/FileStore/lp-big-data/preprocessed-data/orders-data/` under the name `products-preprocessed.csv`.

In [None]:
# mode='overwrite' keeps the notebook re-runnable: with the default save mode
# the write fails on a second run because the output path already exists.
# Note that Spark writes a directory of part files at this path, not a single
# CSV file.
df_products_preprocessed.write.csv(
    '/FileStore/lp-big-data/preprocessed-data/orders-data/products-preprocessed.csv',
    header=True,
    mode='overwrite',
)