In [23]:
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
import datetime
from pyspark.sql.functions import to_timestamp

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 25, Finished, Available)

## **Load a small part of the first month and the first year of the dataset**

In [24]:
# Partition of the dataset that the glob path below points at: January 2015.
year, month = 2015, 1

# OneLake ABFSS URI of the `Files` folder.
path = 'abfss://4155d96d-1328-43ad-982f-7aecb1ee2008@onelake.dfs.fabric.microsoft.com/8dfa5c43-7d50-4754-9b90-3c237db830de/Files'

# Hive-style partition folder (year=/month=), relative to `path`.
file_name = f'/dataset/year={year}/month={month}'

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 26, Finished, Available)

In [25]:
# Display the glob that matches every CSV file inside the selected partition folder.
path + file_name + "/*.csv"

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 27, Finished, Available)

'abfss://4155d96d-1328-43ad-982f-7aecb1ee2008@onelake.dfs.fabric.microsoft.com/8dfa5c43-7d50-4754-9b90-3c237db830de/Files/dataset/year=2015/month=1/*.csv'

In [26]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("Read CSV with PySpark")
    .getOrCreate()
)

# Read the DataCo CSV from the relative "Files/" location into a DataFrame:
# first row is the header, bytes are decoded as ISO-8859-1, and column types
# are inferred by Spark. The `path` / `file_name` values defined earlier in the
# notebook are not used by this load.
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("encoding", "ISO-8859-1")
    .option("inferSchema", "true")
)
df = csv_reader.load("Files/DataCoSupplyChainDataset.csv")

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 28, Finished, Available)

In [27]:
# Number of rows loaded from the CSV file.
df.count()

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 29, Finished, Available)

180519

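> Take only the data from the first month of the first year of the dataset (this filter is currently commented out; the active cell below keeps all orders from 2015–2017 instead)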

In [28]:
# # ISO-8859-1 UTF-8
# df = spark.read.format("csv") \
#         .option("header", "true") \
#         .option("encoding", "ISO-8859-1") \
#         .option("inferSchema", "true") \
#         .load(path + file_name + "/*.csv")
# df = df.filter(df["order date (DateOrders)"].like("1/%/2015%")) \
#         .orderBy('order date (DateOrders)')

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 30, Finished, Available)

In [29]:
# Keep the rows whose raw order-date text contains "/2015", "/2016" or "/2017",
# then sort the result by that same column.
# NOTE(review): the column is presumably still a string here (it is only parsed
# with to_timestamp later in the notebook), so this sort would be lexicographic
# rather than chronological — confirm that is acceptable.
order_date_text = col("order date (DateOrders)")
in_2015_to_2017 = (
    order_date_text.like("%/2015%")
    | order_date_text.like("%/2016%")
    | order_date_text.like("%/2017%")
)
df = df.filter(in_2015_to_2017).orderBy('order date (DateOrders)')


StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 31, Finished, Available)

In [30]:
# Number of rows remaining after the year filter.
df.count()

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 32, Finished, Available)

178396

> 178,396 records remain after keeping the orders from 2015–2017 (the first month of the first year alone has 5,322 records)

## **Take a sample of 100 records according to the following criteria**

> ***Tạo một sample data mẫu sao cho các bảng mapping (order_status, delivery_risk, shipping_mode, transaction_type, delivery_status) bị thiếu ít nhất một loại***

In [31]:
# # Apply the filters
# filtered_df = df
# filtered_df = filtered_df.filter(col('Type').isin(['CASH', 'PAYMENT', 'DEBIT']))
# filtered_df = filtered_df.filter(col('Delivery Status').isin(['Advance shipping', 'Late delivery']))
# filtered_df = filtered_df.filter(col('Shipping Mode').isin(['Same Day', 'Standard Class']))
# filtered_df = filtered_df.filter(col('Order Status').isin(['CLOSED', 'COMPLETE', 'PROCESSING', 'PENDING']))

# # Check the number of rows in the filtered dataset
# num_rows = filtered_df.count()
# print(f"Number of rows in filtered dataset: {num_rows}")

# # If there are more than 100 rows, sample 100 of them
# if num_rows > 100:
#     sample_df = filtered_df.sample(False, 1.0).limit(100)
# else:
#     sample_df = filtered_df
# # sample_df = filtered_df

# # Show the number of rows in the sample
# sample_df_count = sample_df.count()
# print(f"Number of rows in sample: {sample_df_count}")

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 33, Finished, Available)

In [32]:
# sample_df.createOrReplaceTempView("sample_df")
# Register the filtered DataFrame as the temporary view "df" (replacing any existing view of that name).
df.createOrReplaceTempView("df")

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 34, Finished, Available)

In [33]:
# dataset
# df = sample_df
# Row count of the filtered DataFrame (the sampling step is commented out, so `df` is not replaced by a sample).
df.count()

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 35, Finished, Available)

178396

## **Save the loaded data**

In [34]:
# Write the filtered rows out as CSV with a header row, overwriting whatever
# already exists at the target location.
csv_writer = df.write.mode("overwrite").format("csv").option("header", "true")
csv_writer.save("Files/dataco/dataset_uploading")

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 36, Finished, Available)

## **Data Preprocessing**

In [35]:
def clean_column_names(df):
    """Return `df` with every column name normalised.

    Each name is stripped of surrounding whitespace, spaces are replaced by
    underscores, parentheses are removed and the result is lower-cased, e.g.
    ``"order date (DateOrders)"`` becomes ``"order_date_dateorders"``.

    Args:
        df: DataFrame exposing ``columns`` and ``toDF(*names)``.

    Returns:
        A new DataFrame with the same columns, in the same order, carrying the
        cleaned names.
    """
    cleaned_names = [
        name.strip().replace(" ", "_").replace("(", "").replace(")", "").lower()
        for name in df.columns
    ]
    # Rename all columns in one step instead of chaining one withColumnRenamed
    # call per column. This also avoids a loop variable named `col`, which is
    # easy to confuse with pyspark.sql.functions.col.
    return df.toDF(*cleaned_names)

# Apply clean_column_names to the DataFrame and rebind `df` to the renamed result.
df = clean_column_names(df)

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 37, Finished, Available)

In [36]:
# Count the null values of every column in a single aggregation.
null_counts = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns])
# Collect the one-row result once. Calling collect() inside the comprehension
# below would re-run the aggregation job once per column.
null_counts_row = null_counts.collect()[0]
# Show only the columns that contain at least one null value.
null_counts.select([c for c in null_counts.columns if null_counts_row[c] > 0]).show()

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 38, Finished, Available)

+--------------+----------------+-------------+-------------------+
|customer_lname|customer_zipcode|order_zipcode|product_description|
+--------------+----------------+-------------+-------------------+
|             7|               3|       153556|             178396|
+--------------+----------------+-------------+-------------------+



In [37]:
# Drop order_zipcode and product_description, the two columns with the most
# nulls (153556 and 178396 respectively), and also drop customer_zipcode.
df = df.drop('order_zipcode', 'product_description', 'customer_zipcode')

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 39, Finished, Available)

In [38]:
# Give the transaction type and the two date columns shorter, clearer names.
column_renames = {
    "type": "transaction_type",
    "order_date_dateorders": "order_date",
    "shipping_date_dateorders": "shipping_date",
}
for old_name, new_name in column_renames.items():
    df = df.withColumnRenamed(old_name, new_name)

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 40, Finished, Available)

In [39]:
# Switch the session to the legacy datetime parser before parsing.
# NOTE(review): presumably needed so that non-zero-padded values still parse
# with the "MM/dd/yyyy HH:mm" pattern — confirm.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Convert both date columns to timestamps in place.
for date_column in ("order_date", "shipping_date"):
    df = df.withColumn(date_column, to_timestamp(df[date_column], "MM/dd/yyyy HH:mm"))


StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 41, Finished, Available)

In [40]:
# Inspect the rows whose customer_state consists only of digits.
display(df.filter(col('customer_state').rlike('^\\d+$')))

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 42, Finished, Available)

SynapseWidget(Synapse.DataFrame, c185924e-8b85-4634-a4f1-16e3990cb199)

## **Create new columns – concatenated location keys**

In [41]:
# Build two location keys by concatenating the address parts directly, with no
# separator between them.
# NOTE(review): concat_customer_region captures customer_street as it is at
# this point; a later cell strips the digits from customer_street — confirm
# that the key is meant to keep the unmodified street value.
destination_parts = ["order_city", "order_state", "order_country", "order_region", "market"]
customer_parts = ["customer_country", "customer_state", "customer_city", "customer_street"]

df = (
    df.withColumn("concat_destination_address", concat(*[col(part) for part in destination_parts]))
    .withColumn("concat_customer_region", concat(*[col(part) for part in customer_parts]))
    # product_card_id is called product_id from here on.
    .withColumnRenamed('product_card_id', 'product_id')
)

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 43, Finished, Available)

In [42]:
from pyspark.sql.functions import regexp_replace, ltrim

# Discard the rows whose customer_state is made up of digits only.
has_numeric_state = col('customer_state').rlike('^\\d+$')
df = df.filter(~has_numeric_state)

# Remove every digit from customer_street, then trim the leading whitespace
# that the removal leaves behind.
street_without_digits = regexp_replace('customer_street', '[0-9]', '')
df = df.withColumn('customer_street', ltrim(street_without_digits))

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 44, Finished, Available)

## **Load cleaned_df into Bronze**

In [43]:
# Row count after preprocessing, checked before the table write that follows.
df.count()

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 45, Finished, Available)

178393

In [44]:
# Save the cleaned DataFrame as the Delta table LTT_BronzeLakehouse.cleaned_df,
# overwriting any existing contents.
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable('LTT_BronzeLakehouse.cleaned_df')

StatementMeta(, 4758991a-72dd-47fd-af3f-7774610f0451, 46, Finished, Available)