In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType 
from pyspark.sql.functions import *

from datetime import date, datetime, timedelta
import boto3

In [None]:
# Obtain the Spark session used by every cell below (application name "sql").
spark = (
    SparkSession.builder
    .appName("sql")
    .getOrCreate()
)

In [None]:
def read_parquet_s3(app, s3_path):
    """Load a parquet dataset through ``app`` and log where it was read from.

    Parameters:
    app: object exposing ``read.parquet`` (the notebook passes its SparkSession).
    s3_path (string): full location of the parquet data to read.

    Returns:
    Whatever ``app.read.parquet(s3_path)`` returns (a Spark DataFrame when
    ``app`` is a SparkSession).
    """
    loaded = app.read.parquet(s3_path)

    # The message is left-justified to at least 32 characters; the explicit
    # newline leaves a blank line after it in the cell output.
    message = "{:<32}".format(">> Parquet file read from " + s3_path)
    print(message + '\n')

    return loaded


def write_parquet_s3(spark_df, bucket, file_path):
    """Persist a Spark DataFrame as parquet under ``s3://<bucket>/<file_path>``.

    The write uses mode "overwrite", so data already present at the
    destination is replaced.

    Parameters:
    spark_df (pyspark.sql.dataframe): the DataFrame to persist.
    bucket (string): the s3 bucket name, without the ``s3://`` scheme
        (the scheme is prepended here).
    file_path (string): the table name or directory inside the bucket.

    Returns:
    None. The destination is printed once the write call has returned.
    """
    destination = 's3://{}/{}'.format(bucket, file_path)

    spark_df.write.parquet(destination, mode="overwrite")
    print(">> Parquet file written on {}".format(destination))

## Params

In [None]:
# Time window: lower bounds applied to week ids (sales) and to lifestage
# end dates in the cells below.
first_date = '2016-01-03'
first_week_id = 201601

# Organisation codes matched against the `sales_org` / `purch_org` columns
# of the source tables.
sales_org = 'Z002'
purch_org = 'Z001'

# Prefix of the workspace holding the cleaned input tables. Table paths are
# appended to it directly, so the trailing slash is required.
bucket = 's3://fcst-workspace/Z21GABDO/'

## Load all needed clean data

In [None]:
def _load_clean_table(relative_path):
    """Read one parquet table located under the workspace `bucket` prefix."""
    return read_parquet_s3(spark, bucket + relative_path)


# Fact tables (only the 201912 sub-directory of each is read).
f_transaction_detail = _load_clean_table('f_transaction_detail/201912/')
f_delivery_detail = _load_clean_table('f_delivery_detail/201912/')

# Product and business-unit dimensions.
d_sku = _load_clean_table('d_sku/')
d_business_unit = _load_clean_table('d_business_unit/')

# Plant attributes and sales-data-material tables.
sapb = _load_clean_table('sites_attribut_0plant_branches_h/')
sales_data_material = _load_clean_table('sales_data_material/')
d_sales_data_material = _load_clean_table('d_sales_data_material_h/')

# Calendar dimensions.
d_day = _load_clean_table('d_day/')
d_week = _load_clean_table('d_week/')

## Create Actual_sales

In [None]:
# Weekly offline quantities per model: transaction lines are mapped to their
# week, restricted to the configured purchasing organisation, then summed.
# `sum` below is pyspark.sql.functions.sum (brought in by the star import),
# not the Python builtin.
# NOTE(review): `sapb` is read from a path ending in `_h` and is joined
# without any validity-date filter; if it holds several rows per plant the
# quantities would be counted more than once — confirm against the table.
actual_sales_offline = (
    f_transaction_detail
    # Calendar: ordered date -> day -> week.
    .join(d_day,
          on=to_date(f_transaction_detail.tdt_date_to_ordered, 'yyyy-MM-dd') == d_day.day_id_day,
          how='inner')
    .join(d_week,
          on=d_day.wee_id_week == d_week.wee_id_week,
          how='inner')
    # Product, business unit and plant attributes.
    .join(d_sku,
          on=f_transaction_detail.sku_idr_sku == d_sku.sku_idr_sku,
          how='inner')
    .join(d_business_unit,
          on=f_transaction_detail.but_idr_business_unit == d_business_unit.but_idr_business_unit,
          how='inner')
    .join(sapb,
          on=d_business_unit.but_num_business_unit == sapb.plant_id.cast('int'),
          how='inner')
    # Keep offline lines with a positive quantity inside the time window.
    .filter(f_transaction_detail.the_to_type == 'offline')
    .filter(f_transaction_detail.f_qty_item > 0)
    .filter(d_week.wee_id_week >= first_week_id)
    # Excluded universe numbers, then organisation / source restrictions.
    .filter(~d_sku.unv_num_univers.isin([0, 14, 89, 90]))
    .filter(sapb.purch_org == purch_org)
    .filter(sapb.sapsrc == 'PRT')
    .select(d_week.wee_id_week.alias('week_id'),
            d_week.day_first_day_week.alias('date'),
            d_sku.mdl_num_model_r3.alias('model'),
            f_transaction_detail.f_qty_item.cast('int'))
    .groupby(['week_id', 'date', 'model'])
    .agg(sum('f_qty_item').alias('y_off'))
)

In [None]:
# Weekly online quantities per model: delivery lines are mapped to their
# week, restricted to the configured purchasing organisation, then summed.
# `sum` below is pyspark.sql.functions.sum (brought in by the star import),
# not the Python builtin.
# NOTE(review): `sapb` is read from a path ending in `_h` and is joined
# without any validity-date filter; if it holds several rows per plant the
# quantities would be counted more than once — confirm against the table.
actual_sales_online = (
    f_delivery_detail
    # Calendar: ordered date -> day -> week.
    .join(d_day,
          on=to_date(f_delivery_detail.tdt_date_to_ordered, 'yyyy-MM-dd') == d_day.day_id_day,
          how='inner')
    .join(d_week,
          on=d_day.wee_id_week == d_week.wee_id_week,
          how='inner')
    # Product, (economical) business unit and plant attributes.
    .join(d_sku,
          on=f_delivery_detail.sku_idr_sku == d_sku.sku_idr_sku,
          how='inner')
    .join(d_business_unit,
          on=f_delivery_detail.but_idr_business_unit_economical == d_business_unit.but_idr_business_unit,
          how='inner')
    .join(sapb,
          on=d_business_unit.but_num_business_unit == sapb.plant_id.cast('int'),
          how='inner')
    # Keep online lines with a positive quantity inside the time window.
    .filter(f_delivery_detail.the_to_type == 'online')
    .filter(f_delivery_detail.f_qty_item > 0)
    .filter(d_week.wee_id_week >= first_week_id)
    # Excluded universe numbers, then organisation / source restrictions.
    .filter(~d_sku.unv_num_univers.isin([0, 14, 89, 90]))
    .filter(sapb.purch_org == purch_org)
    .filter(sapb.sapsrc == 'PRT')
    .select(d_week.wee_id_week.alias('week_id'),
            d_week.day_first_day_week.alias('date'),
            d_sku.mdl_num_model_r3.alias('model'),
            f_delivery_detail.f_qty_item.cast('int'))
    .groupby(['week_id', 'date', 'model'])
    .agg(sum('f_qty_item').alias('y_on'))
)

In [None]:
# Total weekly quantity per model across both channels (y = y_off + y_on).
#
# The join is a FULL OUTER join: the previous default (inner) join silently
# dropped every (week, date, model) that sold through only one channel, so
# those sales disappeared from `y`. After an outer join the missing channel
# is null, and null + n is null in Spark, hence each side is coalesced to 0
# before the addition.
actual_sales = (
    actual_sales_offline
    .join(actual_sales_online, on=['week_id', 'date', 'model'], how='outer')
    .withColumn('y', coalesce(col('y_off'), lit(0)) + coalesce(col('y_on'), lit(0)))
    .select(['week_id', 'date', 'model', 'y'])
)

In [None]:
#actual_sales.show(3)
#actual_sales.printSchema()

## Create Lifestage_update

In [None]:
# Replace the '2999-12-31' end date by 2100-12-31; every other end date is
# kept unchanged.
# NOTE(review): presumably done so downstream tools can represent the date —
# confirm the reason for choosing 2100-12-31.
capped_date_end = when(
    d_sales_data_material.date_end == '2999-12-31',
    to_date(lit('2100-12-31'), 'yyyy-MM-dd')
).otherwise(d_sales_data_material.date_end)

# Distinct (model, sku, date_begin, date_end, lifestage) rows for the
# configured sales organisation, keeping rows whose end date is on or after
# `first_date`.
lifestage_update = (
    d_sales_data_material
    .join(d_sku,
          on=d_sales_data_material.material_id.cast('int') == d_sku.sku_num_sku_r3,
          how='inner')
    .filter(d_sales_data_material.sales_org == sales_org)
    .filter(d_sales_data_material.sap_source == 'PRT')
    # Rows with an empty lifestage are dropped before the cast to int below.
    .filter(d_sales_data_material.lifestage != '')
    .filter(d_sales_data_material.distrib_channel == '02')
    .filter(d_sales_data_material.date_end >= first_date)
    .filter(~d_sku.unv_num_univers.isin([0, 14, 89, 90]))
    .withColumn("date_end", capped_date_end)
    .select(d_sku.mdl_num_model_r3.alias('model'),
            d_sku.sku_num_sku_r3.alias('sku'),
            d_sales_data_material.date_begin,
            "date_end",
            d_sales_data_material.lifestage.cast('int').alias('lifestage'))
    .drop_duplicates()
)

In [None]:
#lifestage_update.show(3)
#lifestage_update.printSchema()

## Create Model_info

In [None]:
# Descriptive attributes of each model (hierarchy numbers and labels, product
# nature, category and range level), restricted to the configured sales
# organisation and to SKUs whose validity period contains today's date.
#
# Fix: `univers_label` used to be read from `d_sku.mdl_label`, i.e. it
# duplicated `model_label`; it now comes from `d_sku.unv_label`.
# NOTE(review): `department_label` is still read from `unv_label`, which
# looks like the same kind of copy-paste slip. The department label column
# name is not visible in this notebook, so it is left unchanged — confirm
# the right column against the d_sku schema.
model_info = (
    sales_data_material
    .join(d_sku,
          on=sales_data_material.material_id.cast('int') == d_sku.sku_num_sku_r3,
          how='inner')
    .filter(sales_data_material.sales_org == sales_org)
    .filter(sales_data_material.sap_source == 'PRT')
    .filter(sales_data_material.lifestage != '')
    .filter(sales_data_material.distrib_channel == '02')
    .filter(~d_sku.unv_num_univers.isin([0, 14, 89, 90]))
    # Keep only SKU rows that are valid today.
    .filter(current_date().between(d_sku.sku_date_begin, d_sku.sku_date_end))
    .select(d_sku.mdl_num_model_r3.alias('model'),
            d_sku.mdl_label.alias('model_label'),
            d_sku.fam_num_family.alias('family'),
            d_sku.family_label.alias('family_label'),
            d_sku.sdp_num_sub_department.alias('sub_department'),
            d_sku.sdp_label.alias('sub_department_label'),
            d_sku.dpt_num_department.alias('department'),
            d_sku.unv_label.alias('department_label'),
            d_sku.unv_num_univers.alias('univers'),
            d_sku.unv_label.alias('univers_label'),
            d_sku.pnt_num_product_nature.alias('product_nature'),
            d_sku.product_nature_label.alias('product_nature_label'),
            d_sku.category_label.alias('category_label'),
            sales_data_material.assortment_grade.alias('range_level'))
    .drop_duplicates()
)

In [None]:
#model_info.show(3)
#model_info.printSchema()

## Write parquet

In [None]:
# Persist the three refined tables, in the same order as they were built.
output_bucket = 'fcst-refined-demand-forecast-dev'
for refined_df, target_path in (
    (actual_sales, 'part_1_1/actual_sales'),
    (lifestage_update, 'part_1_1/lifestage_update'),
    (model_info, 'part_1_1/model_info'),
):
    write_parquet_s3(refined_df, output_bucket, target_path)

In [None]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` cannot be re-run after this without recreating the session.
spark.stop()