In [None]:
import os
from datetime import date, datetime, timedelta

import boto3
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import col
# Wildcard import kept for the cells that call Spark functions unqualified
# (e.g. coalesce, lit, when). Note that it shadows Python built-ins such as sum.
from pyspark.sql.functions import *


In [None]:
# Get the active Spark session, or create one registered under the app name "sql".
spark = (
    SparkSession.builder
    .appName("sql")
    .getOrCreate()
)

In [None]:
def read(app, path_s3):
    """Load a parquet dataset through ``app`` and log what was read.

    Parameters:
    app: object exposing ``read.parquet(path)``; the notebook passes its SparkSession.
    path_s3 (string): location of the parquet data, handed to ``app.read.parquet`` unchanged.

    Returns:
    Whatever ``app.read.parquet(path_s3)`` returns.
    """
    # Banner for the notebook output. It names this function; the previous text
    # advertised "read_parquet", which does not exist in this notebook.
    func_signature = "\n------> Using function: read"
    print("{:<32}".format(func_signature) + '\n')

    df = app.read.parquet(path_s3)

    path_signature = "\n------> Read from: " + path_s3
    print("{:<32}".format(path_signature) + '\n')

    return df


def export_parquet(spark_df, bucket_name, table_path):
    """Write a Spark DataFrame to S3 as parquet, overwriting the target path.

    Parameters:
    spark_df (pyspark.sql.dataframe): the DataFrame to write.
    bucket_name (string): name of the destination s3 bucket.
    table_path (string): table name or directory inside the bucket.

    Returns:
    None
    """
    # Banner identifying this function in the notebook output
    banner = "\n------> Using function: export_parquet"
    print(banner.ljust(32) + '\n')

    # Destination is always s3://<bucket_name>/<table_path>
    destination = 's3://{}/{}'.format(bucket_name, table_path)
    print("\t| Writing parquet to path: {}".format(destination))

    writer = spark_df.write
    writer.parquet(destination, mode="overwrite")

In [None]:
# Transaction and delivery detail, read from their 201912 sub-directories
df_f_transaction_detail = read(app=spark, path_s3='s3://fcst-clean-dev/f_transaction_detail/201912/')
df_f_delivery_detail = read(app=spark, path_s3='s3://fcst-clean-dev/f_delivery_detail/201912/')

# Remaining inputs, each read from the root of its directory
df_d_sku = read(app=spark, path_s3='s3://fcst-clean-dev/d_sku/')
df_d_business_unit = read(app=spark, path_s3='s3://fcst-clean-dev/d_business_unit/')
df_d_day = read(app=spark, path_s3='s3://fcst-clean-dev/d_day/')
site = read(app=spark, path_s3='s3://fcst-clean-dev/sites_attribut_0plant_branches_h/')
sales_data_material = read(app=spark, path_s3='s3://fcst-clean-dev/d_sales_data_material_h/')
df_d_week = read(app=spark, path_s3='s3://fcst-clean-dev/d_week/')

## Actual_sales

In [None]:
# Integer version of plant_id; it is the join key against d_business_unit below.
site = site.withColumn('but_num', site.plant_id.cast(IntegerType()))

In [None]:
# Business units of type 7 that match a site with purch_org Z001 and sapsrc PRT.
bu_europe = (
    df_d_business_unit
    .join(site, site.but_num == df_d_business_unit.but_num_business_unit, 'inner')
    .where(
        (df_d_business_unit.but_num_typ_but == 7)
        & (site.purch_org == 'Z001')
        & (site.sapsrc == 'PRT')
    )
    .select(
        df_d_business_unit.but_idr_business_unit,
        df_d_business_unit.but_num_business_unit,
        site.but_num,
    )
    .distinct()
)

In [None]:
# Distinct (model, sku id) pairs, leaving out universes 0, 14, 89 and 90.
sku = (
    df_d_sku
    .where(~(df_d_sku.unv_num_univers.isin(0, 14, 89, 90)))
    .select('mdl_num_model_r3', 'sku_idr_sku')
    .distinct()
)

In [None]:
# Offline item quantities per (week, model): transaction lines restricted to the
# selected SKUs and business units, for weeks from 201911 onwards.
offline_turnover_detail = (
    df_f_transaction_detail
    .join(sku, sku.sku_idr_sku == df_f_transaction_detail.sku_idr_sku, how='inner')
    .join(
        bu_europe,
        bu_europe.but_idr_business_unit == df_f_transaction_detail.but_idr_business_unit,
        how='inner',
    )
    .join(
        df_d_day,
        F.to_date(df_d_day.day_id_day, 'yyyy-MM-dd')
        == F.to_date(df_f_transaction_detail.tdt_date_to_ordered, 'yyyy-MM-dd'),
        how='inner',
    )
    .where(df_f_transaction_detail.the_to_type == 'offline')
    # Filter on the week while wee_id_week is still a column of the frame: the
    # select below renames it to week_id, so filtering afterwards would reference
    # a column that is no longer part of the projection.
    .where(df_d_day.wee_id_week >= '201911')
    .select(
        df_d_day.wee_id_week.alias('week_id'),
        sku.mdl_num_model_r3.alias('model'),
        df_f_transaction_detail.f_qty_item.cast('int').alias('f_qty_item_off'),
    )
)

In [None]:
# Preview the first 20 offline detail rows.
offline_turnover_detail.show(n=20)

In [None]:
# agg offline: total offline quantity per (week, model)

agg_offline = (
    offline_turnover_detail
    .groupBy('week_id', 'model')
    # F.sum is the Spark aggregate. A bare `sum` only resolves to it because the
    # wildcard import of pyspark.sql.functions shadows Python's built-in sum.
    .agg(F.sum('f_qty_item_off').alias('y_off'))
)

agg_offline.show()

In [None]:
# online_turnover needs delivery detail
# Online item quantities per (week, model): delivery lines restricted to the
# selected SKUs and business units, for weeks from 201911 onwards.

online_turnover_detail = (
    df_f_delivery_detail
    .join(sku, sku.sku_idr_sku == df_f_delivery_detail.sku_idr_sku, how='inner')
    .join(
        bu_europe,
        bu_europe.but_idr_business_unit == df_f_delivery_detail.but_idr_business_unit_economical,
        how='inner',
    )
    .join(
        df_d_day,
        F.to_date(df_d_day.day_id_day, 'yyyy-MM-dd')
        == F.to_date(df_f_delivery_detail.tdt_date_to_ordered, 'yyyy-MM-dd'),
        how='inner',
    )
    .where(df_f_delivery_detail.the_to_type == 'online')
    # Filter on the week while wee_id_week is still a column of the frame: the
    # select below renames it to week_id, so filtering afterwards would reference
    # a column that is no longer part of the projection.
    .where(df_d_day.wee_id_week >= '201911')
    .select(
        df_d_day.wee_id_week.alias('week_id'),
        sku.mdl_num_model_r3.alias('model'),
        df_f_delivery_detail.f_qty_item.cast('int').alias('f_qty_item_on'),
    )
)

In [None]:
# agg online: total online quantity per (week, model)

agg_online = (
    online_turnover_detail
    .groupBy('week_id', 'model')
    # F.sum is the Spark aggregate. A bare `sum` only resolves to it because the
    # wildcard import of pyspark.sql.functions shadows Python's built-in sum.
    .agg(F.sum('f_qty_item_on').alias('y_on'))
)

agg_online.show()

In [None]:
# agg offline + agg online
# Full outer join so a (week, model) present on one side only is kept; the
# missing side's quantity becomes 0 before the two are added into y.

temp_actual_sales = (
    agg_online
    .join(
        agg_offline,
        (agg_offline.week_id == agg_online.week_id) & (agg_offline.model == agg_online.model),
        'full',
    )
    .select(
        coalesce(agg_offline.week_id, agg_online.week_id).alias('week_id'),
        coalesce(agg_offline.model, agg_online.model).alias('model'),
        coalesce(agg_offline.y_off, lit(0)).alias('y_off'),
        coalesce(agg_online.y_on, lit(0)).alias('y_on'),
    )
    .withColumn('y', col('y_off') + col('y_on'))
)

In [None]:
# Keep only the key columns and the combined quantity.
actual_sales = temp_actual_sales.select('week_id', 'model', 'y')

In [None]:
# Attach day_first_day_week from d_week to every week as the `date` column.
actual_sales = (
    actual_sales
    .join(df_d_week, df_d_week.wee_id_week == actual_sales.week_id, 'inner')
    .select(
        actual_sales.week_id,
        df_d_week.day_first_day_week.alias('date'),
        actual_sales.model,
        actual_sales.y,
    )
)

In [None]:
# Preview the first 20 rows of the actual sales table.
actual_sales.show(n=20)

## Lifestage_update

In [None]:
# Lifestage rows for sales_org Z001 / sap_source PRT / distribution channel 02
# with a non-empty lifestage and an end date from 2015-01-01 onwards.
# NOTE: this cell overwrites its own input and drops the filter columns, so the
# load cell has to be re-run before this cell can be run a second time.
sales_data_material = (
    sales_data_material
    .where(
        (sales_data_material.sales_org == 'Z001')
        & (sales_data_material.sap_source == 'PRT')
        & (sales_data_material.lifestage != '')
        & (sales_data_material.distrib_channel == '02')
        & (sales_data_material.date_end >= '2015-01-01')
    )
    .select('material_id', 'lifestage', 'date_begin', 'date_end')
    .withColumn('sku_num', col('material_id').cast('int'))
    # d_end is date_end with the 2999-12-31 value replaced by 2100-12-31
    .withColumn(
        'd_end',
        when(col('date_end') == '2999-12-31', '2100-12-31').otherwise(col('date_end')),
    )
    .distinct()
)

In [None]:
# Preview two rows of the filtered lifestage data.
sales_data_material.show(n=2)

In [None]:
# Distinct (sku, model) pairs for SKUs whose end date is 2999-12-31 23:59:59,
# whose validity range contains today, and outside universes 0, 14, 89 and 90.
sku_ls = (
    df_d_sku
    .where(
        (df_d_sku.sku_date_end == '2999-12-31 23:59:59')
        & ~(df_d_sku.unv_num_univers.isin(0, 14, 89, 90))
        & (df_d_sku.sku_date_begin <= date.today())
        & (df_d_sku.sku_date_end >= date.today())
    )
    .select('sku_num_sku_r3', 'mdl_num_model_r3')
    .distinct()
)

In [None]:
# Lifestage periods per (model, sku); d_end is exposed as date_end.
lifestage_update = (
    sales_data_material
    .join(sku_ls, sku_ls.sku_num_sku_r3 == sales_data_material.sku_num)
    .select(
        sku_ls.mdl_num_model_r3.alias('model'),
        sku_ls.sku_num_sku_r3.alias('sku'),
        sales_data_material.date_begin,
        sales_data_material.d_end.alias('date_end'),
        sales_data_material.lifestage,
    )
    .distinct()
)

In [None]:
# Preview the first 20 rows of the lifestage table.
lifestage_update.show(n=20)

## Export parquet

In [None]:
# Write both result tables under part_1/ of the refined bucket (existing data is overwritten).
export_parquet(
    spark_df=lifestage_update,
    bucket_name='fcst-refined-demand-forecast-dev',
    table_path='part_1/lifestage_update',
)
export_parquet(
    spark_df=actual_sales,
    bucket_name='fcst-refined-demand-forecast-dev',
    table_path='part_1/actual_sales',
)

In [None]:
# Shut down the Spark session; this is the last cell of the notebook.
spark.stop()