In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [None]:
#conf = SparkConf().setAll([
# ('spark.sql.shuffle.partitions', 110),
# ('spark.default.parallelism', 110),
# ('spark.sql.autoBroadcastJoinThreshold', 15485760),
# ('spark.dynamicAllocation.enabled', 'false'),
# ('spark.executor.instances', 11),
# ('spark.executor.memory', '36g'),
# ('spark.driver.memory', '36g'),
# ('spark.driver.cores', 5),
# ('spark.memory.storageFraction', 0.1),
# ('spark.memory.fraction', 0.9),
# ('spark.executor.memoryOverhead', '4g'),
# ('spark.executor.cores', 5),
# ('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version', 2)
#])
#
#spark = SparkSession.builder \
#    .config(conf=conf) \
#    .appName("sql") \
#    .getOrCreate()
#for x in spark.sparkContext.getConf().getAll() :
#    print(x)

In [None]:
def read_parquet_s3(app, s3_path):
    """Read a parquet dataset through a Spark session and log the path read.

    Parameters:
    app: object exposing ``read.parquet`` (the notebook passes ``spark``).
    s3_path (string): location handed unchanged to ``app.read.parquet``.

    Returns:
    Whatever ``app.read.parquet(s3_path)`` returns.
    """
    loaded = app.read.parquet(s3_path)

    # Message is left-justified to at least 32 characters; the extra '\n'
    # leaves a blank line after it in the cell output.
    message = ">> Parquet file read from " + s3_path
    print(message.ljust(32) + '\n')

    return loaded


def write_parquet_s3(spark_df, bucket, file_path):
    """Persist a Spark DataFrame to S3 as parquet, overwriting the target.

    Parameters:
    spark_df (pyspark.sql.dataframe): the DataFrame to persist.
    bucket (string): the s3 bucket name, without the ``s3://`` scheme.
    file_path (string): the table name or directory inside the bucket.

    Returns:
    None. The destination is printed once the write call has returned.
    """
    destination = 's3://{bkt}/{key}'.format(bkt=bucket, key=file_path)

    writer = spark_df.write
    writer.parquet(destination, mode="overwrite")

    print(">> Parquet file written on " + destination)

## Params

In [None]:
# Lower bound on week.wee_id_week when building the sales extracts.
first_week_id = 201501
# Lower bound on sdmh.date_end when building lifestage_update.
first_date = '2015-01-01'  # alternative value left by the author: 2014-12-28
# Value matched against sapb.purch_org.
purch_org = 'Z001'
# Value matched against sdm.sales_org and sdmh.sales_org.
sales_org = 'Z002'
# Prefix (scheme, bucket name and trailing slash) prepended to every input path.
bucket = 's3://fcst-clean-dev/'

## Load all needed clean data

In [None]:
# NOTE(review): `spark` is not created in any active cell of this notebook (the
# SparkSession builder cell is commented out) — presumably the kernel provides
# it; confirm before running on a plain Python kernel.

# Fact tables, read through a '/*/' glob over the table's sub-directories.
tdt = read_parquet_s3(spark, bucket + 'f_transaction_detail/*/')
dyd = read_parquet_s3(spark, bucket + 'f_delivery_detail/*/')

# Dimensions joined to the fact tables on sku / business-unit keys.
sku = read_parquet_s3(spark, bucket + 'd_sku/')
bu = read_parquet_s3(spark, bucket + 'd_business_unit/')

# Tables filtered on purch_org / sales_org in the cells below.
sapb = read_parquet_s3(spark, bucket + 'sites_attribut_0plant_branches_h/')
sdm = read_parquet_s3(spark, bucket + 'sales_data_material/')
sdmh = read_parquet_s3(spark, bucket + 'd_sales_data_material_h/')

# Calendar tables used to map a transaction date to its week.
day = read_parquet_s3(spark, bucket + 'd_day/')
week = read_parquet_s3(spark, bucket + 'd_week/')

## Create Actual_sales

In [None]:
# Offline transaction lines with their week and model number attached.
# Output columns: week_id, date (week.day_first_day_week), model, f_qty_item.
actual_sales_offline = (
    tdt
    # Calendar lookup: transaction date -> day -> week.
    .join(day,
          on=F.to_date(tdt.tdt_date_to_ordered, 'yyyy-MM-dd') == day.day_id_day,
          how='inner')
    .join(week,
          on=day.wee_id_week == week.wee_id_week,
          how='inner')
    .join(sku,
          on=tdt.sku_idr_sku == sku.sku_idr_sku,
          how='inner')
    .join(bu,
          on=tdt.but_idr_business_unit == bu.but_idr_business_unit,
          how='inner')
    # plant_id is compared after stripping leading zeros and whitespace. The
    # pattern is a raw string so Python does not see an invalid string escape.
    .join(sapb,
          on=bu.but_num_business_unit.cast('string') ==
             F.regexp_replace(sapb.plant_id, r'^0*|\s', ''),
          how='inner')
    .filter(tdt.the_to_type == 'offline')
    .filter(week.wee_id_week >= first_week_id)
    # NOTE(review): the meaning of the excluded universe codes is not shown
    # in this notebook — confirm with the data owner.
    .filter(~sku.unv_num_univers.isin([0, 14, 89, 90]))
    .filter(sku.mdl_num_model_r3.isNotNull())
    .filter(sapb.purch_org == purch_org)
    .filter(sapb.sapsrc == 'PRT')
    # Keep only sapb rows whose validity interval contains the run time, so
    # the result depends on when the notebook is executed.
    .filter(F.current_timestamp().between(sapb.date_begin, sapb.date_end))
    .select(week.wee_id_week.alias('week_id'),
            week.day_first_day_week.alias('date'),
            sku.mdl_num_model_r3.alias('model'),
            tdt.f_qty_item)
)

In [None]:
# Online delivery lines with their week and model number attached.
# Output columns: week_id, date (week.day_first_day_week), model, f_qty_item.
actual_sales_online = (
    dyd
    # Calendar lookup: order date -> day -> week.
    .join(day,
          on=F.to_date(dyd.tdt_date_to_ordered, 'yyyy-MM-dd') == day.day_id_day,
          how='inner')
    .join(week,
          on=day.wee_id_week == week.wee_id_week,
          how='inner')
    .join(sku,
          on=dyd.sku_idr_sku == sku.sku_idr_sku,
          how='inner')
    # Deliveries are attached to a business unit through the
    # but_idr_business_unit_economical key.
    .join(bu,
          on=dyd.but_idr_business_unit_economical == bu.but_idr_business_unit,
          how='inner')
    # plant_id is compared after stripping leading zeros and whitespace. The
    # pattern is a raw string so Python does not see an invalid string escape.
    .join(sapb,
          on=bu.but_num_business_unit.cast('string') ==
             F.regexp_replace(sapb.plant_id, r'^0*|\s', ''),
          how='inner')
    .filter(dyd.the_to_type == 'online')
    .filter(week.wee_id_week >= first_week_id)
    # NOTE(review): the meaning of the excluded universe codes is not shown
    # in this notebook — confirm with the data owner.
    .filter(~sku.unv_num_univers.isin([0, 14, 89, 90]))
    .filter(sku.mdl_num_model_r3.isNotNull())
    .filter(sapb.purch_org == purch_org)
    .filter(sapb.sapsrc == 'PRT')
    # Keep only sapb rows whose validity interval contains the run time, so
    # the result depends on when the notebook is executed.
    .filter(F.current_timestamp().between(sapb.date_begin, sapb.date_end))
    .select(week.wee_id_week.alias('week_id'),
            week.day_first_day_week.alias('date'),
            sku.mdl_num_model_r3.alias('model'),
            dyd.f_qty_item)
)

In [None]:
# Stack offline and online lines (union matches columns by position), sum the
# quantities per week / date / model, and keep strictly positive totals only.
actual_sales = (
    actual_sales_offline
    .union(actual_sales_online)
    .groupBy('week_id', 'date', 'model')
    .agg(F.sum(F.col('f_qty_item')).alias('y'))
    .where(F.col('y') > 0)
)

In [None]:
#actual_sales.show(3)
#actual_sales.printSchema()

## Create Lifestage_update

In [None]:
# Distinct (model, sku, date_begin, date_end, lifestage) rows taken from sdmh.
lifestage_update = (
    sdmh
    # material_id is compared after stripping leading zeros and whitespace.
    # The pattern is a raw string so Python does not see an invalid escape.
    .join(sku,
          on=F.regexp_replace(sdmh.material_id, r'^0*|\s', '') ==
             sku.sku_num_sku_r3.cast('string'),
          how='inner')
    .filter(sdmh.sales_org == sales_org)
    .filter(sdmh.sap_source == 'PRT')
    .filter(sdmh.lifestage != '')
    .filter(sdmh.distrib_channel == '02')
    .filter(sdmh.date_end >= first_date)
    .filter(sku.mdl_num_model_r3.isNotNull())
    # NOTE(review): the meaning of the excluded universe codes is not shown
    # in this notebook — confirm with the data owner.
    .filter(~sku.unv_num_univers.isin([0, 14, 89, 90]))
    # Keep only sku rows whose validity interval contains the run time.
    .filter(F.current_timestamp().between(sku.sku_date_begin, sku.sku_date_end))
    # Replace the 2999-12-31 end date with 2100-12-31. The pattern must be
    # 'yyyy-MM-dd'; the previous 'yyyy-MM-dyd' was a typo that does not
    # describe the literal being parsed.
    # NOTE(review): presumably 2100 keeps the date within a range downstream
    # tools can represent — confirm.
    .withColumn("date_end",
                F.when(sdmh.date_end == '2999-12-31',
                       F.to_date(F.lit('2100-12-31'), 'yyyy-MM-dd'))
                 .otherwise(sdmh.date_end))
    .select(sku.mdl_num_model_r3.alias('model'),
            sku.sku_num_sku_r3.alias('sku'),
            sdmh.date_begin,
            "date_end",
            sdmh.lifestage.cast('int').alias('lifestage'))
    .drop_duplicates()
)

In [None]:
#lifestage_update.show(3)
#lifestage_update.printSchema()

## Create Model_info

In [None]:
# Distinct descriptive attributes per model, with the assortment grade from sdm.
model_info = (
    sdm
    # material_id is compared after stripping leading zeros and whitespace.
    # The pattern is a raw string so Python does not see an invalid escape.
    .join(sku,
          on=F.regexp_replace(sdm.material_id, r'^0*|\s', '') ==
             sku.sku_num_sku_r3.cast('string'),
          how='inner')
    .filter(sdm.sales_org == sales_org)
    .filter(sdm.sap_source == 'PRT')
    .filter(sdm.assortment_grade != '')
    .filter(sdm.distrib_channel == '02')
    .filter(sku.mdl_num_model_r3.isNotNull())
    # NOTE(review): the meaning of the excluded universe codes is not shown
    # in this notebook — confirm with the data owner.
    .filter(~sku.unv_num_univers.isin([0, 14, 89, 90]))
    # Keep only sku rows whose validity interval contains the run time.
    .filter(F.current_timestamp().between(sku.sku_date_begin, sku.sku_date_end))
    .select(sku.mdl_num_model_r3.alias('model'),
            sku.mdl_label.alias('model_label'),
            sku.fam_num_family.alias('family'),
            sku.family_label.alias('family_label'),
            sku.sdp_num_sub_department.alias('sub_department'),
            sku.sdp_label.alias('sub_department_label'),
            sku.dpt_num_department.alias('department'),
            # NOTE(review): department_label is fed from unv_label (the
            # universe label). This looks like a copy-paste slip, but the
            # department label column of `sku` is not visible here — confirm
            # its name and replace.
            sku.unv_label.alias('department_label'),
            sku.unv_num_univers.alias('univers'),
            # Fixed: this was sku.mdl_label, which duplicated model_label.
            sku.unv_label.alias('univers_label'),
            sku.pnt_num_product_nature.alias('product_nature'),
            sku.product_nature_label.alias('product_nature_label'),
            sku.category_label.alias('category_label'),
            sdm.assortment_grade.alias('range_level'))
    .drop_duplicates()
)

In [None]:
#model_info.show(3)
#model_info.printSchema()

## Write parquet

In [None]:
# Mark the three outputs for caching so the count / describe / write cells
# that follow can reuse the computed data instead of re-running the joins.
actual_sales.cache()
model_info.cache()
lifestage_update.cache()

In [None]:
# Row counts of the three outputs, printed in the order
# actual_sales, lifestage_update, model_info.
for cached_df in (actual_sales, lifestage_update, model_info):
    print(cached_df.count())

In [None]:
# Summary statistics of actual_sales, displayed as a check before writing.
actual_sales.describe().show()

In [None]:
# Summary statistics of lifestage_update, displayed as a check before writing.
lifestage_update.describe().show()

In [None]:
# Summary statistics of model_info, displayed as a check before writing.
model_info.describe().show()

In [None]:
# Write the three outputs, in order, under part_1_1/ of the refined bucket.
refined_bucket = 'fcst-refined-demand-forecast-dev'
outputs = (
    (actual_sales, 'part_1_1/actual_sales'),
    (lifestage_update, 'part_1_1/lifestage_update'),
    (model_info, 'part_1_1/model_info'),
)
for output_df, output_key in outputs:
    write_parquet_s3(output_df, refined_bucket, output_key)

In [None]:
# Stop the Spark session; this is the last cell, after all outputs are written.
spark.stop()