In [None]:
import os
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from pyspark.sql.functions import datediff,current_date,to_date,date_format,count,col,when
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, DoubleType
from pyspark.sql.types import *

In [None]:
# Set SPARK_LOCAL_IP to the loopback address before the Spark session is created.
os.environ.update({"SPARK_LOCAL_IP": "127.0.0.1"})

In [None]:
# Build the Spark session used by every cell of this report.
#
# SECURITY: the S3 credentials are read from the standard AWS environment
# variables instead of being embedded in the notebook. The key pair that was
# previously hard-coded here must be treated as leaked and rotated.
# os.environ[...] raises KeyError when a variable is missing, so a
# misconfigured environment fails in this cell rather than on the first S3 read.
spark = (
    SparkSession.builder
    # hadoop-aws is pulled in for the s3a:// paths read further down.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.0.0")
    .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')
    .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .appName('Report 2 : SC Supply Chain Report')
    .getOrCreate()
)

In [None]:
# Schema applied to the item data: every column is a nullable string except
# "Unit Price", which is a nullable integer. Column order matters because the
# schema is applied to the loaded data by position.
itemSchema = StructType([
    StructField(
        column_name,
        IntegerType() if column_name == "Unit Price" else StringType(),
        True,
    )
    for column_name in (
        "No",
        "No_ 2",
        "Description",
        "Search Description",
        "Description 2",
        "Base Unit of Measure",
        "Price Unit Conversion",
        "Type",
        "Inventory Posting Group",
        "Shelf No_",
        "Item Disc_ Group",
        "Allow Invoice Disc_",
        "Statistics Group",
        "Commission Group",
        "Unit Price",
        "Price_Profit Calculation",
        "Profit _",
        "Costing Method",
        "Unit Cost",
        "Standard Cost",
        "Quoted Price(INR)",
        "Quoted Price(FCY)",
        "Quoted Currency",
        "Standard Cost_",
        "Production_BOM_No",
    )
])

In [None]:
# Load the item data and apply itemSchema to it by position.
#
# Parquet files embed their own schema, so no `inferSchema` option is passed
# (that is a CSV/JSON reader option).
# toDF renames all columns positionally in one step and raises if the file's
# column count differs from the schema's, which surfaces a layout mismatch here.
item_df = spark.read.parquet(
    "s3a://hackathon2023/data/SCSupplyChain/item/item.parquet"
).toDF(*[field.name for field in itemSchema.fields])

# Cast each column to the type declared in itemSchema. This stays in the
# DataFrame API (no Python RDD round trip) and does not rebind the imported
# `col` function, which the previous loop variable shadowed.
item_df = item_df.select(
    [
        col(field.name).cast(field.dataType).alias(field.name)
        for field in itemSchema.fields
    ]
)

In [None]:
# Discard the item columns this report does not use, then keep only the rows
# in which every remaining column is non-null.
item_df = (
    item_df
    .drop('No_ 2', 'Description 2', 'Search Description', 'Type')
    .na.drop()
)

In [None]:
# Schema for the warehouse CSV: every column is read as a nullable string.
# Names are kept exactly as given (including the leading space in
# " Qty_ per Unit of Measure") and their order defines the CSV column order.
warehouseSchema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "Entry No",
        "Journal Batch Name",
        "Line No_",
        "Registering Date",
        "Location Code",
        "Zone Code",
        "Bin Code",
        "Description",
        "Item No_",
        "Quantity",
        "Qty_ (Base)",
        "Source Type",
        "Source Subtype",
        "Source No_",
        "Source Line No_",
        "Source Subline No_",
        "Source Document",
        "Source Code",
        "Reason Code",
        "No_ Series",
        "Bin Type Code",
        "Cubage",
        "Weight",
        "Journal Template Name",
        "Whse_ Document No_",
        "Whse_ Document Type",
        "Whse_ Document Line No_",
        "Entry Type",
        "Reference Document",
        "Reference No_",
        "User ID",
        "Variant Code",
        " Qty_ per Unit of Measure",
        "Unit of Measure Code",
        "Serial No_",
        "Lot No_",
        "Warranty Date",
        "Expiration Date",
        "Phys Invt Counting Period Code",
        "Phys Invt Counting Period Type",
        "Dedicated",
        "Company",
        "Division",
    )
])

In [None]:
# Read the comma-delimited warehouse file. The header option is disabled, so
# the first line is read as data and column names come from warehouseSchema.
warehouse_df = (
    spark.read
    .format("csv")
    .option("header", "false")
    .option("delimiter", ",")
    .schema(warehouseSchema)
    .load("s3a://hackathon2023/data/SCSupplyChain/warehouse/warehouse.csv")
)

In [None]:
# Parse "Registering Date" from its dd-MM-yyyy text form into a date column,
# then discard the warehouse columns this report does not use.
warehouse_df = (
    warehouse_df
    .withColumn(
        "Registering Date",
        to_date(warehouse_df["Registering Date"], "dd-MM-yyyy"),
    )
    .drop(
        'Journal Batch Name',
        'Reason Code',
        'Journal Template Name',
        'Variant Code',
        'Serial No_',
        'Company',
        'Division',
        'Qty_ (Base)',
    )
)

In [None]:
# Read the tab-delimited production file; column names are taken from its
# header row and no schema is supplied.
production_df = (
    spark.read
    .format("csv")
    .option("header", "True")
    .option("delimiter", "\t")
    .load("s3a://hackathon2023/data/SCSupplyChain/production/production.txt")
)

In [None]:
# Discard the production columns this report does not use, then keep only the
# rows in which every remaining column is non-null.
production_df = (
    production_df
    .drop('Version Code', 'Position 2', 'Position 3', 'Company', 'Division')
    .na.drop()
)

In [None]:
from pyspark.sql import SparkSession, functions as F

In [None]:
# Aggregate warehouse entries per (lot, bin, item, registering date) and keep
# only the groups whose summed quantity is positive.
# NOTE(review): "Quantity" is declared as a string in warehouseSchema, so the
# sum relies on Spark converting it to a number implicitly — confirm.
# NOTE(review): "Registering Date" is also a grouping key, so
# min_registering_date equals it within every group — confirm this is intended.
df_1 = (
    warehouse_df
    .groupBy("Lot No_", "Bin Code", "Item No_", "Registering Date")
    .agg(
        F.min("Registering Date").alias("min_registering_date"),
        F.sum("Quantity").alias("sum_quantity"),
        F.first("Zone Code").alias("first_zone_code"),
        # Days elapsed between the registering date and today.
        F.datediff(F.current_date(), F.col("Registering Date")).alias("date_diff"),
    )
    .where("sum_quantity > 0")
)


In [None]:
# Distinct (item number, production BOM number) pairs gathered from two
# sources: items with a non-empty Production_BOM_No, and the production data.
# union() lines columns up by position, so the result keeps the item-side
# names "No" and "Production_BOM_No".
df_2 = (
    item_df
    .where("Production_BOM_No != ''")
    .select("No", "Production_BOM_No")
    .union(production_df.select("No_", "Production BOM No_"))
    .distinct()
)

In [None]:
# Attach BOM numbers to the aggregated warehouse rows; the left join keeps
# rows whose item has no BOM entry. The join key "No" is dropped afterwards.
# NOTE(review): an item that appears with several BOM numbers in df_2 yields
# several output rows per warehouse row — confirm this duplication is intended.
df_3 = (
    df_1
    .join(df_2, on=df_1["Item No_"] == df_2["No"], how="left")
    .drop("No")
)

In [None]:
# Remove the BOM column, then enrich each warehouse row with its item
# attributes. The inner join keeps only rows whose item exists in item_df.
df_3 = df_3.drop('Production_BOM_No')
df_4 = (
    df_3
    .join(item_df, on=df_3["Item No_"] == item_df["No"], how="inner")
    # Drop the join key and the helper aggregates not used by the report cells.
    .drop('No', 'min_registering_date', 'date_diff', 'first_zone_code')
)

In [None]:
# Sum of "Unit Price" per "Item Disc_ Group", reported as "Inventory Value".
# NOTE(review): the sum is not weighted by sum_quantity, so it adds one unit
# price per row rather than price x quantity — confirm this is the intended
# definition of inventory value.
inventory_value_by_category = (
    df_4
    .groupBy("Item Disc_ Group")
    .agg(F.sum("Unit Price").alias("Inventory Value"))
)
inventory_value_by_category.show()

In [None]:
# Recompute the per-category totals so this cell runs on its own, then keep
# the ten categories with the largest "Inventory Value".
inventory_value_by_category = (
    df_4
    .groupBy("Item Disc_ Group")
    .agg(F.sum("Unit Price").alias("Inventory Value"))
)
top_10_categories = (
    inventory_value_by_category
    .orderBy(F.desc("Inventory Value"))
    .limit(10)
)
top_10_categories.show()

In [None]:
# Add "Age": the number of days between "Registering Date" and today, then
# total "Unit Price" per age value.
# NOTE(review): "Registering Date" was already converted with to_date when the
# warehouse data was cleaned, so the to_date call here looks redundant — confirm.
df_4 = df_4.withColumn(
    "Age",
    F.datediff(F.current_date(), F.to_date("Registering Date", "yyyy-MM-dd")),
)
inventory_value_by_age = (
    df_4
    .groupBy("Age")
    .agg(F.sum("Unit Price").alias("Inventory Value"))
)
inventory_value_by_age.show()

In [None]:
# Total "Unit Price" per "Bin Code", exposed under the column name "Value".
bin_value = (
    df_4
    .groupBy("Bin Code")
    .agg(F.sum("Unit Price").alias("sum(value)"))
    .withColumnRenamed("sum(value)", "Value")
)
bin_value.show()