# Let's get this party started

In [1]:
#Dependencies
import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import re

# 1. Framing the Problem & Big Picture look (see readme file)

# 2. Get the Data

In [None]:
#set path to datasets
# Remember the directory the notebook was started from, then make the datasets
# folder the working directory so the relative names below resolve against it.
path = os.getcwd()
os.chdir("../datasets/")
filename = "sampledata.xlsx"  # workbook with the production_hist / shipment_hist / atp_hist / inventory sheets
ser_path = "./serialized/"  # folder the pickled dataframes are written to and read back from
filetwo = "sale_orders.xlsx"  # workbook with the "sale_orders" sheet
filethree = "new_inventory.xlsx"  # workbook with the "new_inv" sheet

In [None]:
#Importing company datasets
#prod_hist = pd.read_excel(filename, sheet_name="production_hist")
#ship_hist = pd.read_excel(filename, sheet_name="shipment_hist")
#atp_hist = pd.read_excel(filename, sheet_name="atp_hist")
#inventory = pd.read_excel(filename, sheet_name="inventory")

In [None]:
#importing sales orders and new inventory
# order_hist = pd.read_excel(filetwo, sheet_name = "sale_orders")
# inv_hist2 = pd.read_excel(filethree, sheet_name = "new_inv")

In [None]:
# #setting file names
# data = [inventory, atp_hist, prod_hist, ship_hist]
# fnames = ["inventory", "atp_hist", "prod_hist","ship_hist"]
# count = 0
# for df in data:
#     df.name = fnames[count]
#     count+=1

In [None]:
# #saving dataframes into pickle files
# for df in data:
#     pklfile = df.name + ".pkl"
#     df.to_pickle(ser_path + pklfile)

In [None]:
# #setting file names
# data2 = [order_hist, inv_hist2]
# fnames = ["order_hist", "inv_hist2"]

# count = 0
# for df in data2:
#     df.name = fnames[count]
#     count+=1

# #saving new data into pickle files
# for df in data2:
#     pklfile = df.name + ".pkl"
#     df.to_pickle(ser_path + pklfile)

#saving dataframes into feather (not working)
# Disabled: `data` is only assigned in the commented-out export cells above,
# so running this loop as live code raised NameError before the pickles were
# even loaded. Kept for reference alongside the other one-off export cells.
# for df in data:
#     feafile = df.name
#     df.to_feather(ser_path+feafile)

In [None]:
#see pklfile name order
# os.listdir returns entries in arbitrary order, so display the list to check
# which position each pickle file ended up at.
pklfiles = os.listdir("./serialized")
pklfiles

In [None]:
#Fast Upload using pickle files
# Load every frame by the file name the export cells wrote ("<name>.pkl")
# instead of by position in os.listdir(): the listing order is arbitrary, so
# positional indexes can silently bind a variable to the wrong dataset when
# the folder contents or the platform change.
ship_hist = pd.read_pickle(ser_path + "ship_hist.pkl")
inventory = pd.read_pickle(ser_path + "inventory.pkl")
inv_hist2 = pd.read_pickle(ser_path + "inv_hist2.pkl")
order_hist = pd.read_pickle(ser_path + "order_hist.pkl")
atp_hist = pd.read_pickle(ser_path + "atp_hist.pkl")
prod_hist = pd.read_pickle(ser_path + "prod_hist.pkl")

## Cleaning Up

In [None]:
def clean_columns(dataframe):
    """Normalise column names to snake_case and parse known date columns, in place.

    Column names are split at camel-case boundaries ("CalendarDay" ->
    "Calendar Day"), stripped of surrounding whitespace, lower-cased, and have
    spaces replaced by underscores. Every known date column that exists after
    the renaming is then converted to a pandas datetime column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.Index
        The cleaned column labels of ``dataframe``.
    """
    import warnings

    # Insert a space at lower->Upper transitions and before the last capital
    # of an acronym that is followed by a word ("ATPFloor" -> "ATP Floor").
    camel_boundary = re.compile(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))')
    dataframe.columns = [camel_boundary.sub(r'\1 ', col) for col in dataframe.columns]
    dataframe.columns = dataframe.columns.str.strip().str.lower().str.replace(" ", "_")

    date_columns = (
        "calendar_day",
        "requested_date",
        "confirmed_date",
        "material_avail_date",
        "load_date",
        "plan_goods_issue_date",
        "snapshot_date",
    )
    # Each column is handled independently so that one missing or unparseable
    # column no longer prevents the remaining ones from being converted.
    for col in date_columns:
        if col not in dataframe.columns:
            continue
        try:
            dataframe[col] = pd.to_datetime(dataframe[col])
        except (ValueError, TypeError) as err:
            # Stay best-effort (leave the column untouched) but say so,
            # instead of silently swallowing the failure.
            warnings.warn(
                "clean_columns: could not parse {!r} as datetime ({}); "
                "column left unchanged".format(col, err)
            )
    return dataframe.columns

In [None]:
# Attempt to get rid of nulls; the fills need to be more specific so that we do not lose valuable insight.
# data = [inventory, atp_hist, prod_hist, ship_hist]
# for df in data:
#     df = df.fillna(0, inplace=True)

### Inventory

In [None]:
# Normalise the inventory column names (and parse its date columns) in place;
# the cell output is the cleaned column index.
clean_columns(inventory)

In [None]:
# Preview the first five inventory rows.
inventory.head(5)

In [None]:
# Column dtypes and non-null counts for the inventory frame.
inventory.info()

In [None]:
#Filling in empty atp rows with zero
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and, with pandas
# Copy-on-Write enabled, leaves `inventory` itself unchanged.
inventory["atp"] = inventory["atp"].fillna(0)

In [None]:
# inventory["work_center"] = inventory["work_center"].astype("int64")

In [None]:
# Keep only the rows whose material type is "ZERT", then list the material
# types that remain as a sanity check.
inventory = inventory.loc[inventory["material_type"].eq("ZERT")]
inventory["material_type"].unique()

In [None]:
# Build the reduced inventory frame: select the columns of interest, give them
# the shared names used by the other reduced frames, and order the rows by day
# and material description.
# rename() is chained (returning a new frame) instead of being applied with
# inplace=True to a column subset of `inventory`, which pandas flags with
# SettingWithCopyWarning.
inv_reduced = (
    inventory[["calendar_day", "strgr", "material", "maktx", "work_center", "ph", "shape", "size_for_qual_cert", "block_resource", "tons", "atp"]]
    .rename(columns={"material":"sku", "maktx":"material_description", "size_for_qual_cert":"size", "block_resource":"block", "strgr":"strategy"})
    .sort_values(["calendar_day", "material_description"])
)
inv_reduced.head()

In [None]:
#changing dtype
# Cast the three columns in one astype call: strategy and work centre become
# categoricals, sku becomes a plain object column.
inv_reduced = inv_reduced.astype(
    {"strategy": "category", "sku": "object", "work_center": "category"}
)

In [None]:
#view of new dtypes & non-null counts after the casts above
inv_reduced.info()

In [None]:
#peek at every row that has a null in at least one column
inv_reduced[inv_reduced.isna().any(axis=1)]

### ATP History

In [None]:
# Normalise the ATP-history column names (and parse its date columns) in place.
clean_columns(atp_hist)

In [None]:
# Column dtypes and non-null counts for the ATP history.
atp_hist.info()

In [None]:
# Reduced ATP frame: keep only the sku, atp_floor and snapshot_date columns,
# then preview the first rows.
atp_reduced = atp_hist[["sku", "atp_floor", "snapshot_date"]]
atp_reduced.head()

In [None]:
# Column dtypes and non-null counts for the reduced ATP frame.
atp_reduced.info()

### Production History

In [None]:
# Normalise the production-history column names (and parse its date columns) in place.
clean_columns(prod_hist)

In [None]:
# Column dtypes and non-null counts for the production history.
prod_hist.info()

In [None]:
# prod_hist["stratedgy"] = prod_hist["stratedgy"].astype("int64")

In [None]:
# Reduced production frame: select the columns of interest and rename
# "material" -> "sku" and the source's misspelled "stratedgy" -> "strategy".
# rename() is chained (returning a new frame) instead of being applied with
# inplace=True to a column subset of `prod_hist`, which pandas flags with
# SettingWithCopyWarning.
prod_reduced = prod_hist[["calendar_day", "production_process", "production_process_description","stratedgy","work_center","wc_description","block","shape","size","material","material_description","ton"]].rename(
    columns={"material":"sku", "stratedgy":"strategy"}
)
prod_reduced.head()

In [None]:
#Removing strategy 10 (Note: check assumption that these are all billets)
# Rows whose strategy is missing compare as "not equal" and are therefore kept here.
prod_reduced = prod_reduced[prod_reduced["strategy"].ne(10)]

In [None]:
#Removing prod_processes up the supply chain: melt, scrap etc.
# One membership test replaces the chained != comparisons; rows with a missing
# description are kept either way.
prod_reduced = prod_reduced.loc[
    ~prod_reduced["production_process_description"].isin(
        ["Melt shop", "Scrap yard", "Finishing production"]
    )
]

In [None]:
#Identifying strategy=NaN rows (these are dropped in a later cell)
prod_reduced[prod_reduced["strategy"].isna()]

In [None]:
#Identifying size=NaN rows
#prod_reduced[prod_reduced["size"].isna()]

In [None]:
#Dropping selective NaN rows
# Only rows without a strategy are removed. The result is assigned back rather
# than using inplace=True, because `prod_reduced` is itself the product of
# earlier row filters and in-place edits on such frames trigger
# SettingWithCopyWarning.
prod_reduced = prod_reduced.dropna(subset=["strategy"])

In [None]:
#Identifying unique inputs by column
# One row per column of prod_reduced with its number of distinct non-null
# values, ordered from fewest to most.
unique_counts = pd.DataFrame(
    {
        "Column_Name": list(prod_reduced.columns),
        "Num_Unique": [prod_reduced[col].nunique() for col in prod_reduced.columns],
    }
).sort_values(by="Num_Unique")
unique_counts

In [None]:
# Changing data types for columns: every column with fewer than 10 distinct
# values becomes a categorical.
# NOTE(review): the earlier comment said "block" is not included because it is
# used as object elsewhere, but nothing here excludes it; it is converted too
# whenever it has fewer than 10 distinct values. Confirm the intent.
for col in [name for name in prod_reduced.columns if prod_reduced[name].nunique() < 10]:
    prod_reduced[col] = prod_reduced[col].astype("category")

# sku always ends up as a plain object column, even if it was converted above.
prod_reduced["sku"] = prod_reduced["sku"].astype("object")

In [None]:
# Column dtypes and non-null counts after the category conversion.
prod_reduced.info()

In [None]:
# Production process description of every row that has no block value.
prod_reduced[prod_reduced["block"].isna()]["production_process_description"]

In [None]:
# Non-null counts per column for the rows without a block, grouped by production process description.
prod_reduced[prod_reduced["block"].isna()].groupby("production_process_description").count()

### Ship History

In [None]:
# Normalise the shipment-history column names (and parse its date columns) in place.
clean_columns(ship_hist)

In [None]:
# Column dtypes and non-null counts for the shipment history.
ship_hist.info()

In [None]:
#Filling in NaN Shipment field rows to zero
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and, with pandas
# Copy-on-Write enabled, leaves `ship_hist` itself unchanged.
ship_hist["shipment_tons"] = ship_hist["shipment_tons"].fillna(0)

In [None]:
# Attempt to get correct data types, needs work.
# ship_hist["businessdaycounter"] = ship_hist["businessdaycounter"].astype("int64")
# ship_hist["strgr"] = ship_hist["strgr"].astype("int64")
# ship_hist["workcenter"] = ship_hist["workcenter"].astype("int64")

In [None]:
#Reducing main data frame
# Select the shipment columns of interest and give them the shared names
# ("sku", "strategy", "size") used by the other reduced frames.
# rename() is chained (returning a new frame) instead of being applied with
# inplace=True to a column subset of `ship_hist`, which pandas flags with
# SettingWithCopyWarning.
ship_reduced = ship_hist[["shipping_day", "block", "strgr", "shape", "sizeforqualcert", "div", "material", "material_description","shipment_tons", "segment", "sold-to_party","ship-to_party", "postal_code","region_description", "country"]].rename(
    columns={"material":"sku", "strgr":"strategy","sizeforqualcert":"size"}
)
ship_reduced.head()

In [None]:
#Check unique Divisions (the values the billet/scrap filter is applied to)
ship_hist["div"].unique()

In [None]:
#Removing billets and scrap from dataframe
# A single membership test replaces the two != comparisons; rows with a
# missing division are kept either way.
ship_reduced = ship_reduced.loc[~ship_reduced["div"].isin(["Billets", "Scrap"])]

In [None]:
# Column dtypes and non-null counts for the reduced shipment frame.
ship_reduced.info()

In [None]:
#Show every row that has a NaN in at least one column
ship_reduced[ship_reduced.isna().any(axis=1)]

In [None]:
#For rows without a block value, show the non-null counts per column grouped by shape
ship_reduced[ship_reduced["block"].isna()].groupby("shape").count()

### Sales Order Data

In [None]:
# Normalise the sales-order column names (and parse its date columns) in place.
clean_columns(order_hist)

In [None]:
#Reducing main data frame
# Select the sales-order columns of interest and rename "material_number" to
# the shared "sku" name. rename() is chained (returning a new frame) instead of
# being applied with inplace=True to a column subset of `order_hist`, which
# pandas flags with SettingWithCopyWarning.
order_reduced = order_hist[["requested_date",'confirmed_date','material_avail_date', 'load_date', 'plan_goods_issue_date','sales_district_code',
                            "division", "sold_to", "ship_to","route","ship_to_city", "ship_to_state", "material_number",'material_description',
                            'order_qty_sales_units', 'confirmed_qty', 'sales_uom','qty_on_shipment','qty_shipped_sales_units', 'item_status',]].rename(
    columns={"material_number":"sku"}
)
# head(-5) displays every row except the last five.
order_reduced.head(-5)

### New Inventory Data (dating back to 2016)

### Cleaned Reduced Datasets

In [None]:
# Collect the five cleaned, reduced frames in one list.
data_reduced = [ship_reduced, inv_reduced, atp_reduced, prod_reduced, order_reduced]

In [None]:
# Persist each reduced frame with IPython's %store magic so that other
# notebooks can restore it with `%store -r <name>`.
#storing cleaned Prod
%store prod_reduced

#storing cleaned Ship
%store ship_reduced

#storing cleaned Inventory
%store inv_reduced

#storing cleaned ATP
%store atp_reduced

#storing orders
%store order_reduced