# Exploring Merged File

In [44]:
#Dependencies
import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import re
import datetime as dt

In [45]:
os.getcwd()

'/Users/jacosta3/OneDrive - University of South Florida/Gerdau Projects/pft/datasets'

In [46]:
#change path to datasets folder
os.chdir("../datasets/")
#read merged.pkl
merged_df = pd.read_pickle("./serialized/merged.pkl")

In [47]:
#create file paths
ser_path = "./serialized/"
merged_path = "./merged_clean/"

In [48]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 814243 entries, 0 to 814242
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   calendar_day              814243 non-null  datetime64[ns]
 1   sku                       814243 non-null  object        
 2   size                      811071 non-null  object        
 3   block                     813827 non-null  object        
 4   material_description      813086 non-null  object        
 5   shape                     810269 non-null  object        
 6   strategy                  805552 non-null  object        
 7   steel_grade               810269 non-null  object        
 8   length_text               808869 non-null  object        
 9   work_center               814189 non-null  object        
 10  ph                        813878 non-null  object        
 11  IN_tons                   774593 non-null  float64       
 12  SO

In [49]:
#dropping duplicate sales column - SO_confirmed_qty
merged_df = merged_df.drop('SO_confirmed_qty', axis=1)

In [50]:
merged_df.columns

Index(['calendar_day', 'sku', 'size', 'block', 'material_description', 'shape',
       'strategy', 'steel_grade', 'length_text', 'work_center', 'ph',
       'IN_tons', 'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton'],
      dtype='object')

In [54]:
#reorganizing column order
merged_df = merged_df[['calendar_day', 'block', 'size', 'sku', 'material_description', 'shape',
       'strategy', 'steel_grade', 'length_text', 'work_center', 'ph',
       'IN_tons', 'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton']]

In [55]:
merged_df.columns

Index(['calendar_day', 'year', 'month', 'week', 'block', 'size', 'sku',
       'material_description', 'shape', 'strategy', 'steel_grade',
       'length_text', 'work_center', 'ph', 'IN_tons',
       'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton'],
      dtype='object')

In [56]:
merged_df.groupby(['year','week','block' ]).sum().info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 15767 entries, (2016, 1, '#7 REB') to (2020, 51, '8" CHN')
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   IN_tons                   15767 non-null  float64
 1   SO_order_qty_sales_units  15767 non-null  float64
 2   SH_shipment_tons          15767 non-null  float64
 3   PR_ton                    15767 non-null  float64
dtypes: float64(4)
memory usage: 540.5+ KB


## Structural vs. Bar

In [57]:
#converting work_center column to string
#Structural = 25000001
#Bar = 25000002
merged_df['work_center'] = merged_df['work_center'].fillna(0).astype(int).astype(str)

### Structual 

In [58]:
#structural df
structural = merged_df[merged_df['work_center']=="25000001"] 

In [59]:
#dropping any rows after 2020-01-03
structural = structural[structural["calendar_day"]<="2020-01-03"]

In [60]:
# look at block level
structural[structural["block"].isna()].sum()

year                                                                   431661
month                                                                    1159
week                                                                     4710
block                                                                       0
size                                                                        0
sku                                                               23138934524
material_description        L 4X3X1/2 A36/44W/A529-50 40'00"F 1/4X3 A36/44...
shape                       SHAPE_LFLATSQUARESHAPE_CFLATSHAPE_LFLATFLATROU...
strategy                    MTOMTOMTSMTOMTOMTOMTOMTSMTOMTSMTOMTOMTOMTOMTOM...
length_text                 40'00"20'00"20'00"42'00"20'00"24'00"20'00"20'0...
work_center                 2500000125000001250000012500000125000001250000...
ph                                                                          0
IN_tons                                                         

In [61]:
#look at the size level
structural[structural["size"].isna()].sum()

year                                                                  1999700
month                                                                    6440
week                                                                    26204
size                                                                        0
sku                                                              105529138847
material_description        C 6X8.2# A36/44W/A529-50 36'00"C 6X8.2# A36/44...
strategy                    MTOMTOMTOMTOMTOMTOMTOMTOMTOMTOMTOMTOMTOMTOMTOM...
work_center                 2500000125000001250000012500000125000001250000...
IN_tons                                                                -0.777
SO_order_qty_sales_units                                                128.5
SH_shipment_tons                                                          126
PR_ton                                                                     49
dtype: object

In [62]:
#preview of structural by size
structural.groupby(['calendar_day','size']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
calendar_day,size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-02,3/4 X 5,0.00,3.0600,0.0,0.0
2016-01-02,5X5X5/8,0.00,39.0000,0.0,0.0
2016-01-03,1 X 1 3/4,0.00,4.9385,0.0,0.0
2016-01-03,1 X 10,10.20,0.0000,0.0,0.0
2016-01-03,1 X 12,2.04,0.0000,0.0,0.0
...,...,...,...,...,...
2020-01-03,8 X 11.5#,0.00,59.8000,59.0,0.0
2020-01-03,8 X 13.75#,0.00,8.2500,0.0,0.0
2020-01-03,8 X 18.75#,0.00,13.5000,10.0,0.0
2020-01-03,9 X 13.4#,0.00,9.6480,33.0,0.0


In [63]:
#preview of structural by block
structural.groupby(['calendar_day','block']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
calendar_day,block,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-02,"5"" ANG",0.0,39.0000,0.0,0.0
2016-01-02,"5"" FLT",0.0,3.0600,0.0,0.0
2016-01-03,"1"" FLT",0.0,10.0980,0.0,0.0
2016-01-03,"1"" SB FLT",0.0,22.5600,0.0,0.0
2016-01-03,"1-2"" LRG FLT",0.0,4.9385,0.0,0.0
...,...,...,...,...,...
2020-01-03,"8"" CHN",0.0,81.5500,69.0,0.0
2020-01-03,"8"" FLT",0.0,0.0000,9.0,0.0
2020-01-03,"9"" CHN",0.0,9.6480,47.0,0.0
2020-01-03,MC12X10.6,0.0,4.2400,0.0,0.0


In [64]:
#grouping structural material by block and size
structural = structural.groupby(['calendar_day','block', 'size']).sum()
structural = structural.reset_index()

In [65]:
structural.head()

Unnamed: 0,calendar_day,block,size,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
0,2016-01-02,"5"" ANG",5X5X5/8,0.0,39.0,0.0,0.0
1,2016-01-02,"5"" FLT",3/4 X 5,0.0,3.06,0.0,0.0
2,2016-01-03,"1"" FLT",3/4 X 1,0.0,10.098,0.0,0.0
3,2016-01-03,"1"" SB FLT",3/16 X 1 1/2,0.0,2.447,0.0,0.0
4,2016-01-03,"1"" SB FLT",3/4 X 1 1/2,0.0,7.353,0.0,0.0


In [78]:
#adding other time columns for aggregating analysis
structural['year'] = pd.DatetimeIndex(structural['calendar_day']).year
structural['month'] = pd.DatetimeIndex(structural['calendar_day']).month
structural['week'] = pd.DatetimeIndex(structural['calendar_day']).week

In [79]:
#changing time variables to object
structural.year = structural.year.astype('object')
structural.month = structural.month.astype('object')
structural.week = structural.week.astype('object')

In [80]:
structural.columns

Index(['calendar_day', 'block', 'size', 'IN_tons', 'SO_order_qty_sales_units',
       'SH_shipment_tons', 'PR_ton', 'year', 'month', 'week'],
      dtype='object')

In [81]:
# rearranging order of columns df
structural = structural[['calendar_day','year', 'month', 'week', 'block', 'size', 'IN_tons',
       'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton']]

In [82]:
structural.tail(2)

Unnamed: 0,calendar_day,year,month,week,block,size,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
158268,2020-01-03,2020,1,1,MC12X10.6,12 X 10.6#,0.0,4.24,0.0,0.0
158269,2020-01-03,2020,1,1,"MC4"" CHN",4 X 13.8#,0.0,0.0,11.0,0.0


In [69]:
structural.shape

(158270, 7)

## Bar

In [70]:
#bar df
bar = merged_df[merged_df['work_center']=="25000002"] 

In [71]:
#group by calendar, block & size
bar = bar.groupby(['calendar_day', 'block', 'size']).sum()
bar = bar.reset_index()

In [72]:
#dropping rows after 2020-01-31
bar = bar[bar["calendar_day"]<="2020-01-03"]

In [84]:
#adding other time columns for aggregating analysis
bar['year'] = pd.DatetimeIndex(bar['calendar_day']).year
bar['month'] = pd.DatetimeIndex(bar['calendar_day']).month
bar['week'] = pd.DatetimeIndex(bar['calendar_day']).week

In [86]:
#changing time variables to object
bar.year = bar.year.astype('object')
bar.month = bar.month.astype('object')
bar.week = bar.week.astype('object')

In [87]:
#reorganize order of columns
bar = bar[['calendar_day','year', 'month', 'week', 'block', 'size', 'IN_tons',
       'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton']]

In [88]:
bar.shape

(75018, 10)

In [89]:
bar.head(2)

Unnamed: 0,calendar_day,year,month,week,block,size,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
0,2016-01-01,2016,1,53,"2-1/2"" ANG",2 1/2X2 1/2X5/16,0.0,315.0,0.0,0.0
1,2016-01-02,2016,1,53,"3"" ANG",3X3X1/4,0.0,12.74,0.0,0.0


## Saving dataframes to CSV & Pickle

In [91]:
# Convert to csv for easy share - structural & bar df
structural.to_csv(merged_path + "structural_merged.csv")
bar.to_csv(merged_path + "bar_merged.csv")

In [92]:
# Convert to pickle for easy share - structural & bar df
structural.to_pickle(ser_path + "structural_merged.pkl")
bar.to_pickle(ser_path + "bar_merged.pkl")