# Exploring Merged File

In [1]:
#Dependencies
import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import re
import datetime as dt

In [2]:
# Show the kernel's working directory; the relative paths used below resolve against it
os.getcwd()

'/Users/jacosta3/OneDrive - University of South Florida/Gerdau Projects/pft/scripts'

In [55]:
# Change the working directory to the sibling datasets folder. Every relative path used
# after this point is resolved against that folder, so the kernel is expected to start
# in a directory that sits next to datasets/.
os.chdir("../datasets/")
# Read merged.pkl into merged_df.
# NOTE(review): unpickling can execute arbitrary code — only load pickles from a trusted source.
merged_df = pd.read_pickle("./serialized/merged.pkl")

In [57]:
# Folder prefixes (relative to the datasets directory):
#   ser_path    -> serialized inputs
#   merged_path -> cleaned/merged exports
ser_path, merged_path = "./serialized/", "./merged_clean/"

In [4]:
# Summarize merged_df: column names, non-null counts and dtypes
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 814243 entries, 0 to 814242
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   calendar_day              814243 non-null  datetime64[ns]
 1   sku                       814243 non-null  object        
 2   size                      811071 non-null  object        
 3   block                     813827 non-null  object        
 4   material_description      813086 non-null  object        
 5   shape                     810269 non-null  object        
 6   strategy                  805552 non-null  object        
 7   steel_grade               810269 non-null  object        
 8   length_text               808869 non-null  object        
 9   work_center               814189 non-null  object        
 10  ph                        813878 non-null  object        
 11  IN_tons                   774593 non-null  float64       
 12  SO

In [5]:
# Drop the duplicate sales column SO_confirmed_qty.
# errors='ignore' keeps this cell re-runnable: merged_df is reassigned here, so on a
# second run the column is already gone and a plain drop would raise KeyError.
merged_df = merged_df.drop(columns=['SO_confirmed_qty'], errors='ignore')

In [6]:
# List the columns that remain after dropping SO_confirmed_qty
merged_df.columns

Index(['calendar_day', 'sku', 'size', 'block', 'material_description', 'shape',
       'strategy', 'steel_grade', 'length_text', 'work_center', 'ph',
       'IN_tons', 'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton'],
      dtype='object')

In [17]:
# Add year / month / week columns derived from calendar_day for aggregated analysis.
# The .week accessor used previously is deprecated since pandas 1.1 and removed in
# pandas 2.0; dt.isocalendar().week returns the same ISO-8601 week number.
merged_df['year'] = merged_df['calendar_day'].dt.year
merged_df['month'] = merged_df['calendar_day'].dt.month
# isocalendar() yields UInt32; cast to int64 to keep the dtype the old accessor produced
merged_df['week'] = merged_df['calendar_day'].dt.isocalendar().week.astype('int64')
# NOTE(review): 'week' is the ISO week while 'year' is the calendar year, so early-January
# days can carry week 53 (2016-01-02 shows year 2016 / week 53 in the previews below).
# Grouping by (year, week) therefore puts those days in the same bucket as any late-December
# days of that calendar year with week 53 — confirm whether the ISO year is wanted instead.

In [29]:
# Column layout: time columns first, then product attributes, then the quantity columns
merged_df = merged_df.loc[:, [
    'calendar_day', 'year', 'month', 'week',
    'block', 'size', 'sku', 'material_description', 'shape',
    'strategy', 'steel_grade', 'length_text', 'work_center', 'ph',
    'IN_tons', 'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton',
]]

In [30]:
# Check the column order after the reordering step
merged_df.columns

Index(['calendar_day', 'year', 'month', 'week', 'block', 'size', 'sku',
       'material_description', 'shape', 'strategy', 'steel_grade',
       'length_text', 'work_center', 'ph', 'IN_tons',
       'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton'],
      dtype='object')

In [31]:
# Summary of the totals per (year, week, block).
# numeric_only=True makes explicit that only numeric columns are summed: older pandas
# dropped the non-numeric columns silently, pandas 2.x no longer does.
merged_df.groupby(['year', 'week', 'block']).sum(numeric_only=True).info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 15767 entries, (2016, 1, '#7 REB') to (2020, 51, '8" CHN')
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   month                     15767 non-null  int64  
 1   IN_tons                   15767 non-null  float64
 2   SO_order_qty_sales_units  15767 non-null  float64
 3   SH_shipment_tons          15767 non-null  float64
 4   PR_ton                    15767 non-null  float64
dtypes: float64(4), int64(1)
memory usage: 663.6+ KB


## Structural vs. Bar

In [32]:
# Cast work_center to string codes; missing values are filled with 0 first.
#   Structural = 25000001
#   Bar        = 25000002
merged_df['work_center'] = (
    merged_df['work_center']
    .fillna(0)
    .astype(int)
    .astype(str)
)

In [33]:
# Rows of merged_df whose work_center code is "25000001" (structural)
structural = merged_df.loc[merged_df['work_center'].eq("25000001")]

### Structural

In [35]:
# Look at the block level: for structural rows with no block, sum per sku and show the
# column-wise maximum of those per-sku totals.
# numeric_only=True makes explicit that only numeric columns are summed (older pandas
# dropped the non-numeric ones silently, pandas 2.x no longer does).
structural[structural["block"].isna()].groupby('sku').sum(numeric_only=True).max()

year                        355024.0
month                          954.0
week                          3926.0
IN_tons                          0.0
SO_order_qty_sales_units       136.5
SH_shipment_tons                33.0
PR_ton                          28.0
dtype: float64

In [36]:
# Look at the size level: for structural rows with no size, sum per sku and show the
# column-wise maximum of those per-sku totals.
# numeric_only=True makes explicit that only numeric columns are summed (older pandas
# dropped the non-numeric ones silently, pandas 2.x no longer does).
structural[structural["size"].isna()].groupby('sku').sum(numeric_only=True).max()

year                        1691259.0
month                          5403.0
week                          21938.0
IN_tons                           0.0
SO_order_qty_sales_units        136.5
SH_shipment_tons                 33.0
PR_ton                           28.0
dtype: float64

In [37]:
# Preview of structural totals per calendar day and size.
# numeric_only=True makes explicit that only numeric columns are summed (older pandas
# dropped the non-numeric ones silently, pandas 2.x no longer does).
structural.groupby(['calendar_day', 'size']).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,year,month,week,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
calendar_day,size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-02,3/4 X 5,2016,1,53,0.00,3.0600,0.0,0.0
2016-01-02,5X5X5/8,2016,1,53,0.00,39.0000,0.0,0.0
2016-01-03,1 X 1 3/4,2016,1,53,0.00,4.9385,0.0,0.0
2016-01-03,1 X 10,2016,1,53,10.20,0.0000,0.0,0.0
2016-01-03,1 X 12,2016,1,53,2.04,0.0000,0.0,0.0
...,...,...,...,...,...,...,...,...
2020-09-11,3 1/2X3 1/2X3/8,2020,9,37,0.00,45.9000,0.0,0.0
2020-11-10,1/2 X 9,2020,11,46,0.00,4.2840,0.0,0.0
2020-11-10,3/8 X 9,2020,11,46,0.00,2.1810,0.0,0.0
2020-12-15,1 X 10,2020,12,51,0.00,2.0400,0.0,0.0


In [38]:
# Preview of structural totals per calendar day and block.
# numeric_only=True makes explicit that only numeric columns are summed (older pandas
# dropped the non-numeric ones silently, pandas 2.x no longer does).
structural.groupby(['calendar_day', 'block']).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,year,month,week,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
calendar_day,block,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-02,"5"" ANG",2016,1,53,0.0,39.0000,0.0,0.0
2016-01-02,"5"" FLT",2016,1,53,0.0,3.0600,0.0,0.0
2016-01-03,"1"" FLT",2016,1,53,0.0,10.0980,0.0,0.0
2016-01-03,"1"" SB FLT",6048,3,159,0.0,22.5600,0.0,0.0
2016-01-03,"1-2"" LRG FLT",2016,1,53,0.0,4.9385,0.0,0.0
...,...,...,...,...,...,...,...,...
2020-09-08,"7"" FLT",2020,9,37,0.0,14.2800,0.0,0.0
2020-09-11,"3-1/2"" ANG",2020,9,37,0.0,45.9000,0.0,0.0
2020-11-10,"9"" FLT",4040,22,92,0.0,6.4650,0.0,0.0
2020-12-15,"10"" FLT",2020,12,51,0.0,2.0400,0.0,0.0


In [39]:
# Aggregate structural material to one row per calendar day, block and size.
# year/month/week are derived from calendar_day, so they are used as grouping keys instead
# of being summed: summing them multiplies the value by the number of rows in the group
# (the by-block preview shows year 6048 = 3 x 2016 for that reason).
# NOTE: this reassigns `structural`; row-level columns such as sku are gone afterwards,
# so the exploratory cells that group by sku must run before this one.
structural = (
    structural
    .groupby(['calendar_day', 'year', 'month', 'week', 'block', 'size'])
    [['IN_tons', 'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton']]
    .sum()
    .reset_index()
)
# Keep the column order this step produced before the change
structural = structural[['calendar_day', 'block', 'size', 'year', 'month', 'week',
                         'IN_tons', 'SO_order_qty_sales_units', 'SH_shipment_tons',
                         'PR_ton']]

In [40]:
# First rows of the aggregated structural frame
structural.head()

Unnamed: 0,calendar_day,block,size,year,month,week,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
0,2016-01-02,"5"" ANG",5X5X5/8,2016,1,53,0.0,39.0,0.0,0.0
1,2016-01-02,"5"" FLT",3/4 X 5,2016,1,53,0.0,3.06,0.0,0.0
2,2016-01-03,"1"" FLT",3/4 X 1,2016,1,53,0.0,10.098,0.0,0.0
3,2016-01-03,"1"" SB FLT",3/16 X 1 1/2,2016,1,53,0.0,2.447,0.0,0.0
4,2016-01-03,"1"" SB FLT",3/4 X 1 1/2,2016,1,53,0.0,7.353,0.0,0.0


In [43]:
# Columns of the aggregated structural frame, before they are reordered
structural.columns

Index(['calendar_day', 'block', 'size', 'year', 'month', 'week', 'IN_tons',
       'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton'],
      dtype='object')

In [44]:
# Column layout: time columns, then block/size, then the quantity columns
structural = structural.loc[:, [
    'calendar_day', 'year', 'month', 'week',
    'block', 'size',
    'IN_tons', 'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton',
]]

In [45]:
# First two rows, to check the new column order
structural.head(2)

Unnamed: 0,calendar_day,year,month,week,block,size,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
0,2016-01-02,2016,1,53,"5"" ANG",5X5X5/8,0.0,39.0,0.0,0.0
1,2016-01-02,2016,1,53,"5"" FLT",3/4 X 5,0.0,3.06,0.0,0.0


In [47]:
# (rows, columns) of the aggregated structural frame
structural.shape

(169642, 10)

## Bar

In [34]:
# Rows of merged_df whose work_center code is "25000002" (bar)
bar = merged_df.loc[merged_df['work_center'].eq("25000002")]

In [50]:
# Aggregate bar material to one row per calendar day, block and size.
# year/month/week are derived from calendar_day, so they are used as grouping keys instead
# of being summed: summing them multiplies the value by the number of rows in the group.
# NOTE: this reassigns `bar`; the row-level columns are gone afterwards.
bar = (
    bar
    .groupby(['calendar_day', 'year', 'month', 'week', 'block', 'size'])
    [['IN_tons', 'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton']]
    .sum()
    .reset_index()
)
# Keep the column order this step produced before the change
bar = bar[['calendar_day', 'block', 'size', 'year', 'month', 'week',
           'IN_tons', 'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton']]

In [52]:
# Column layout: time columns, then block/size, then the quantity columns
bar = bar.loc[:, [
    'calendar_day', 'year', 'month', 'week',
    'block', 'size',
    'IN_tons', 'SO_order_qty_sales_units', 'SH_shipment_tons', 'PR_ton',
]]

In [54]:
# (rows, columns) of the aggregated bar frame
bar.shape

(81174, 10)

In [53]:
# First two rows of the aggregated bar frame
bar.head(2)

Unnamed: 0,calendar_day,year,month,week,block,size,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
0,2016-01-01,2016,1,53,"2-1/2"" ANG",2 1/2X2 1/2X5/16,0.0,315.0,0.0,0.0
1,2016-01-02,2016,1,53,"3"" ANG",3X3X1/4,0.0,12.74,0.0,0.0


## Saving dataframes to CSV

In [58]:
# Convert to csv for easy share - structural & bar df.
# The export is currently disabled: both calls below are commented out, so this cell
# writes nothing. As written they would also write the DataFrame index as the first
# CSV column (to_csv default).
#structural.to_csv(merged_path + "structural_merged.csv")
#bar.to_csv(merged_path + "bar_merged.csv")