In [1]:
#Dependencies
import datetime as dt
import os
import re
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer #use instead of OneHotEncoder, skips spicymatrix?

In [2]:
os.getcwd()

'/Users/jacosta3/OneDrive - University of South Florida/Gerdau Projects/pft/scripts'

## Fetching Indicators info

In [3]:
# Work from the datasets folder so the relative sub-folders below resolve against it.
os.chdir("../datasets")
# Sub-folders (relative to the working directory) used by the later read/save calls.
path = "./merged_clean/"
ser_path = "./serialized/"

In [4]:
# Load the "Monthly" sheet of the market-indicators workbook.
indicators_file = path + "market_indicators.xlsx"
indicators = pd.read_excel(indicators_file, sheet_name="Monthly")

In [5]:
indicators.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124 entries, 0 to 123
Data columns (total 17 columns):
 #   Column                                               Non-Null Count  Dtype         
---  ------                                               --------------  -----         
 0   dd/mm/yy                                             124 non-null    datetime64[ns]
 1   Year                                                 124 non-null    int64         
 2   Month                                                124 non-null    object        
 3   National Architect Billing/monthly                   121 non-null    float64       
 4   Total Federal Construction/Billion $                 121 non-null    float64       
 5   Total State and Local Construction/Billion $         121 non-null    float64       
 6   US Steel Capacity Utilization                        109 non-null    float64       
 7   US Long Steel/million $ per short ton                26 non-null     float64       
 8   

## Cleaning Columns

In [6]:
def clean_columns(dataframe: pd.DataFrame) -> pd.Index:
    """Normalize column names in place and parse the known date columns.

    Column names are split at camel-case boundaries (``CalendarDay`` ->
    ``Calendar Day``), stripped of surrounding whitespace, lower-cased, and
    have their spaces replaced by underscores. Every known date column that
    exists after renaming is then converted to a datetime dtype.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.Index
        The cleaned column labels of ``dataframe``.

    Warns
    -----
    UserWarning
        When a date column is present but cannot be parsed. That column is
        left unchanged so the rest of the cleaning still completes.
    """
    # Columns converted to datetime when present; each is handled independently
    # so one missing or malformed column does not block the others.
    date_columns = (
        "calendar_day",
        "requested_date",
        "confirmed_date",
        "material_avail_date",
        "load_date",
        "plan_goods_issue_date",
        "snapshot_date",
    )

    # Insert a space at lower->Upper and at the Upper->Upper+lower boundaries.
    camel_boundary = re.compile(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))')
    dataframe.columns = [camel_boundary.sub(r'\1 ', col) for col in dataframe.columns]
    dataframe.columns = (
        dataframe.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
    )

    for col in date_columns:
        if col not in dataframe.columns:
            continue
        try:
            # pd.to_datetime is used instead of astype("datetime64"): the
            # unit-less dtype string is not accepted by newer pandas releases.
            dataframe[col] = pd.to_datetime(dataframe[col])
        except (ValueError, TypeError, OverflowError) as err:
            warnings.warn(
                f"clean_columns: could not convert column {col!r} to datetime ({err}); "
                "leaving it unchanged",
                stacklevel=2,
            )

    return dataframe.columns

In [7]:
clean_columns(indicators)

Index(['dd/mm/yy', 'year', 'month', 'national_architect_billing/monthly',
       'total_federal_construction/billion_$',
       'total_state_and_local_construction/billion_$',
       'us_steel_capacity_utilization',
       'us_long_steel/million_$_per_short_ton',
       'us_domestic_hot-rolled_coil/million_$_per_short_ton',
       'msci_carbon_bar_monthly_shipments',
       'msci_carbon_bar_monthly_shipments_(canada)',
       'structural_shipments_/usa', 'structural_shipments_/canada',
       'ism_manufacturing', 'ism_non_manufacturing',
       'global_steel_utilization', 'chicago#1_scrap_bushelling'],
      dtype='object')

### Create Month & Year Columns

In [8]:
# Derive numeric year/month from the date column, overwriting the workbook's own year/month columns.
date_parts = indicators['dd/mm/yy'].dt
indicators['year'] = date_parts.year
indicators['month'] = date_parts.month

### Rename Columns

In [9]:
indicators.columns

Index(['dd/mm/yy', 'year', 'month', 'national_architect_billing/monthly',
       'total_federal_construction/billion_$',
       'total_state_and_local_construction/billion_$',
       'us_steel_capacity_utilization',
       'us_long_steel/million_$_per_short_ton',
       'us_domestic_hot-rolled_coil/million_$_per_short_ton',
       'msci_carbon_bar_monthly_shipments',
       'msci_carbon_bar_monthly_shipments_(canada)',
       'structural_shipments_/usa', 'structural_shipments_/canada',
       'ism_manufacturing', 'ism_non_manufacturing',
       'global_steel_utilization', 'chicago#1_scrap_bushelling'],
      dtype='object')

In [10]:
# Rename columns to short names, then keep only the ones used downstream.
column_renames = {
    'dd/mm/yy': 'calendar_day',
    'total_federal_construction/billion_$': 'fed_construction$B',
    'total_state_and_local_construction/billion_$': 'state_local_construction$B',
    'msci_carbon_bar_monthly_shipments': 'carbon_bar_ship_usa',
    'msci_carbon_bar_monthly_shipments_(canada)': 'carbon_bar_ship_canada',
    'structural_shipments_/usa': 'structural_ship_usa',
    'structural_shipments_/canada': 'structural_ship_canada',
    'chicago#1_scrap_bushelling': 'chicago_scrap$ST',
}
kept_columns = [
    'calendar_day', 'year', 'month', 'global_steel_utilization',
    'fed_construction$B', 'state_local_construction$B',
    'carbon_bar_ship_usa', 'carbon_bar_ship_canada',
    'structural_ship_usa', 'structural_ship_canada',
    'ism_manufacturing', 'ism_non_manufacturing',
    'chicago_scrap$ST',
]
indicators = indicators.rename(columns=column_renames)[kept_columns]

### Selecting dates from 2016 through January 2020

In [11]:
# Keep rows dated from 2016 through 2020-01-01 (inclusive).
in_range = (indicators['calendar_day'].dt.year >= 2016) & (indicators['calendar_day'] <= '2020-01-01')
# .copy() makes the filtered result an independent frame, so later column
# assignments on `indicators` do not raise SettingWithCopyWarning.
indicators = indicators[in_range].copy()

In [12]:
indicators.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49 entries, 72 to 120
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   calendar_day                49 non-null     datetime64[ns]
 1   year                        49 non-null     int64         
 2   month                       49 non-null     int64         
 3   global_steel_utilization    45 non-null     float64       
 4   fed_construction$B          49 non-null     float64       
 5   state_local_construction$B  49 non-null     float64       
 6   carbon_bar_ship_usa         49 non-null     float64       
 7   carbon_bar_ship_canada      49 non-null     float64       
 8   structural_ship_usa         49 non-null     float64       
 9   structural_ship_canada      49 non-null     float64       
 10  ism_manufacturing           49 non-null     float64       
 11  ism_non_manufacturing       49 non-null     float64       

### Fill NaNs

In [13]:
# Fill the missing utilization readings, assuming a gradual increase.
# The dict keys are row index labels of `indicators`, so they depend on the
# row order of the source workbook.
values = {
    108: 0.7768,
    109: 0.7915,
    110: 0.8063,
    111: 0.8210,
}
utilization_filled = indicators['global_steel_utilization'].fillna(value=values, axis=0)
indicators['global_steel_utilization'] = utilization_filled

In [14]:
indicators.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49 entries, 72 to 120
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   calendar_day                49 non-null     datetime64[ns]
 1   year                        49 non-null     int64         
 2   month                       49 non-null     int64         
 3   global_steel_utilization    49 non-null     float64       
 4   fed_construction$B          49 non-null     float64       
 5   state_local_construction$B  49 non-null     float64       
 6   carbon_bar_ship_usa         49 non-null     float64       
 7   carbon_bar_ship_canada      49 non-null     float64       
 8   structural_ship_usa         49 non-null     float64       
 9   structural_ship_canada      49 non-null     float64       
 10  ism_manufacturing           49 non-null     float64       
 11  ism_non_manufacturing       49 non-null     float64       

## Save to file

In [15]:
#indicators.to_pickle(ser_path+'indicators.pkl')

## Preparing Structural df
**TODO:** Consider whether ingesting monthly values would work better than weekly ones.

### Importing Structural df

In [16]:
# Load the previously serialized structural dataset.
structural_file = ser_path + "structural_merged.pkl"
structural = pd.read_pickle(structural_file)

In [17]:
structural.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158270 entries, 0 to 158269
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   calendar_day              158270 non-null  datetime64[ns]
 1   year                      158270 non-null  object        
 2   month                     158270 non-null  object        
 3   week                      158270 non-null  object        
 4   block                     158270 non-null  object        
 5   size                      158270 non-null  object        
 6   IN_tons                   158270 non-null  float64       
 7   SO_order_qty_sales_units  158270 non-null  float64       
 8   SH_shipment_tons          158270 non-null  float64       
 9   PR_ton                    158270 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(5)
memory usage: 12.1+ MB


In [18]:
# Remove the time-part columns to permit merging; year/month are rebuilt after resampling.
time_part_columns = ['year', 'month', 'week']
structural = structural.drop(columns=time_part_columns)

In [19]:
structural.columns

Index(['calendar_day', 'block', 'size', 'IN_tons', 'SO_order_qty_sales_units',
       'SH_shipment_tons', 'PR_ton'],
      dtype='object')

### Creating Weekly dataframe for merging

In [20]:
# Total each block's measures per calendar week, then order the rows by week.
# NOTE(review): this relies on .sum() leaving out the non-numeric `size` column;
# newer pandas releases may need an explicit numeric_only=True here — confirm
# against the installed version.
weekly_totals = structural.groupby('block').resample('W', on='calendar_day').sum()
weekly = weekly_totals.sort_values('calendar_day').reset_index()

In [21]:
weekly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12519 entries, 0 to 12518
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   block                     12519 non-null  object        
 1   calendar_day              12519 non-null  datetime64[ns]
 2   IN_tons                   12519 non-null  float64       
 3   SO_order_qty_sales_units  12519 non-null  float64       
 4   SH_shipment_tons          12519 non-null  float64       
 5   PR_ton                    12519 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 587.0+ KB


In [22]:
weekly.head(10)

Unnamed: 0,block,calendar_day,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
0,"3-1/2"" ANG",2016-01-03,207.261,0.0,0.0,0.0
1,"9"" CHN",2016-01-03,43.296,0.0,0.0,0.0
2,"8 X 4"" ANG",2016-01-03,14.112,0.0,0.0,0.0
3,"6"" CHN",2016-01-03,1857.997,0.0,0.0,0.0
4,"4 X 3"" ANG",2016-01-03,472.99,0.0,0.0,0.0
5,"5 X 3-1/2"" ANG",2016-01-03,1888.34,0.0,0.0,0.0
6,"6"" FLT",2016-01-03,1102.054,2.244,0.0,0.0
7,"8"" FLT",2016-01-03,867.865,2.244,0.0,0.0
8,"5 X 3"" ANG",2016-01-03,1172.033,0.0,0.0,0.0
9,"8"" CHN",2016-01-03,958.505,0.0,0.0,0.0


In [23]:
# Rebuild the month/year merge keys from each weekly row's date.
week_dates = weekly['calendar_day'].dt
weekly['month'] = week_dates.month
weekly['year'] = week_dates.year

In [24]:
# Put the weekly columns in merge-ready order and preview the last rows.
weekly_column_order = [
    'calendar_day', 'year', 'month', 'block',
    'IN_tons', 'PR_ton', 'SO_order_qty_sales_units', 'SH_shipment_tons',
]
weekly = weekly[weekly_column_order]
weekly.tail()

Unnamed: 0,calendar_day,year,month,block,IN_tons,PR_ton,SO_order_qty_sales_units,SH_shipment_tons
12514,2020-01-05,2020,1,"7"" FLT",329.272012,0.0,4.76,0.0
12515,2020-01-05,2020,1,"10"" CHN",6794.873987,-1.0,142.238,162.0
12516,2020-01-05,2020,1,"6"" CHN",1631.336032,0.0,122.52,48.0
12517,2020-01-05,2020,1,"9"" FLT",0.052,0.0,0.0,0.0
12518,2020-01-05,2020,1,"5"" FLT",670.258017,0.0,32.895,23.0


### Creating Monthly for Merging

In [26]:
# Total each block's measures per calendar month, then order the rows by month.
# NOTE(review): this relies on .sum() leaving out the non-numeric `size` column;
# newer pandas releases may need an explicit numeric_only=True here — confirm
# against the installed version.
monthly_totals = structural.groupby('block').resample('M', on='calendar_day').sum()
monthly = monthly_totals.sort_values('calendar_day').reset_index()

In [27]:
monthly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2936 entries, 0 to 2935
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   block                     2936 non-null   object        
 1   calendar_day              2936 non-null   datetime64[ns]
 2   IN_tons                   2936 non-null   float64       
 3   SO_order_qty_sales_units  2936 non-null   float64       
 4   SH_shipment_tons          2936 non-null   float64       
 5   PR_ton                    2936 non-null   float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 137.8+ KB


In [28]:
monthly.head(5)

Unnamed: 0,block,calendar_day,IN_tons,SO_order_qty_sales_units,SH_shipment_tons,PR_ton
0,"3-1/2"" ANG",2016-01-31,21287.255,880.992,922.0,2008.0
1,"5"" FLT",2016-01-31,5544.345,197.17,140.0,0.0
2,"2 X 1"" CHN",2016-01-31,5.008,14.607,6.0,0.0
3,"15"" CHN",2016-01-31,156.618,0.0,9.0,0.0
4,"6 X 3-1/2"" ANG",2016-01-31,17769.578,376.07,501.0,-17.0


In [29]:
# Rebuild the month/year merge keys from each monthly row's date.
month_dates = monthly['calendar_day'].dt
monthly['month'] = month_dates.month
monthly['year'] = month_dates.year

In [30]:
# Put the monthly columns in merge-ready order and preview the last rows.
monthly_column_order = [
    'calendar_day', 'year', 'month', 'block',
    'IN_tons', 'PR_ton', 'SO_order_qty_sales_units', 'SH_shipment_tons',
]
monthly = monthly[monthly_column_order]
monthly.tail()

Unnamed: 0,calendar_day,year,month,block,IN_tons,PR_ton,SO_order_qty_sales_units,SH_shipment_tons
2931,2020-01-31,2020,1,"MC4"" CHN",233.496001,0.0,0.0,11.0
2932,2020-01-31,2020,1,"4 X 3"" ANG",1184.441982,0.0,69.13,66.0
2933,2020-01-31,2020,1,"6"" CHN",801.832017,0.0,53.508,48.0
2934,2020-01-31,2020,1,"5"" CHN",432.594001,0.0,0.0,17.0
2935,2020-01-31,2020,1,"1"" SB FLT",-0.002,0.0,0.0,0.0


## Merging Structural+Indicators

### Weekly Merge with Indicators

In [26]:
# Attach the indicators to every weekly row via (year, month);
# validate='m:1' makes pandas raise if `indicators` repeats a key pair.
struct_indicators = pd.merge(
    weekly,
    indicators,
    how='left',
    on=['year', 'month'],
    validate='m:1',
)

In [27]:
# Discard the indicators' date (calendar_day_y) and give the weekly date its original name back.
weekly_without_indicator_date = struct_indicators.drop(columns='calendar_day_y')
struct_indicators = weekly_without_indicator_date.rename(columns={'calendar_day_x': 'calendar_day'})

In [29]:
struct_indicators.sample(10)

Unnamed: 0,calendar_day,year,month,block,IN_tons,PR_ton,SO_order_qty_sales_units,SH_shipment_tons,global_steel_utilization,fed_construction$B,state_local_construction$B,carbon_bar_ship_usa,carbon_bar_ship_canada,structural_ship_usa,structural_ship_canada,ism_manufacturing,ism_non_manufacturing,chicago_scrap$ST
7013,2018-04-01,2018,4,"3/16X3/4"" FLT",0.0,0.0,0.0,0.0,0.771,1630.0,21280.0,243100.0,31900.0,223200.0,37700.0,57.3,56.8,385.0
4346,2017-06-11,2017,6,"5 X 3"" ANG",1138.654989,0.0,117.048,120.0,0.747,1905.0,23970.0,243300.0,33200.0,216300.0,38700.0,57.8,57.4,350.0
2774,2016-12-18,2016,12,"1"" FLT",5.028,0.0,0.0,5.0,0.682,2026.0,17951.0,171700.0,22400.0,174600.0,28400.0,54.5,56.6,275.0
3354,2017-02-19,2017,2,"5"" CHN",2264.804957,0.0,17.688,123.0,0.715,1678.0,16242.0,219300.0,28500.0,180400.0,32200.0,57.7,57.6,305.0
8890,2018-11-04,2018,11,"1-1/4"" FLT",0.0,0.0,0.0,0.0,0.762,1880.0,23202.0,205600.0,33300.0,189200.0,38000.0,59.3,60.7,405.0
10959,2019-06-23,2019,6,"1/2"" SQR",0.005,0.0,0.0,0.0,0.82,2007.0,28994.0,186600.0,32500.0,184000.0,36700.0,51.7,55.1,285.0
1928,2016-09-11,2016,9,"3"" FLT",7.752,0.0,4.896,7.0,0.685,2405.0,25578.0,192500.0,29200.0,188800.0,37600.0,51.5,57.1,235.0
3189,2017-02-05,2017,2,"10"" CHN",3687.908011,118.0,99.818,131.0,0.715,1678.0,16242.0,219300.0,28500.0,180400.0,32200.0,57.7,57.6,305.0
8925,2018-11-04,2018,11,"2 X 1"" CHN",2.285,0.0,10.022,2.0,0.762,1880.0,23202.0,205600.0,33300.0,189200.0,38000.0,59.3,60.7,405.0
11295,2019-08-04,2019,8,"2"" MED FLT",0.0,0.0,0.0,0.0,0.802,2146.0,31328.0,197600.0,31900.0,208600.0,38500.0,49.1,56.4,300.0


In [30]:
struct_indicators.columns

Index(['calendar_day', 'year', 'month', 'block', 'IN_tons', 'PR_ton',
       'SO_order_qty_sales_units', 'SH_shipment_tons',
       'global_steel_utilization', 'fed_construction$B',
       'state_local_construction$B', 'carbon_bar_ship_usa',
       'carbon_bar_ship_canada', 'structural_ship_usa',
       'structural_ship_canada', 'ism_manufacturing', 'ism_non_manufacturing',
       'chicago_scrap$ST'],
      dtype='object')

In [32]:
# Rearrange columns: date keys first, then the indicators, then the per-block measures.
weekly_final_order = [
    'calendar_day', 'year', 'month',
    'global_steel_utilization', 'fed_construction$B',
    'state_local_construction$B', 'carbon_bar_ship_usa',
    'carbon_bar_ship_canada', 'structural_ship_usa',
    'structural_ship_canada', 'ism_manufacturing', 'ism_non_manufacturing',
    'chicago_scrap$ST',
    'block', 'IN_tons', 'PR_ton',
    'SO_order_qty_sales_units', 'SH_shipment_tons',
]
struct_indicators = struct_indicators[weekly_final_order]

In [33]:
struct_indicators.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12519 entries, 0 to 12518
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   calendar_day                12519 non-null  datetime64[ns]
 1   year                        12519 non-null  int64         
 2   month                       12519 non-null  int64         
 3   global_steel_utilization    12519 non-null  float64       
 4   fed_construction$B          12519 non-null  float64       
 5   state_local_construction$B  12519 non-null  float64       
 6   carbon_bar_ship_usa         12519 non-null  float64       
 7   carbon_bar_ship_canada      12519 non-null  float64       
 8   structural_ship_usa         12519 non-null  float64       
 9   structural_ship_canada      12519 non-null  float64       
 10  ism_manufacturing           12519 non-null  float64       
 11  ism_non_manufacturing       12519 non-null  float64   

### Monthly Merge with Indicators

In [31]:
# Attach the indicators to every monthly row via (year, month);
# validate='m:1' makes pandas raise if `indicators` repeats a key pair.
monthly_indicators = pd.merge(
    monthly,
    indicators,
    how='left',
    on=['year', 'month'],
    validate='m:1',
)

In [32]:
monthly_indicators.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2936 entries, 0 to 2935
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   calendar_day_x              2936 non-null   datetime64[ns]
 1   year                        2936 non-null   int64         
 2   month                       2936 non-null   int64         
 3   block                       2936 non-null   object        
 4   IN_tons                     2936 non-null   float64       
 5   PR_ton                      2936 non-null   float64       
 6   SO_order_qty_sales_units    2936 non-null   float64       
 7   SH_shipment_tons            2936 non-null   float64       
 8   calendar_day_y              2936 non-null   datetime64[ns]
 9   global_steel_utilization    2936 non-null   float64       
 10  fed_construction$B          2936 non-null   float64       
 11  state_local_construction$B  2936 non-null   float64     

In [33]:
# Discard the indicators' date (calendar_day_y) and give the monthly date its original name back.
monthly_without_indicator_date = monthly_indicators.drop(columns='calendar_day_y')
monthly_indicators = monthly_without_indicator_date.rename(columns={'calendar_day_x': 'calendar_day'})

In [34]:
# Rearrange columns: date keys first, then the indicators, then the per-block measures.
monthly_final_order = [
    'calendar_day', 'year', 'month',
    'global_steel_utilization', 'fed_construction$B',
    'state_local_construction$B', 'carbon_bar_ship_usa',
    'carbon_bar_ship_canada', 'structural_ship_usa',
    'structural_ship_canada', 'ism_manufacturing', 'ism_non_manufacturing',
    'chicago_scrap$ST',
    'block', 'IN_tons', 'PR_ton',
    'SO_order_qty_sales_units', 'SH_shipment_tons',
]
monthly_indicators = monthly_indicators[monthly_final_order]

In [35]:
monthly_indicators.columns

Index(['calendar_day', 'year', 'month', 'global_steel_utilization',
       'fed_construction$B', 'state_local_construction$B',
       'carbon_bar_ship_usa', 'carbon_bar_ship_canada', 'structural_ship_usa',
       'structural_ship_canada', 'ism_manufacturing', 'ism_non_manufacturing',
       'chicago_scrap$ST', 'block', 'IN_tons', 'PR_ton',
       'SO_order_qty_sales_units', 'SH_shipment_tons'],
      dtype='object')

In [36]:
monthly_indicators.sample(10)

Unnamed: 0,calendar_day,year,month,global_steel_utilization,fed_construction$B,state_local_construction$B,carbon_bar_ship_usa,carbon_bar_ship_canada,structural_ship_usa,structural_ship_canada,ism_manufacturing,ism_non_manufacturing,chicago_scrap$ST,block,IN_tons,PR_ton,SO_order_qty_sales_units,SH_shipment_tons
1460,2018-01-31,2018,1,0.727,2199.0,16621.0,247100.0,34200.0,218000.0,37400.0,59.1,59.9,375.0,"1-2"" LRG FLT",4.896,0.0,2.448,6.0
2712,2019-09-30,2019,9,0.783,2590.0,30251.0,179000.0,30900.0,195800.0,37300.0,47.8,52.6,260.0,"9"" CHN",6919.291984,0.0,125.256,118.0
1915,2018-08-31,2018,8,0.7603,2025.0,29939.0,250500.0,33200.0,231500.0,38500.0,61.3,58.5,395.0,"3-1/2"" ANG",55161.386247,3339.0,1153.475,1549.0
1653,2018-04-30,2018,4,0.771,1630.0,21280.0,243100.0,31900.0,223200.0,37700.0,57.3,56.8,385.0,"1-1/4"" SQR",0.0,0.0,0.0,0.0
2839,2019-11-30,2019,11,0.793,2026.0,25357.0,159600.0,32600.0,176900.0,35400.0,48.1,53.9,240.0,"5/8"" SQR",0.0,0.0,5.046,0.0
64,2016-02-29,2016,2,0.66,1840.0,17198.0,205400.0,31000.0,187000.0,37500.0,49.5,53.4,180.0,"8"" CHN",17624.909,12.0,561.765,674.0
2898,2020-01-31,2020,1,0.819,1970.0,19685.0,198100.0,37400.0,202700.0,33500.0,50.9,55.5,300.0,"5 X 3-1/2"" ANG",2109.863982,0.0,76.32,56.0
839,2017-03-31,2017,3,0.723,1924.0,17953.0,257400.0,34900.0,217100.0,37300.0,57.2,55.2,370.0,"1-1/4"" SQR",0.0,0.0,2.55,0.0
1615,2018-03-31,2018,3,0.747,1689.0,18625.0,255200.0,34700.0,234500.0,39300.0,59.3,58.8,375.0,"1"" CHN",0.0,0.0,0.0,3.0
1410,2017-12-31,2017,12,0.694,2235.0,18835.0,195400.0,24100.0,183400.0,30700.0,59.7,55.9,345.0,"6"" FLT",37076.147938,0.0,643.65,396.0


### Saving to formats for digestion

In [34]:
#Saving df into pickle and csv
#struct_indicators.to_csv(path+"structuralweekly_with_indicators.csv")
#struct_indicators.to_pickle(ser_path+"structuralweekly_with_indicators.pkl")

In [37]:
#Saving df into pickle and csv
#monthly_indicators.to_csv(path+"structuralmonthly_with_indicators.csv")
#monthly_indicators.to_pickle(ser_path+"structuralmonthly_with_indicators.pkl")