# Supply and Demand: Machine learning Testing

### Objective 
Establish a correlation between the CUST_ID in the supply data and the potential amount of SKU_ID against the POS_Name based on BLDG_ID and SqFt_est using a random forest machine learning model.

### Conceptual Diagram
SUPPLY
Table A -> Table B -> Table C[CUST_ID]

DEMAND
Table X -> Table Y -> Table Z[POS_NAME]

### Outline
1. Data Preparation
2. Feature Engineering 
3. Splitting Data
4. Random Forest Model
5. Model Evaluation

---
# Data Preparation

In [36]:
import pandas as pd

In [37]:
# BASELINE TABLES FILEPATHS
table_1_path = '../data/0_key.csv'
table_2_path = '../data/0_sctr.csv'
table_3_path = '../data/0_sku.csv'

# SUPPLY TABLES FILEPATHS
table_A_path = '../data/1_dist.csv'
table_B_path = '../data/1_inv.csv'
table_C_path = '../data/1_smtx.csv'

# DEMAND TABLES FILEPATHS
table_X_path = '../data/2_bldg.csv'
table_Y_path = '../data/2_biz.csv'
table_Z_path = '../data/2_dmtx.csv'

# CORRELATION DATA FILEPATHS
table_5_path = '../data/2_use.csv'

In [38]:
def read_and_clean_csv(file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Drop rows with NaN values
    df_cleaned = df.dropna(how='all')
    
    return df_cleaned

In [39]:
# Read and clean the CSV files
table_A_df = read_and_clean_csv(table_A_path)
table_B_df = read_and_clean_csv(table_B_path)
table_C_df = read_and_clean_csv(table_C_path)

table_X_df = read_and_clean_csv(table_X_path)
table_Y_df = read_and_clean_csv(table_Y_path)
table_Z_df = read_and_clean_csv(table_Z_path)

table_1_df = read_and_clean_csv(table_1_path)
table_2_df = read_and_clean_csv(table_2_path)
table_3_df = read_and_clean_csv(table_3_path)
table_5_df = read_and_clean_csv(table_5_path)

##### Merging into two major tables

In [40]:
fs_index = table_1_df
htx_sectors = table_2_df
fs_types = table_3_df

land_use = table_5_df

In [41]:
dist_list = table_A_df
dist_inv = table_B_df
supply_matrix = table_C_df

In [42]:
bldg_list = table_X_df
bldg_inv = table_Y_df
demand_matrix = table_Z_df 

---
### Supply

In [43]:
#du_col_drop = ['Unnamed: 8','Unnamed: 9']
#dist_list = dist_list.drop(columns=du_col_drop)
dist_list.head(4)

Unnamed: 0,CUST_ID,schema_id,CUST__NAME,supply_desc,Address Type,CUST_ADDRESS,longitude,latitude
0,2001000,S02,Banyan Foods,food packaging,point_of_sale,"10940 Wilcrest Dr, Houston, TX 77099",-95.568779,29.664062
1,2001001,S02,Houston Calco,food manufacturing,warehouse,"2400 Dallas St, Houston, TX 77003",-95.354057,29.74809
2,2001002,S02,Tan Tan Tofu,food manufacturing,point_of_sale,"6791 Wilcrest Dr. Houston, TX 77072",-95.571122,29.704095
3,2001003,S02,Thanh Son Tofu,food manufacturing,registered_address,"13574 SAN MARTIN LN HOUSTON, TX 77083",-95.623345,29.695825


In [44]:
dist_inv.head(2)

Unnamed: 0,CUST_ID,CUST__NAME,SKU_ID,sku_name,POS_Pri,POS_Sec,POS_Ter,B01_supply,B02_supply,B03_supply,S01_supply,S02_supply,S03_supply
0,2001000,Banyan Foods,1001018,tofu,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,0.0,675.463041,0.0
1,2001000,Banyan Foods,1001019,soymilk,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,0.0,545.566303,0.0


In [45]:
dist_inv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CUST_ID     169 non-null    int64  
 1   CUST__NAME  169 non-null    object 
 2   SKU_ID      169 non-null    int64  
 3   sku_name    169 non-null    object 
 4   POS_Pri     169 non-null    object 
 5   POS_Sec     169 non-null    object 
 6   POS_Ter     169 non-null    object 
 7   B01_supply  169 non-null    float64
 8   B02_supply  169 non-null    float64
 9   B03_supply  169 non-null    float64
 10  S01_supply  169 non-null    float64
 11  S02_supply  169 non-null    float64
 12  S03_supply  169 non-null    float64
dtypes: float64(6), int64(2), object(5)
memory usage: 17.3+ KB


In [46]:
dist_inv.rename(columns={'POS_Sec':'POS_TYPE_focus'}, inplace=True)

dist_inv as demand table

In [47]:
supply_data = dist_inv

---
### Demand

In [48]:
bldg_list.head(4)

Unnamed: 0,BLDG_ID,SqFt_per1k,SqFt_est,sectorid,POS_StateClass,POS_StateClassCd,longitude,latitude
0,317,1.77722,1777.219873,1,RESIDENTIAL,A1,-95.3526,29.76921
1,630,2.179124,2179.123515,1,RESIDENTIAL,A1,-95.3526,29.76901
2,1132,2.127309,2127.308579,1,RESIDENTIAL,A1,-95.3525,29.76926
3,1277,4.427083,4427.082666,1,RESIDENTIAL,A1,-95.3545,29.76754


In [49]:
bldg_inv.head(4)

Unnamed: 0,BLDG_ID,POS_Contact,POS_ID,POS_Name,Longitude,Latitude,POS_SELSIC,POS_Standard Industrial Classification,POS_NAICS,POS_North American Industry Classification,POS_TYPE_focus,POS_Revenue,Owner_Ethnicity,Owner_Type,Owner_Established,Owner_CreditScore,SqFtCd,SqFt_range
0,1312.0,MUHAMMAD ABBAS,70001000,STAR LINE LOGISTICS LLC,-95.3663,29.7495,874266,LOGISTICS,54161125,ADMINISTRATIVE & GENERAL MGMT CONSULTING SERVICES,,$1-2.5 MILLION,PAKISTANI,FIRM,2020,B+,5.0,"10,000 - 19,999"
1,1045.0,RAYNEL L HUDSON,70001001,LRJ LOGISTICS SOLUTIONS LLC,-95.3686,29.7518,874266,LOGISTICS,54161125,ADMINISTRATIVE & GENERAL MGMT CONSULTING SERVICES,,"$500,000-$1 MILLION",ENGLISH,FIRM,2021,B+,,
2,675.0,,70001002,GUIDEHOUSE INC,-95.3636,29.7591,874264,CONSULTING SERVICES,54161113,ADMINISTRATIVE & GENERAL MGMT CONSULTING SERVICES,,,,FIRM,2021,U,,
3,777.0,MATTHIAS WONG,70001003,RBC DOMINION SECURITIES C & CO,-95.3624,29.7593,874209,FINANCIAL MANAGEMENT & CONSULTING,54161114,ADMINISTRATIVE & GENERAL MGMT CONSULTING SERVICES,,$5-10 MILLION,CHINESE,FIRM,2000,A+,8.0,"100,000+"


In [50]:
bldg_demand_inv = bldg_inv[['BLDG_ID','POS_ID','POS_Standard Industrial Classification','POS_TYPE_focus','POS_Revenue','Owner_Established']]

In [51]:
bldg_demand_inv = bldg_demand_inv.dropna(subset=['POS_TYPE_focus'])
bldg_demand_inv

Unnamed: 0,BLDG_ID,POS_ID,POS_Standard Industrial Classification,POS_TYPE_focus,POS_Revenue,Owner_Established
188,513.0,70001188,YOGURT,FOOD STORES,"LESS THAN $500,000",2017
189,400.0,70001189,WATER COMPANIES-BOTTLED/BULK & ETC,FOOD STORES,"LESS THAN $500,000",2016
190,655.0,70001190,HERBS,FOOD STORES,"LESS THAN $500,000",1997
191,1223.0,70001191,COFFEE & TEA,FOOD STORES,"LESS THAN $500,000",2015
375,858.0,70001375,BEER & ALE-RETAIL,BEVERAGE STORES,,2016
...,...,...,...,...,...,...
5229,735.0,70006229,THEATRES-LIVE,ENTERTAINMENT,"LESS THAN $500,000",2013
5230,735.0,70006230,PERFORMING ARTS,ENTERTAINMENT,"LESS THAN $500,000",2012
5231,,70006231,OPERA COMPANIES,ENTERTAINMENT,$20-50 MILLION,1991
5232,,70006232,THEATRES-LIVE,ENTERTAINMENT,$2.5-5 MILLION,2001


In [52]:
demand_matrix

Unnamed: 0,POS_StateClass,d_min,d_max
0,COMMERCIAL,0.1285,0.24
1,COMMUNITY,0.01,0.04
2,INDUSTRIAL,0.1285,0.45
3,RESIDENTIAL,0.01,0.04
4,UNCLASSED,0.0,0.0
5,UTILITY,0.0,0.0
6,VACANT,0.0,0.0


In [53]:
bldg_biz = bldg_demand_inv.merge(bldg_list[['BLDG_ID','POS_StateClass','SqFt_per1k','sectorid']],on='BLDG_ID',how='left')

In [54]:
demand_data = bldg_biz.merge(demand_matrix, on='POS_StateClass',how='left')

In [55]:
demand_data.head(4)

Unnamed: 0,BLDG_ID,POS_ID,POS_Standard Industrial Classification,POS_TYPE_focus,POS_Revenue,Owner_Established,POS_StateClass,SqFt_per1k,sectorid,d_min,d_max
0,513.0,70001188,YOGURT,FOOD STORES,"LESS THAN $500,000",2017,RESIDENTIAL,32.968122,6.0,0.01,0.04
1,400.0,70001189,WATER COMPANIES-BOTTLED/BULK & ETC,FOOD STORES,"LESS THAN $500,000",2016,COMMERCIAL,61.716932,9.0,0.1285,0.24
2,655.0,70001190,HERBS,FOOD STORES,"LESS THAN $500,000",1997,COMMERCIAL,17.280532,0.0,0.1285,0.24
3,1223.0,70001191,COFFEE & TEA,FOOD STORES,"LESS THAN $500,000",2015,COMMERCIAL,10.54927,0.0,0.1285,0.24


---
### Merged

In [56]:
primary_comparison = supply_data.merge(demand_data,on='POS_TYPE_focus',how='left')

In [57]:
primary_comparison.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31886 entries, 0 to 31885
Data columns (total 23 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   CUST_ID                                 31886 non-null  int64  
 1   CUST__NAME                              31886 non-null  object 
 2   SKU_ID                                  31886 non-null  int64  
 3   sku_name                                31886 non-null  object 
 4   POS_Pri                                 31886 non-null  object 
 5   POS_TYPE_focus                          31886 non-null  object 
 6   POS_Ter                                 31886 non-null  object 
 7   B01_supply                              31886 non-null  float64
 8   B02_supply                              31886 non-null  float64
 9   B03_supply                              31886 non-null  float64
 10  S01_supply                              31886 non-null  fl

---

# Feature Engineering

In [58]:
pri_comp_onehot = ['sectorid','POS_Revenue']

In [59]:
pri_comp_encoded = pd.get_dummies(primary_comparison, columns=pri_comp_onehot)

In [60]:
pri_comp_encoded.head(4)

Unnamed: 0,CUST_ID,CUST__NAME,SKU_ID,sku_name,POS_Pri,POS_TYPE_focus,POS_Ter,B01_supply,B02_supply,B03_supply,...,sectorid_8.0,sectorid_9.0,sectorid_10.0,sectorid_11.0,POS_Revenue_$1-2.5 MILLION,POS_Revenue_$2.5-5 MILLION,POS_Revenue_$20-50 MILLION,POS_Revenue_$5-10 MILLION,"POS_Revenue_$500,000-$1 MILLION","POS_Revenue_LESS THAN $500,000"
0,2001000,Banyan Foods,1001018,tofu,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,2001000,Banyan Foods,1001018,tofu,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,2001000,Banyan Foods,1001018,tofu,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1
3,2001000,Banyan Foods,1001018,tofu,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


---
# Splitting the Datasets

In [61]:
import numpy as np


In [62]:
X = pri_comp_encoded[['SqFt_per1k', 'POS_Revenue_$5-10 MILLION', 'POS_Revenue_$2.5-5 MILLION', 'POS_Revenue_LESS THAN $500,000']]


In [63]:
y_columns = ['B01_supply', 'B02_supply', 'B03_supply', 'S01_supply', 'S02_supply', 'S03_supply']
y = pri_comp_encoded[y_columns]


In [66]:
multiplier = np.where(pri_comp_encoded['POS_Revenue_$5-10 MILLION'] == 1, 0.75,
                               np.where(pri_comp_encoded['POS_Revenue_$2.5-5 MILLION'] == 1, 0.5,
                                        np.where(pri_comp_encoded['POS_Revenue_LESS THAN $500,000'] == 1, 0.35, 1)))

In [67]:
for col in y_columns:
    X[col] = pri_comp_encoded['SqFt_per1k'] * multiplier

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pri_comp_encoded['SqFt_per1k'] * multiplier
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pri_comp_encoded['SqFt_per1k'] * multiplier
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pri_comp_encoded['SqFt_per1k'] * multiplier
A value is trying to be set on a copy of a sli

In [68]:
X = X.fillna(0) 

----


In [69]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


---
# Running the Model

In [71]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [72]:
y_pred = model.predict(X_test)


In [73]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 2126.01249266946


---
# Predictions

In [74]:
predicted_inv = model.predict(X)

In [75]:
predicted_supply_df = pd.DataFrame(predicted_inv, columns=y_columns)

In [76]:
pri_comp_encoded[['B01_inv', 'B02_inv', 'B03_inv', 'S01_inv', 'S02_inv', 'S03_inv']] = predicted_supply_df


In [77]:
pri_comp_encoded.head(4)

Unnamed: 0,CUST_ID,CUST__NAME,SKU_ID,sku_name,POS_Pri,POS_TYPE_focus,POS_Ter,B01_supply,B02_supply,B03_supply,...,POS_Revenue_$20-50 MILLION,POS_Revenue_$5-10 MILLION,"POS_Revenue_$500,000-$1 MILLION","POS_Revenue_LESS THAN $500,000",B01_inv,B02_inv,B03_inv,S01_inv,S02_inv,S03_inv
0,2001000,Banyan Foods,1001018,tofu,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,...,0,0,0,1,4960.404059,1675.063336,454.739557,0.0,62.738716,0.0
1,2001000,Banyan Foods,1001018,tofu,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,...,0,0,0,1,5006.507034,1745.41448,454.234736,735.510691,30.32179,240.227
2,2001000,Banyan Foods,1001018,tofu,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,...,0,0,0,1,0.0,49.252964,0.0,0.0,622.904347,0.0
3,2001000,Banyan Foods,1001018,tofu,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,...,0,0,1,0,0.0,0.0,0.0,0.0,605.760414,0.0


In [78]:
pri_comp_encoded.to_csv('../data/sec_comp_inv.csv',index=False)