# Supply and Demand: Machine learning Testing

### Objective 
Establish a correlation between the CUST_ID in the supply data and the potential amount of SKU_ID against the POS_Name based on BLDG_ID and SqFt_est using a random forest machine learning model.

### Conceptual Diagram
SUPPLY
Table A -> Table B -> Table C[CUST_ID]

DEMAND
Table X -> Table Y -> Table Z[POS_NAME]

### Outline
1. Data Preparation
2. Feature Engineering 
3. Splitting Data
4. Random Forest Model
5. Model Evaluation

---
# Data Preparation

In [1]:
import pandas as pd

In [2]:
# BASELINE TABLES FILEPATHS
table_1_path = '../data/0_key.csv'
table_2_path = '../data/0_sctr.csv'
table_3_path = '../data/0_sku.csv'

# SUPPLY TABLES FILEPATHS
table_A_path = '../data/1_dist.csv'
table_B_path = '../data/1_inv.csv'
table_C_path = '../data/1_smtx.csv'

# DEMAND TABLES FILEPATHS
table_X_path = '../data/2_bldg.csv'
table_Y_path = '../data/2_biz.csv'
table_Z_path = '../data/2_dmtx.csv'

# CORRELATION DATA FILEPATHS
table_5_path = '../data/2_use.csv'

In [10]:
def read_and_clean_csv(file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Drop rows with NaN values
    df_cleaned = df.dropna(how='all')
    
    return df_cleaned

In [11]:
# Read and clean the CSV files
table_A_df = read_and_clean_csv(table_A_path)
table_B_df = read_and_clean_csv(table_B_path)
table_C_df = read_and_clean_csv(table_C_path)

table_X_df = read_and_clean_csv(table_X_path)
table_Y_df = read_and_clean_csv(table_Y_path)
table_Z_df = read_and_clean_csv(table_Z_path)

table_1_df = read_and_clean_csv(table_1_path)
table_2_df = read_and_clean_csv(table_2_path)
table_3_df = read_and_clean_csv(table_3_path)
table_5_df = read_and_clean_csv(table_5_path)

##### Merging into two major tables

In [12]:
fs_index = table_1_df
htx_sectors = table_2_df
fs_types = table_3_df

land_use = table_5_df

In [13]:
dist_list = table_A_df
dist_inv = table_B_df
supply_matrix = table_C_df

In [14]:
bldg_list = table_X_df
bldg_inv = table_Y_df
demand_matrix = table_Z_df 

---
### Supply

In [19]:
du_col_drop = ['Unnamed: 8','Unnamed: 9']
dist_list = dist_list.drop(columns=du_col_drop)
dist_list.head(4)

Unnamed: 0,CUST_ID,schema_id,CUST__NAME,supply_desc,Address Type,CUST_ADDRESS,longitude,latitude
0,2001000,S02,Banyan Foods,food packaging,point_of_sale,"10940 Wilcrest Dr, Houston, TX 77099",-95.568779,29.664062
1,2001001,S02,Houston Calco,food manufacturing,warehouse,"2400 Dallas St, Houston, TX 77003",-95.354057,29.74809
2,2001002,S02,Tan Tan Tofu,food manufacturing,point_of_sale,"6791 Wilcrest Dr. Houston, TX 77072",-95.571122,29.704095
3,2001003,S02,Thanh Son Tofu,food manufacturing,registered_address,"13574 SAN MARTIN LN HOUSTON, TX 77083",-95.623345,29.695825


In [24]:
dist_inv.head(2)

Unnamed: 0,CUST_ID,CUST__NAME,SKU_ID,sku_name,POS_Pri,POS_Sec,POS_Ter,B01_supply,B02_supply,B03_supply,S01_supply,S02_supply,S03_supply
0,2001000,Banyan Foods,1001018,tofu,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,0.0,675.463041,0.0
1,2001000,Banyan Foods,1001019,soymilk,FOOD STORES,GENERAL STORES,SPECIALTY FOODS,0.0,0.0,0.0,0.0,545.566303,0.0


In [34]:
merged_supply = pd.merge(pd.merge(table_A_df, table_B_df, on='schema_id'), table_C_df, on=['schema_id'])


---

# Feature Engineering

In [53]:
supply_categorical_columns = ['supply_desc_x','supply_desc_y','distribution','production']
demand_categorical_columns = ['POS_StateClass','POS_North American Industry Classification','POS_Revenue']

In [44]:
encoded_supply_df = pd.get_dummies(merged_supply, columns=supply_categorical_columns)

In [46]:
encoded_supply_df.head(3)

Unnamed: 0,schema_id,name_desc,avg_stock,avg_price_x,avg_mktshare,SKU_ID,sku_name,avg_price_y,CUST_ID,CUST__NAME,CUST_ADDRESS,supply_desc_x_food manufacturing,supply_desc_x_food packaging,supply_desc_y_food manufacturing,supply_desc_y_food packaging,distribution_commercial,distribution_wholesale,production_fermented,production_processed,production_unfermented
0,S01,Soybeans,802075,0.22,176456,1001020,soy nuts,4.49,2001005,Anu Resources,"11757 Katy Freeway, Suite 1300, Houston 77079 ...",0,1,0,1,1,0,0,0,1
1,S01,Soybeans,802075,0.22,176456,1001020,soy nuts,4.49,2001006,Al Hakeem Co,9318 Lynchester Dr Houston 77083 Texas United ...,0,1,0,1,1,0,0,0,1
2,S01,Soybeans,802075,0.22,176456,1001020,soy nuts,4.49,2001007,Lien Hoa B. Inc,7611 Summer Glen Ln Houston 77072 United States,0,1,0,1,1,0,0,0,1


In [54]:
encoded_demand_df = pd.get_dummies(merged_demand, columns=demand_categorical_columns)

In [55]:
encoded_demand_df.head(3)

Unnamed: 0,BLDG_ID,SqFtCd_x,SqFt_est,POS_Landuse,sectorid,POS_StateClassCd,center_x,center_y,POS_Contact,POS_Name,...,POS_StateClass_COMMERCIAL,POS_StateClass_COMMUNITY,POS_North American Industry Classification_DISTILLERIES,POS_North American Industry Classification_DRINKING PLACES ALCOHOLIC BEVERAGES,POS_North American Industry Classification_FULL-SERVICE RESTAURANTS,POS_Revenue_$1-2.5 MILLION,POS_Revenue_$2.5-5 MILLION,POS_Revenue_$5-10 MILLION,"POS_Revenue_$500,000-$1 MILLION","POS_Revenue_LESS THAN $500,000"
0,528,UNDER 5000SF,2317.5609,COMMERCIAL,2,F1,-95.3519,29.76013,IRMA G GALVAN,IRMA'S RESTAURANT,...,1,0,0,0,1,0,0,0,1,0
1,707,UNDER 5000SF,4948.907364,COMMERCIAL,4,F1,-95.3632,29.74908,LISA JUE,CHINA GARDEN,...,1,0,0,0,1,0,0,0,1,0
2,177,UNDER 5000SF,4861.121116,COMMERCIAL,10,F1,-95.3648,29.76156,ALLAN LEVINE,CULTIVATED F+B,...,1,0,0,0,1,0,1,0,0,0


---
# Splitting the Datasets

In [49]:
correlation_df = read_and_clean_csv(table_5_path)

In [50]:
correlation_df

Unnamed: 0,POS_TYPE_focus,S01,S02,S03,B01,B02,B03,Total
0,BAR,0.1,0.2,0.4,0.05,0.65,0.1,1.5
1,BEVERAGE STORES,0.0,0.0,0.1,0.0,0.0,0.0,0.1
2,BREWERY,0.0,0.0,0.1,0.0,0.0,0.05,0.15
3,CATERERS,0.25,0.25,0.25,0.25,0.25,0.25,1.5
4,CHAIN RESTAURANTS,0.0,0.1,0.4,0.65,0.0,0.0,1.15
5,ENTERTAINMENT,0.0,0.0,0.4,0.25,0.0,0.0,0.65
6,FOOD STORES,0.1,0.1,0.15,0.1,0.1,0.1,0.65
7,GENERAL STORES,0.1,0.1,0.1,0.1,0.0,0.1,0.5
8,HOTELS,0.1,0.85,0.1,0.25,0.1,0.1,1.5
9,PHARMACY STORES,0.0,0.0,0.05,0.0,0.0,0.05,0.1



Correlation values as features. This indicates the likelyhood of a buisness to have the a SKU_id under the schema_ID in stock. 

In [51]:
correlation_long = correlation_df.melt(id_vars=['POS_TYPE_focus'], var_name='schema_id', value_name='correlation')


In [57]:
merged_demand = pd.merge(encoded_demand_df, correlation_long, on=['POS_TYPE_focus'], how='left')


In [66]:
merged_demand['volume_sku'] = (
    (merged_demand['SqFt_est'] * merged_demand['correlation'])
    / (
        merged_demand['POS_Revenue_$1-2.5 MILLION'] * 100
        + merged_demand['POS_Revenue_$2.5-5 MILLION'] * 150
        + merged_demand['POS_Revenue_$5-10 MILLION'] * 200
        + merged_demand['POS_Revenue_$500,000-$1 MILLION'] * 50
        + merged_demand['POS_Revenue_LESS THAN $500,000'] * 25
    )
)

Creating the Feature Matrix 

In [80]:
merged_demand.shape

(70, 35)

In [81]:
encoded_supply_df.shape

(227, 20)

----


In [73]:
supply_features = encoded_supply_df[['schema_id','avg_stock','avg_price_x','avg_price_y', 'avg_mktshare']]

In [74]:
merged_data = pd.merge(merged_demand, supply_features, on='schema_id')

In [64]:
demand_features = merged_demand[['SqFt_est','correlation','sectorid']]

In [76]:
merged_data = pd.merge(merged_data, correlation_df, left_on='POS_TYPE_focus', right_on='POS_TYPE_focus', how='left')

In [77]:
X = pd.concat([supply_features, demand_features, merged_demand.drop(['POS_TYPE_focus'], axis=1)], axis=1)

In [78]:
target_variable = merged_demand['volume_sku']

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, target_variable, test_size=0.2, random_state=42)