In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from constants import LOGISTICS_DATA, MATERIALS_DATA, PROJECTS_DATA, SUPPLIERS_DATA

In [12]:
# Load the four source tables. Each path constant comes from the `constants`
# module imported above; the files are read in the order materials, logistics,
# projects, suppliers.
materials_df, logistics_df, projects_df, suppliers_df = (
    pd.read_csv(csv_path)
    for csv_path in (MATERIALS_DATA, LOGISTICS_DATA, PROJECTS_DATA, SUPPLIERS_DATA)
)

In [14]:
# Impute missing values in the logistics and projects tables.
#
# Every fill is assigned back to its column instead of calling
# `fillna(..., inplace=True)` on the `df[col]` selection. The in-place form is
# chained assignment: it emits a FutureWarning on pandas 2.x and leaves the
# DataFrame unchanged once Copy-on-Write is active (the default in pandas 3.0).

# 'distance_covered' and 'CO2_emission' (logistics): fill with the column
# median, chosen because of the skewness demonstrated in data_ingestion.ipynb.
logistics_df['distance_covered'] = logistics_df['distance_covered'].fillna(
    logistics_df['distance_covered'].median()
)
logistics_df['CO2_emission'] = logistics_df['CO2_emission'].fillna(
    logistics_df['CO2_emission'].median()
)

# 'supplier_rating' (logistics): noted as missing entirely in this table, so
# rather than dropping the column it is filled with a default value, namely the
# median rating taken from suppliers_df.
logistics_df['supplier_rating'] = logistics_df['supplier_rating'].fillna(
    suppliers_df['supplier_rating'].median()
)

# 'project_budget' (projects): fill with the column median, chosen because of
# the skewness demonstrated in data_ingestion.ipynb.
projects_df['project_budget'] = projects_df['project_budget'].fillna(
    projects_df['project_budget'].median()
)

# 'project_budget' (logistics): take the corresponding value from projects_df,
# looked up by project_id. This runs after the projects fill above so that the
# mapping itself holds no missing budgets. Building the dict with zip keeps the
# project_id keys in their own dtype; going through `.values` on both columns
# at once would pass them through a single array with a common dtype.
budget_mapping = dict(zip(projects_df['project_id'], projects_df['project_budget']))
logistics_df['project_budget'] = logistics_df['project_budget'].fillna(
    logistics_df['project_id'].map(budget_mapping)
)



In [18]:
# Post-imputation sanity check: for each table, print its row count and the
# number of null values remaining in every column.
for table_label, table in (
    ("Materials Data:\n", materials_df),
    ("Logistics Data:\n", logistics_df),
    ("Projects Data:\n", projects_df),
    ("Suppliers Data:\n", suppliers_df),
):
    print(table_label, "Length:", len(table), "\nMissing Values:\n", table.isnull().sum(), "\n")

Materials Data:
 Length: 1000 
Missing Values:
 material_id          0
material_name        0
material_category    0
supplier_id          0
dtype: int64 

Logistics Data:
 Length: 10200 
Missing Values:
 transaction_id      0
project_id          0
material_id         0
transaction_date    0
quantity            0
transport_mode      0
distance_covered    0
CO2_emission        0
supplier_rating     0
project_budget      0
dtype: int64 

Projects Data:
 Length: 100 
Missing Values:
 project_id            0
project_name          0
project_start_date    0
project_end_date      0
project_location      0
project_budget        0
dtype: int64 

Suppliers Data:
 Length: 10 
Missing Values:
 supplier_id          0
supplier_name        0
supplier_location    0
supplier_rating      0
dtype: int64 



In [26]:
# Attach material attributes to every logistics transaction by joining on the
# shared key 'material_id'. A left join keeps each logistics row even when no
# matching material exists.
final_df = logistics_df.merge(materials_df, how='left', on='material_id')
final_df.head()

Unnamed: 0,transaction_id,project_id,material_id,transaction_date,quantity,transport_mode,distance_covered,CO2_emission,supplier_rating,project_budget,material_name,material_category,supplier_id
0,1,32,290,2020-01-01,2,Rail,168.038612,138.936979,4.0,248950.373569,Material_290,Binder,1
1,2,62,973,2020-01-01,74,Rail,338.227738,3824.83206,4.0,96226.481803,Material_973,Structural,5
2,3,1,447,2020-01-01,54,Truck,378.67838,3683.671229,4.0,378840.848545,Material_447,Binder,5
3,4,90,49,2020-01-01,24,Truck,190.156149,1152.4874,4.0,270037.461009,Material_49,Binder,6
4,5,52,781,2020-01-01,10,Truck,410.910826,2677.852591,4.0,152404.842241,Material_781,Binder,3


In [27]:
# Add supplier details by left-joining suppliers_df on the shared key
# 'supplier_id'. Both inputs carry a 'supplier_rating' column, so pandas applies
# its default suffixes: 'supplier_rating_x' is the value from the logistics
# side and 'supplier_rating_y' is the value from suppliers_df.
# Note: final_df is rebound here, so re-running this cell on its own joins the
# supplier columns a second time.
final_df = final_df.merge(suppliers_df, how='left', on='supplier_id')
final_df.head()

Unnamed: 0,transaction_id,project_id,material_id,transaction_date,quantity,transport_mode,distance_covered,CO2_emission,supplier_rating_x,project_budget,material_name,material_category,supplier_id,supplier_name,supplier_location,supplier_rating_y
0,1,32,290,2020-01-01,2,Rail,168.038612,138.936979,4.0,248950.373569,Material_290,Binder,1,Supplier_1,City_E,2.0
1,2,62,973,2020-01-01,74,Rail,338.227738,3824.83206,4.0,96226.481803,Material_973,Structural,5,Supplier_5,City_F,4.0
2,3,1,447,2020-01-01,54,Truck,378.67838,3683.671229,4.0,378840.848545,Material_447,Binder,5,Supplier_5,City_F,4.0
3,4,90,49,2020-01-01,24,Truck,190.156149,1152.4874,4.0,270037.461009,Material_49,Binder,6,Supplier_6,City_E,1.0
4,5,52,781,2020-01-01,10,Truck,410.910826,2677.852591,4.0,152404.842241,Material_781,Binder,3,Supplier_3,City_G,5.0


In [28]:
# Add project details by left-joining projects_df onto the frame built in the
# previous cell, using the shared key 'project_id'. Both inputs carry a
# 'project_budget' column, so pandas applies its default suffixes:
# 'project_budget_x' comes from the logistics side and 'project_budget_y' from
# projects_df.
# Note: final_df is rebound here, so re-running this cell on its own joins the
# project columns a second time.
final_df = final_df.merge(projects_df, how='left', on='project_id')
final_df.head()

Unnamed: 0,transaction_id,project_id,material_id,transaction_date,quantity,transport_mode,distance_covered,CO2_emission,supplier_rating_x,project_budget_x,...,material_category,supplier_id,supplier_name,supplier_location,supplier_rating_y,project_name,project_start_date,project_end_date,project_location,project_budget_y
0,1,32,290,2020-01-01,2,Rail,168.038612,138.936979,4.0,248950.373569,...,Binder,1,Supplier_1,City_E,2.0,Project_32,2022-08-31,2023-02-28,City_D,248950.373569
1,2,62,973,2020-01-01,74,Rail,338.227738,3824.83206,4.0,96226.481803,...,Structural,5,Supplier_5,City_F,4.0,Project_62,2025-02-28,2025-08-28,City_A,96226.481803
2,3,1,447,2020-01-01,54,Truck,378.67838,3683.671229,4.0,378840.848545,...,Binder,5,Supplier_5,City_F,4.0,Project_1,2020-01-31,2020-07-31,City_D,378840.848545
3,4,90,49,2020-01-01,24,Truck,190.156149,1152.4874,4.0,270037.461009,...,Binder,6,Supplier_6,City_E,1.0,Project_90,2027-06-30,2027-12-30,City_D,270037.461009
4,5,52,781,2020-01-01,10,Truck,410.910826,2677.852591,4.0,152404.842241,...,Binder,3,Supplier_3,City_G,5.0,Project_52,2024-04-30,2024-10-30,City_A,152404.842241
