In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from constants import LOGISTICS_DATA, MATERIALS_DATA, PROJECTS_DATA, SUPPLIERS_DATA

In [12]:
# Load the four raw datasets from the CSV paths imported from the constants
# module. Files are read in the order materials, logistics, projects, suppliers.
materials_df, logistics_df, projects_df, suppliers_df = (
    pd.read_csv(csv_path)
    for csv_path in (MATERIALS_DATA, LOGISTICS_DATA, PROJECTS_DATA, SUPPLIERS_DATA)
)

In [14]:
# Impute missing values in the logistics and projects data.
#
# Each column is assigned back to its DataFrame instead of calling
# `df[col].fillna(..., inplace=True)`. That chained in-place form works on a
# temporary Series: it raises a FutureWarning in pandas 2.x and no longer
# updates the DataFrame once Copy-on-Write is enabled (the default in pandas 3.0).

# 'distance_covered' and 'CO2_emission' in the logistics data: use the median
# because of the skewness demonstrated in data_ingestion.ipynb.
logistics_df['distance_covered'] = logistics_df['distance_covered'].fillna(
    logistics_df['distance_covered'].median()
)
logistics_df['CO2_emission'] = logistics_df['CO2_emission'].fillna(
    logistics_df['CO2_emission'].median()
)

# 'supplier_rating' in the logistics data: per the original note this column is
# missing entirely, so rather than dropping it, fill it with a default value --
# the median rating taken from suppliers_df.
logistics_df['supplier_rating'] = logistics_df['supplier_rating'].fillna(
    suppliers_df['supplier_rating'].median()
)

# 'project_budget' in the projects data: use the median because of the skewness
# demonstrated in data_ingestion.ipynb. This must run before the mapping below
# is built so the mapping contains the imputed budgets.
projects_df['project_budget'] = projects_df['project_budget'].fillna(
    projects_df['project_budget'].median()
)

# 'project_budget' in the logistics data: look up the corresponding value from
# the projects dataframe by project_id. zip() keeps each column's own values
# instead of routing them through a single mixed-dtype `.values` array; if a
# project_id repeats, the last row wins, as before.
budget_mapping = dict(zip(projects_df['project_id'], projects_df['project_budget']))
# NOTE(review): rows whose project_id is absent from projects_df map to NaN and
# therefore stay missing -- confirm every logistics project_id exists there.
logistics_df['project_budget'] = logistics_df['project_budget'].fillna(
    logistics_df['project_id'].map(budget_mapping)
)



In [18]:
# Sanity check: for every dataset, print its row count and the number of
# null values remaining in each column.
datasets_to_check = (
    ("Materials Data:\n", materials_df),
    ("Logistics Data:\n", logistics_df),
    ("Projects Data:\n", projects_df),
    ("Suppliers Data:\n", suppliers_df),
)
for heading, frame in datasets_to_check:
    null_counts = frame.isna().sum()  # isna() is the same check as isnull()
    print(heading, "Length:", len(frame), "\nMissing Values:\n", null_counts, "\n")

Materials Data:
 Length: 1000 
Missing Values:
 material_id          0
material_name        0
material_category    0
supplier_id          0
dtype: int64 

Logistics Data:
 Length: 10200 
Missing Values:
 transaction_id      0
project_id          0
material_id         0
transaction_date    0
quantity            0
transport_mode      0
distance_covered    0
CO2_emission        0
supplier_rating     0
project_budget      0
dtype: int64 

Projects Data:
 Length: 100 
Missing Values:
 project_id            0
project_name          0
project_start_date    0
project_end_date      0
project_location      0
project_budget        0
dtype: int64 

Suppliers Data:
 Length: 10 
Missing Values:
 supplier_id          0
supplier_name        0
supplier_location    0
supplier_rating      0
dtype: int64 

