# Task A

In [1]:
# Importing the libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Loading the datasets

supplier_data_1 = pd.read_excel('./data/supplier_data_1.xlsx')
supplier_data_2 = pd.read_excel('./data/supplier_data_2.xlsx')

In [3]:
supplier_data_1.describe()

Unnamed: 0,Nenndicke NNN.NN mm mit Dezimalpunkt,Länge,Gewicht (kg),V-Gehalt,Cu-Gehalt,Nb-Gehalt,Ti-Gehalt,Al-Gehalt,B-Gehalt,Streckgrenze,Zugfestigkeit,Dehnung
count,86.0,86.0,86.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,371.011628,604.076558,21.215837,165.916667,311.9375,166.125,382.729167,522.083333,96.4375,327.68275,162.916375,69.416667
std,156.222277,268.103303,6.14827,283.230666,275.931722,177.230542,380.304119,198.997042,191.832252,545.942139,274.606625,135.346338
min,184.0,1.007,5.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,254.5,429.5,17.9475,20.0,110.0,14.0,24.75,377.5,0.0,0.0,0.0,0.0
50%,334.0,672.5,22.07,30.0,190.0,110.0,285.0,540.0,2.0,0.0,0.0,0.0
75%,452.0,784.5,25.175,52.5,402.5,320.0,665.0,640.0,2.25,703.25,442.5,12.5
max,885.0,995.0,31.84,1160.0,850.0,471.0,1165.0,870.0,680.0,1420.0,918.0,500.0


In [4]:
# Discovering that some chemical composition columns like 'Si-Gehalt' is of object type, so we have to convert them all to numeric

supplier_data_1['Si-Gehalt']

0        NaN
1        NaN
2     0.2540
3     0.2250
4        NaN
       ...  
81        10
82        10
83        10
84    0.1870
85    0.2250
Name: Si-Gehalt, Length: 86, dtype: object

- It can be observed that there were some columns containing chemical content that weren't of numeric data type. So, we have to take care of that.

- It can also be seen that some values were really large, like 10, and some were in percentages like 0.187 and 0.2250. So, if value > 10, we considered it ppm and divivded it by 10000 for percentage. Otherwise, we left it as it is.

In [5]:
# Defining the chemical composition columns

supplier_data_1_chemical_columns = [
        'Si-Gehalt', 'Mn-Gehalt', 'P-Gehalt', 'S-Gehalt',
        'Cu-Gehalt', 'Nb-Gehalt', 'Ti-Gehalt', 'Al-Gehalt', 'B-Gehalt'
        'Cr-Gehalt', 'Ni-Gehalt', 'Mo-Gehalt', 'V-Gehalt',
    ]

In [6]:
# Function to handle PPM values in the chemical composition columns

def handle_ppm_values(series):
    
    def convert_value(val_str):
        if pd.isna(val_str):
            return val_str
        
        try:
            val = float(val_str)
            if val > 10:
                return str(val / 10000) 
            else:
                return val_str
        except (ValueError, TypeError):
            return np.nan
    
    return series.apply(convert_value)

In [7]:
# Function to clean mixed-type chemical composition columns

def clean_mixed_chemical_columns(df, chemical_columns):
    
    df_clean = df.copy()
    
    for col in chemical_columns:
        if col in df_clean.columns:
            print(f"\nProcessing column: {col}")
            
    
            original_series = df_clean[col]
            print(f"Original dtype: {original_series.dtype}")
            print(f"Unique value types: {set(type(x).__name__ for x in original_series.dropna())}")
            print(f"Sample values: {original_series.dropna().head().tolist()}")
            
            clean_series = original_series.astype(str)
            
            missing_representations = ['nan', '', 'None', 'null', 'NULL', 'NaN', 'NAN']
            clean_series = clean_series.replace(missing_representations, np.nan)
            

            mask = clean_series.notna()
            clean_series.loc[mask] = clean_series.loc[mask].str.replace(',', '.')  # German decimal
            clean_series.loc[mask] = clean_series.loc[mask].str.replace(' ', '')   # Remove spaces
            clean_series.loc[mask] = clean_series.loc[mask].str.strip()           # Remove leading/trailing spaces
            
            
            if col in ['V-Gehalt', 'Cu-Gehalt', 'Nb-Gehalt', 'Ti-Gehalt', 'Al-Gehalt', 'B-Gehalt']:
            
                clean_series = handle_ppm_values(clean_series)
            
            numeric_series = pd.to_numeric(clean_series, errors='coerce')
            
            
            suspicious_mask = (numeric_series > 100) | (numeric_series < 0)
            if suspicious_mask.any():
                print(f"Warning: {suspicious_mask.sum()} values outside normal range (0-100%)")
                print(f"Suspicious values: {numeric_series[suspicious_mask].tolist()}")
            
            
            df_clean[col] = numeric_series
            
            
            print(f"Final dtype: {df_clean[col].dtype}")
            print(f"Non-null values: {df_clean[col].notna().sum()}/{len(df_clean)}")
            print(f"Value range: {df_clean[col].min():.4f} - {df_clean[col].max():.4f}")
    
    return df_clean

In [8]:
# Cleaning the chemical composition columns in supplier_data_1

supplier_data_1_clean = clean_mixed_chemical_columns(supplier_data_1, supplier_data_1_chemical_columns)


Processing column: Si-Gehalt
Original dtype: object
Unique value types: {'str'}
Sample values: ['0.2540', '0.2250', '10', '0.2330', '0.1190']
Final dtype: float64
Non-null values: 48/86
Value range: 0.0000 - 19.0000

Processing column: Mn-Gehalt
Original dtype: object
Unique value types: {'str'}
Sample values: ['1.2780', '1.0630', 'A', '1.0290', '0.7440']
Final dtype: float64
Non-null values: 38/86
Value range: 0.0000 - 1.8850

Processing column: P-Gehalt
Original dtype: object
Unique value types: {'str'}
Sample values: ['0.0080', '0.0100', '610', '0.0090', '0.0130']
Suspicious values: [610.0, 610.0, 610.0, 610.0, 610.0, 610.0, 610.0, 610.0, 610.0, 610.0]
Final dtype: float64
Non-null values: 48/86
Value range: 0.0000 - 610.0000

Processing column: S-Gehalt
Original dtype: object
Unique value types: {'str'}
Sample values: ['0.0010', '0.0020', 'technologische Werte (WBB)', '0.0040', '0.0040']
Final dtype: float64
Non-null values: 38/86
Value range: 0.0000 - 0.0130

Processing column: C

In [9]:
# Checking for missing values in supplier_data_1

supplier_data_1.isna().sum()

Werksgüte                               20
Bestellgütentext                        16
Nenndicke NNN.NN mm mit Dezimalpunkt     0
Breite                                   0
Länge                                    0
Gewicht (kg)                             0
Cluster                                 12
Si-Gehalt                               31
Mn-Gehalt                               31
P-Gehalt                                31
S-Gehalt                                38
Cr-Gehalt                               31
Ni-Gehalt                               39
Mo-Gehalt                               38
V-Gehalt                                38
Cu-Gehalt                               38
Nb-Gehalt                               38
Ti-Gehalt                               38
Al-Gehalt                               38
B-Gehalt                                38
Streckgrenze                            38
Zugfestigkeit                           38
Dehnung                                 38
dtype: int6

- We observed that in most columns, some values were missing. We decided to remove a column if it had more than 70% values missing.

In [10]:
# Rechecking to make sure all chemical composition columns are now numeric

supplier_data_1_clean.dtypes

Werksgüte                                object
Bestellgütentext                         object
Nenndicke NNN.NN mm mit Dezimalpunkt      int64
Breite                                   object
Länge                                   float64
Gewicht (kg)                            float64
Cluster                                  object
Si-Gehalt                               float64
Mn-Gehalt                               float64
P-Gehalt                                float64
S-Gehalt                                float64
Cr-Gehalt                                object
Ni-Gehalt                               float64
Mo-Gehalt                               float64
V-Gehalt                                float64
Cu-Gehalt                               float64
Nb-Gehalt                               float64
Ti-Gehalt                               float64
Al-Gehalt                               float64
B-Gehalt                                float64
Streckgrenze                            

In [11]:
# Separating numeric and object columns for further analysis

supplier_data_1_clean_num_columns = supplier_data_1_clean.select_dtypes(include=['number']).columns
supplier_data_1_clean_obj_columns = supplier_data_1_clean.select_dtypes(include=['object']).columns

In [12]:
supplier_data_1_clean_num_columns

Index(['Nenndicke NNN.NN mm mit Dezimalpunkt', 'Länge', 'Gewicht (kg)',
       'Si-Gehalt', 'Mn-Gehalt', 'P-Gehalt', 'S-Gehalt', 'Ni-Gehalt',
       'Mo-Gehalt', 'V-Gehalt', 'Cu-Gehalt', 'Nb-Gehalt', 'Ti-Gehalt',
       'Al-Gehalt', 'B-Gehalt', 'Streckgrenze', 'Zugfestigkeit', 'Dehnung'],
      dtype='object')

In [13]:
supplier_data_1_clean_obj_columns

Index(['Werksgüte', 'Bestellgütentext', 'Breite', 'Cluster', 'Cr-Gehalt'], dtype='object')

-  We decided to impute the missing values for object dtype to `Unknown`, and use the mean for the numeric dtype.

In [14]:
# Imputing missing values in numeric columns with the mean of each column, and in object columns with 'Unknown'

supplier_data_1_clean[supplier_data_1_clean_num_columns] = supplier_data_1_clean[supplier_data_1_clean_num_columns].fillna(
    supplier_data_1_clean[supplier_data_1_clean_num_columns].mean()
)
supplier_data_1_clean[supplier_data_1_clean_obj_columns] = supplier_data_1_clean[supplier_data_1_clean_obj_columns].fillna('Unknown')

- Next, we decided to follow the English naming convention for the column names, so we created a dict to map from German to English. 

In [15]:
# Renaming columns from German to English for consistency and better understanding

german_english_column_mapping = {
    'Werksgüte': 'material_grade',
    'Bestellgütentext': 'material_specification',
    'Nenndicke NNN.NN mm mit Dezimalpunkt': 'thickness_mm',
    'Breite': 'width_mm',
    'Länge': 'length_mm',
    'Gewicht (kg)': 'weight_kg',
    'Cluster': 'material_cluster',
    'Si-Gehalt': 'silicon_content',
    'Mn-Gehalt': 'manganese_content',
    'P-Gehalt': 'phosphorus_content',
    'S-Gehalt': 'sulfur_content',
    'Cr-Gehalt': 'chromium_content',
    'Ni-Gehalt': 'nickel_content',
    'Mo-Gehalt': 'molybdenum_content',
    'V-Gehalt': 'vanadium_content',
    'Cu-Gehalt': 'copper_content',
    'Nb-Gehalt': 'niobium_content',
    'Ti-Gehalt': 'titanium_content',
    'Al-Gehalt': 'aluminum_content',
    'B-Gehalt': 'boron_content',
    'Streckgrenze': 'yield_strength',
    'Zugfestigkeit': 'tensile_strength',
    'Dehnung': 'elongation'
}

In [16]:
# Renaming the columns in the cleaned DataFrame

supplier_data_1_clean.rename(columns=german_english_column_mapping, inplace=True)

- Moving forward, we decided to use `MinMaxScaler()` to normalize all the numeric columns, as the ranges differed across them.

In [17]:
# Normalizing numeric columns to a 0-1 range using Min-Max scaling

supplier_data_1_clean = supplier_data_1_clean.apply(lambda x: pd.Series(MinMaxScaler().fit_transform(x.dropna().values.reshape(-1, 1)).flatten(), index=x.dropna().index).reindex(x.index) \
                          if x.dtype != 'object' and x.notna().sum() > 1 else x)

- Next, we decided to create a feature called `source` to identify the first supplier.

In [18]:
# Adding a source column to identify the dataset origin

supplier_data_1_clean['source'] = 'supplier_1'
supplier_data_1_clean = supplier_data_1_clean[['source'] + [col for col in supplier_data_1_clean.columns if col != 'source']]

- Now, we go through the second dataset `supplier_data_2.xlsx`.

In [19]:
# Going through supplier_data_2 to understand its structure and content

supplier_data_2.describe()

Unnamed: 0,ORDER_ID,MATERIAL_NUMBER,SURFACE_COATING,NOMINAL_THICKNESS_MM,WIDTH_MM,LENGTH_MM,HEIGHT_MM,MASS_MIN_KG,NUMBER_OF_COILS,DELIVERY_EARLIEST,DELIVERY_LATEST,BUY_NOW_EUR_PER_TON,MIN/MAX_BID_EUR_PER_TON,CO2_PER_TON_MAX_KG
count,136.0,91.0,0.0,136.0,136.0,55.0,0.0,136.0,0.0,0.0,0.0,39.0,125.0,0.0
mean,436447.602941,1.059996,,3.622728,1302.830882,2674.290909,,4349.507353,,,,618.461538,548.72,
std,214.598204,0.042435,,3.886831,253.536689,416.771794,,4303.138128,,,,20.201415,46.263202,
min,436125.0,1.0038,,0.64,812.0,1974.0,,721.0,,,,600.0,490.0,
25%,436278.75,1.0045,,0.71275,1132.25,2356.0,,2155.25,,,,600.0,520.0,
50%,436524.5,1.0873,,1.7,1385.0,2760.0,,2810.5,,,,600.0,550.0,
75%,436622.25,1.0976,,7.0465,1503.0,3008.0,,3553.5,,,,640.0,610.0,
max,436766.0,1.1191,,12.029,1676.0,3802.0,,23111.0,,,,640.0,620.0,


- Performing all the steps as we did for `supplier_data_1.xlsx`.

In [20]:
# Checking for missing values in supplier_data_2

supplier_data_2.isna().sum()

PRODUCT_TYPE                18
ORDER_ID                     0
SITE                         0
MATERIAL_NAME               20
MATERIAL_NUMBER             45
MATERIAL_QUALITY_NORM       17
SURFACE_COATING            136
DEFECT_NOTES                20
NOMINAL_THICKNESS_MM         0
WIDTH_MM                     0
LENGTH_MM                   81
HEIGHT_MM                  136
MASS_MIN_KG                  0
NUMBER_OF_COILS            136
DELIVERY_EARLIEST          136
DELIVERY_LATEST            136
INCO_TERM                   28
BUY_NOW_EUR_PER_TON         97
MIN/MAX_BID_EUR_PER_TON     11
CO2_PER_TON_MAX_KG         136
VALID_UNTIL                  0
dtype: int64

In [21]:
# Dropping columns with more than 70% missing values

supplier_2_cols_to_drop = [col for col in supplier_data_2.columns if supplier_data_2[col].isna().sum() > 0.7 * len(supplier_data_2)]

In [22]:
supplier_2_cols_to_drop

['SURFACE_COATING',
 'HEIGHT_MM',
 'NUMBER_OF_COILS',
 'DELIVERY_EARLIEST',
 'DELIVERY_LATEST',
 'BUY_NOW_EUR_PER_TON',
 'CO2_PER_TON_MAX_KG']

In [23]:
# Dropping the identified columns from supplier_data_2

supplier_data_2.drop(columns=supplier_2_cols_to_drop, inplace=True)

In [24]:
# Lowering all column names to ensure consistent naming standards

supplier_data_2_renamed_cols = [col.lower() for col in supplier_data_2.columns]
supplier_data_2.columns = supplier_data_2_renamed_cols

In [25]:
supplier_data_2

Unnamed: 0,product_type,order_id,site,material_name,material_number,material_quality_norm,defect_notes,nominal_thickness_mm,width_mm,length_mm,mass_min_kg,inco_term,min/max_bid_eur_per_ton,valid_until
0,SHEET,436765,1 company gmbh,S235JR,1.0038,DIN EN 10025,DEKL-S235JR / D2A EID,11.859,1509,3008.0,2091,FCA,,20/02/2025 11:00
1,SHEET,436754,1 company gmbh,S355MC,1.0976,DIN EN 10149,,8.057,1011,2355.0,2411,FCA,,20/02/2025 11:00
2,SHEET,436755,1 company gmbh,,1.0976,DIN EN 10149,DEKL-S355MC / D2A WEH,8.057,1010,2356.0,2251,FKA,,2025-02-20 11:00
3,SHEET,436757,1 company gmbh,S355MC,1.0976,,DEKL-S355MC / D2A WEH,8.057,1011,2356.0,2401,FCA,,2025-02-20 11:00
4,SHEET,436758,1 company gmbh,S355MC,1.0976,DN EN 10149,DEKL-S355MC / D2A WEH,8.057,1010,2356.0,2401,FCA,,2025-02-20 11:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,COIL_STRIP,436737,1 company gmbh,,1.0873,,DC06 / D2A ZUB,0.800,1385,,4580,FCA,570.0,2025-02-20 11:00
132,SHEET,436765,1 company gmbh,S235JR,1.0038,DIN EN 10025,DEKL-S235JR / D2A EID,11.859,1509,3008.0,2091,FCA,,2025-02-20 11:00
133,COIL_STRIP,436283,1 company gmbh,CR180BH,,VDA 239-100,CR180BHZM40/40-E ZM 90 MC OL / D2A VMB,0.712,1432,,3160,FCA,490.0,2025-02-20 16:00
134,SHET,436626,1 company gmbh,,1.0045,DIN EN 10025,,12.008,1507,2506.0,3541,FCA,610.0,2025-02-20 11:00


In [26]:
# Separating numeric and object columns in supplier_data_2 for further analysis

supplier_data_2_obj_cols = supplier_data_2.select_dtypes(include=['object']).columns
supplier_data_2_num_cols = supplier_data_2.select_dtypes(include=['number']).columns

In [27]:
# Imputing missing values in object columns with 'Unknown' and numeric columns with the mean of each column

supplier_data_2[supplier_data_2_obj_cols] = supplier_data_2[supplier_data_2_obj_cols].fillna('Unknown')
supplier_data_2[supplier_data_2_num_cols] = supplier_data_2[supplier_data_2_num_cols].fillna(
    supplier_data_2[supplier_data_2_num_cols].mean()
)

In [28]:
supplier_data_2

Unnamed: 0,product_type,order_id,site,material_name,material_number,material_quality_norm,defect_notes,nominal_thickness_mm,width_mm,length_mm,mass_min_kg,inco_term,min/max_bid_eur_per_ton,valid_until
0,SHEET,436765,1 company gmbh,S235JR,1.003800,DIN EN 10025,DEKL-S235JR / D2A EID,11.859,1509,3008.000000,2091,FCA,548.72,20/02/2025 11:00
1,SHEET,436754,1 company gmbh,S355MC,1.097600,DIN EN 10149,Unknown,8.057,1011,2355.000000,2411,FCA,548.72,20/02/2025 11:00
2,SHEET,436755,1 company gmbh,Unknown,1.097600,DIN EN 10149,DEKL-S355MC / D2A WEH,8.057,1010,2356.000000,2251,FKA,548.72,2025-02-20 11:00
3,SHEET,436757,1 company gmbh,S355MC,1.097600,Unknown,DEKL-S355MC / D2A WEH,8.057,1011,2356.000000,2401,FCA,548.72,2025-02-20 11:00
4,SHEET,436758,1 company gmbh,S355MC,1.097600,DN EN 10149,DEKL-S355MC / D2A WEH,8.057,1010,2356.000000,2401,FCA,548.72,2025-02-20 11:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,COIL_STRIP,436737,1 company gmbh,Unknown,1.087300,Unknown,DC06 / D2A ZUB,0.800,1385,2674.290909,4580,FCA,570.00,2025-02-20 11:00
132,SHEET,436765,1 company gmbh,S235JR,1.003800,DIN EN 10025,DEKL-S235JR / D2A EID,11.859,1509,3008.000000,2091,FCA,548.72,2025-02-20 11:00
133,COIL_STRIP,436283,1 company gmbh,CR180BH,1.059996,VDA 239-100,CR180BHZM40/40-E ZM 90 MC OL / D2A VMB,0.712,1432,2674.290909,3160,FCA,490.00,2025-02-20 16:00
134,SHET,436626,1 company gmbh,Unknown,1.004500,DIN EN 10025,Unknown,12.008,1507,2506.000000,3541,FCA,610.00,2025-02-20 11:00


In [29]:
# Normalizing numeric columns in supplier_data_2 to a 0-1 range using Min-Max scaling

supplier_data_2 = supplier_data_2.apply(lambda x: pd.Series(MinMaxScaler().fit_transform(x.dropna().values.reshape(-1, 1)).flatten(), index=x.dropna().index).reindex(x.index) \
                          if x.dtype != 'object' and x.notna().sum() > 1 else x)

In [30]:
supplier_data_2

Unnamed: 0,product_type,order_id,site,material_name,material_number,material_quality_norm,defect_notes,nominal_thickness_mm,width_mm,length_mm,mass_min_kg,inco_term,min/max_bid_eur_per_ton,valid_until
0,SHEET,0.998440,1 company gmbh,S235JR,0.000000,DIN EN 10025,DEKL-S235JR / D2A EID,0.985073,0.806713,0.565646,0.061188,FCA,0.451692,20/02/2025 11:00
1,SHEET,0.981279,1 company gmbh,S355MC,0.813530,DIN EN 10149,Unknown,0.651242,0.230324,0.208425,0.075480,FCA,0.451692,20/02/2025 11:00
2,SHEET,0.982839,1 company gmbh,Unknown,0.813530,DIN EN 10149,DEKL-S355MC / D2A WEH,0.651242,0.229167,0.208972,0.068334,FKA,0.451692,2025-02-20 11:00
3,SHEET,0.985959,1 company gmbh,S355MC,0.813530,Unknown,DEKL-S355MC / D2A WEH,0.651242,0.230324,0.208972,0.075033,FCA,0.451692,2025-02-20 11:00
4,SHEET,0.987520,1 company gmbh,S355MC,0.813530,DN EN 10149,DEKL-S355MC / D2A WEH,0.651242,0.229167,0.208972,0.075033,FCA,0.451692,2025-02-20 11:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,COIL_STRIP,0.954758,1 company gmbh,Unknown,0.724198,Unknown,DC06 / D2A ZUB,0.014049,0.663194,0.383091,0.172354,FCA,0.615385,2025-02-20 11:00
132,SHEET,0.998440,1 company gmbh,S235JR,0.000000,DIN EN 10025,DEKL-S235JR / D2A EID,0.985073,0.806713,0.565646,0.061188,FCA,0.451692,2025-02-20 11:00
133,COIL_STRIP,0.246490,1 company gmbh,CR180BH,0.487386,VDA 239-100,CR180BHZM40/40-E ZM 90 MC OL / D2A VMB,0.006322,0.717593,0.383091,0.108933,FCA,0.000000,2025-02-20 16:00
134,SHET,0.781591,1 company gmbh,Unknown,0.006071,DIN EN 10025,Unknown,0.998156,0.804398,0.291028,0.125949,FCA,0.923077,2025-02-20 11:00


- We assume the `weight_kg` in the `supplier_data_1` dataset to be the same as the `mass_min_kg` in the `supplier_data_2`.

In [31]:
# Copying and adding a source column to identify the dataset origin

supplier_data_2_clean = supplier_data_2.copy()
supplier_data_2_clean['source'] = supplier_data_2_clean['site']
supplier_data_2_clean['weight_kg'] = supplier_data_2_clean['mass_min_kg']
supplier_data_2_clean.drop(columns=['mass_min_kg', 'site'], inplace=True)
supplier_data_2_clean = supplier_data_2_clean[['source'] + [col for col in supplier_data_2_clean.columns if col != 'source']]

In [32]:
supplier_data_2_clean['weight_kg']

0      0.061188
1      0.075480
2      0.068334
3      0.075033
4      0.075033
         ...   
131    0.172354
132    0.061188
133    0.108933
134    0.125949
135    0.000000
Name: weight_kg, Length: 136, dtype: float64

- Now, we concatenate the data to get a single inventory dataset.

In [33]:
# Concatenating the cleaned datasets from both suppliers into a single inventory dataset

inventory_dataset = pd.concat([supplier_data_1_clean, supplier_data_2_clean], ignore_index=True)

In [34]:
inventory_dataset

Unnamed: 0,source,material_grade,material_specification,thickness_mm,width_mm,length_mm,weight_kg,material_cluster,silicon_content,manganese_content,...,product_type,order_id,material_name,material_number,material_quality_norm,defect_notes,nominal_thickness_mm,inco_term,min/max_bid_eur_per_ton,valid_until
0,supplier_1,G2UB5,SZBS800,0.194009,856.0,0.790743,0.416128,WB-G,0.129978,0.618582,...,,,,,,,,,,
1,supplier_1,G2UJ5,SZBS800,0.221113,918.0,0.710260,0.441613,WB-G,0.129978,0.618582,...,,,,,,,,,,
2,supplier_1,C3318,LICRO 500,0.382311,1839.0,0.300800,0.500190,WB-U,0.013368,0.677984,...,,,,,,,,,,
3,supplier_1,C3U15,S380MC mod. 4,0.496434,1160.0,0.462773,0.626132,WB-U,0.011842,0.563926,...,,,,,,,,,,
4,supplier_1,G3UB5,SZBE800,0.380884,727.2,0.559353,0.322176,Spaltband,0.129978,0.618582,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,1 company gmbh,,,,0.663194,0.383091,0.172354,,,,...,COIL_STRIP,0.954758,Unknown,0.724198,Unknown,DC06 / D2A ZUB,0.014049,FCA,0.615385,2025-02-20 11:00
218,1 company gmbh,,,,0.806713,0.565646,0.061188,,,,...,SHEET,0.998440,S235JR,0.000000,DIN EN 10025,DEKL-S235JR / D2A EID,0.985073,FCA,0.451692,2025-02-20 11:00
219,1 company gmbh,,,,0.717593,0.383091,0.108933,,,,...,COIL_STRIP,0.246490,CR180BH,0.487386,VDA 239-100,CR180BHZM40/40-E ZM 90 MC OL / D2A VMB,0.006322,FCA,0.000000,2025-02-20 16:00
220,1 company gmbh,,,,0.804398,0.291028,0.125949,,,,...,SHET,0.781591,Unknown,0.006071,DIN EN 10025,Unknown,0.998156,FCA,0.923077,2025-02-20 11:00


- Taking similar steps for `inventory_dataset`, like we did for the previous datasets, such as imputing missing values.

In [35]:
# Separating numeric and object columns in the combined inventory dataset for further analysis

inventory_dataset_num_cols = inventory_dataset.select_dtypes(include=['number']).columns
inventory_dataset_obj_cols = inventory_dataset.select_dtypes(include=['object']).columns

In [36]:
# Imputing missing values in numeric columns with the mean of each column and in object columns with 'Unknown'

inventory_dataset[inventory_dataset_num_cols] = inventory_dataset[inventory_dataset_num_cols].fillna(inventory_dataset[inventory_dataset_num_cols].mean())
inventory_dataset[inventory_dataset_obj_cols] = inventory_dataset[inventory_dataset_obj_cols].fillna('Unknown')

In [37]:
inventory_dataset_cleaned = inventory_dataset.copy()

In [38]:
# Saving the cleaned and combined inventory dataset to a CSV file

inventory_dataset_cleaned.to_csv('./inventory_dataset.csv', index=False)