# Task 1. Data Cleaning & Preparation

Importing appropriate `Python` packages.

In [296]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Loading the datasets

In [297]:
dataset1_file = "supplier_data_1.xlsx"
dataset2_file = "supplier_data_2.xlsx"

df1 = pd.read_excel(dataset1_file)
df2 = pd.read_excel(dataset2_file)

In [298]:
df1.rename(columns={
    'Werksgüte': 'Quality of Material',
    'Bestellgütentext': 'Order Grade Text',
    'Nenndicke NNN.NN mm mit Dezimalpunkt': 'Nominal Thickness (mm with decimal point)',
    'Breite': 'Width',
    'Länge': 'Length',
    'Gewicht (kg)': 'Weight (kg)',
    'Cluster': 'Cluster',
    'Si-Gehalt': 'Silicon Content',
    'Mn-Gehalt': 'Manganese Content',
    'P-Gehalt': 'Phosphorus Content',
    'S-Gehalt': 'Sulfur Content',
    'Cr-Gehalt': 'Chromium Content',
    'Ni-Gehalt': 'Nickel Content',
    'Mo-Gehalt': 'Molybdenum Content',
    'V-Gehalt': 'Vanadium Content',
    'Cu-Gehalt': 'Copper Content',
    'Nb-Gehalt': 'Niobium Content',
    'Ti-Gehalt': 'Titanium Content',
    'Al-Gehalt': 'Aluminum Content',
    'B-Gehalt': 'Boron Content',
    'Streckgrenze': 'Yield Strength',
    'Zugfestigkeit': 'Tensile Strength',
    'Dehnung': 'Elongation'
}, inplace=True)

### Data Exploration

##### supplier_data_1.xlsx

In [299]:
df1.head()

Unnamed: 0,Quality of Material,Order Grade Text,Nominal Thickness (mm with decimal point),Width,Length,Weight (kg),Cluster,Silicon Content,Manganese Content,Phosphorus Content,...,Molybdenum Content,Vanadium Content,Copper Content,Niobium Content,Titanium Content,Aluminum Content,Boron Content,Yield Strength,Tensile Strength,Elongation
0,G2UB5,SZBS800,320,856.0,787.0,16.49,WB-G,,,,...,,,,,,,,,,
1,G2UJ5,SZBS800,339,918.0,707.0,17.16,WB-G,,,,...,,,,,,,,,,
2,C3318,LICRO 500,452,1839.0,300.0,18.7,WB-U,0.254,1.278,0.008,...,0.009,20.0,290.0,12.0,320.0,320.0,25.0,0.0,0.0,0.0
3,C3U15,S380MC mod. 4,532,1160.0,461.0,22.011,WB-U,0.225,1.063,0.01,...,0.006,30.0,150.0,320.0,989.0,430.0,1.0,0.0,0.0,0.0
4,G3UB5,SZBE800,451,727.2,557.0,14.02,Spaltband,,,,...,,,,,,,,,,


In [300]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 23 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Quality of Material                        66 non-null     object 
 1   Order Grade Text                           70 non-null     object 
 2   Nominal Thickness (mm with decimal point)  86 non-null     int64  
 3   Width                                      86 non-null     object 
 4   Length                                     86 non-null     float64
 5   Weight (kg)                                86 non-null     float64
 6   Cluster                                    74 non-null     object 
 7   Silicon Content                            55 non-null     object 
 8   Manganese Content                          55 non-null     object 
 9   Phosphorus Content                         55 non-null     object 
 10  Sulfur Content              

This shows that there are multiple missing values in the dataset for many columns.

### Data Cleaning

Finding missing values for each column

In [301]:
df1.isnull().sum()

Quality of Material                          20
Order Grade Text                             16
Nominal Thickness (mm with decimal point)     0
Width                                         0
Length                                        0
Weight (kg)                                   0
Cluster                                      12
Silicon Content                              31
Manganese Content                            31
Phosphorus Content                           31
Sulfur Content                               38
Chromium Content                             31
Nickel Content                               39
Molybdenum Content                           38
Vanadium Content                             38
Copper Content                               38
Niobium Content                              38
Titanium Content                             38
Aluminum Content                             38
Boron Content                                38
Yield Strength                          

Let's start handling missing values for each column one by one or couple.

In [302]:
df1[["Quality of Material", "Order Grade Text"]].value_counts()

Quality of Material  Order Grade Text
G2UB5                HR660Y760T-CP       9
G2UJ5                SZBS800             6
C27L5                50CrMo4             3
G2UB5                2A Lager            3
C4LF5                42CrMo4             2
C2225                DP600-MW06          2
C3U15                S380MC mod. 4       2
C2WS8                C2WS8               2
G2UB5                SZBS800             2
G2WD8                D7G                 2
X2UJ5                SZBS800             2
C27W8                RobuSal®800         1
C2UG8                CR330Y590T-DP       1
C27N5                51CrV4              1
C1F28                CR300LA             1
C1DE8                CR340LA             1
CBWW5                HC460LA             1
C3LZ5                67CrNiMo33          1
C37N5                51CrV4              1
C3LF5                42CrMo4             1
C3318                LICRO 500           1
C27W8                DBL 4525.30         1
G2UB5           

In [303]:
df1[["Quality of Material", "Order Grade Text"]].head(5)

Unnamed: 0,Quality of Material,Order Grade Text
0,G2UB5,SZBS800
1,G2UJ5,SZBS800
2,C3318,LICRO 500
3,C3U15,S380MC mod. 4
4,G3UB5,SZBE800


As both columns are related, it is essential to impute the missing values based on frequest vice-versa values. The main reason behind imputing this way is because they are correlated though they are categorical features.

In [304]:
# Step 1: Impute missing 'Werksgüte' based on 'Bestellgütentext'
df1['Quality of Material'] = df1.groupby('Order Grade Text')['Quality of Material'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Others'))

# Step 2: Impute missing 'Bestellgütentext' based on 'Werksgüte'
df1['Order Grade Text'] = df1.groupby('Quality of Material')['Order Grade Text'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Others'))

# Step 3: Handle rows where both 'Werksgüte' and 'Bestellgütentext' are missing
df1['Quality of Material'].fillna('Others', inplace=True)
df1['Order Grade Text'].fillna('Others', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['Quality of Material'].fillna('Others', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['Order Grade Text'].fillna('Others', inplace=True)


In [305]:
df1[["Quality of Material", "Order Grade Text"]].head(5)

Unnamed: 0,Quality of Material,Order Grade Text
0,G2UB5,SZBS800
1,G2UJ5,SZBS800
2,C3318,LICRO 500
3,C3U15,S380MC mod. 4
4,G3UB5,SZBE800


In [41]:
df1["Cluster"].values

array(['WB-G', 'WB-G', 'WB-U', 'WB-U', 'Spaltband', 'Spaltband',
       'ELO verzinkt', 'WB-G', nan, nan, 'WB-U', 'WB-U', nan, 'WB-U',
       'WB-U', nan, 'WB-U', 'WB-U', 'WB-U', 'WB-U', 'WB-U', 'WB-U',
       'WB-U', 'WBU', 'WBU', nan, 'WB-U', 'WB-U', 'WB-U', 'WB-U', 'WB-U',
       'WB-U', 'WB-U', 'WB-U', 'WB-U', 'WB-U', 'WB-U', 'WB-U', 'WB-U',
       'WB-U', nan, 'WB-U', 'WB-G', 'WB-G', 'WB-G', nan, 'WB-G', 'WB-G',
       'WB-G', nan, nan, 'WB-G', 'WB-G', 'WB-G', 'WB-G', 'WB-G', 'WB-G',
       'WB-G', 'WB-G', 'WB-G', 'WB-G', 'WB-G', 'WB-G', 'WB-G', 'WB-G',
       nan, 'WB-G', 'WB-G', nan, nan, 'Kaltfeinblech', 'Kaltfeinblech',
       'Kaltfeinblech', 'Kaltfeinblech', 'Kaltfeinblech', 'ELO verzinkt',
       'ELO verzinkt', 'ELO verzinkt', 'ELO verzinkt', 'ELO verzinkt',
       'ELO verzinkt', 'ELO verzinkt', 'ELO verzinkt', 'ELO verzinkt',
       'WB-U', 'WB-U'], dtype=object)

In [306]:
df1[["Quality of Material", "Order Grade Text", "Cluster"]].values

array([['G2UB5', 'SZBS800', 'WB-G'],
       ['G2UJ5', 'SZBS800', 'WB-G'],
       ['C3318', 'LICRO 500', 'WB-U'],
       ['C3U15', 'S380MC mod. 4', 'WB-U'],
       ['G3UB5', 'SZBE800', 'Spaltband'],
       ['G2UB5', 'SZBS800', 'Spaltband'],
       ['G2UB5', '2A Lager', 'ELO verzinkt'],
       ['C27L5', '50CrMo4', 'WB-G'],
       ['C2WS8', 'C2WS8', nan],
       ['C1F28', 'CR300LA', nan],
       ['Others', 'Others', 'WB-U'],
       ['C2WS8', 'C2WS8', 'WB-U'],
       ['Others', 'Others', nan],
       ['G3K88', 'G3K88', 'WB-U'],
       ['Others', 'Others', 'WB-U'],
       ['Others', 'G1BX5', nan],
       ['G37M5', '58CrV4', 'WB-U'],
       ['C2UG8', 'CR330Y590T-DP', 'WB-U'],
       ['G2UB5', 'HR660Y760T-CP', 'WB-U'],
       ['G2UB5', 'HR660Y760T-CP', 'WB-U'],
       ['G2UB5', 'HR660Y760T-CP', 'WB-U'],
       ['G2UB5', 'HR660Y760T-CP', 'WB-U'],
       ['G2UB5', 'HR660Y760T-CP', 'WB-U'],
       ['G2UB5', 'HR660Y760T-CP', 'WBU'],
       ['G2WD8', 'D7G', 'WBU'],
       ['C4LF5', '42CrMo4', nan]

In [49]:
df1["Cluster"].value_counts()

Cluster
WB-U             32
WB-G             25
ELO verzinkt     10
Kaltfeinblech     5
Spaltband         2
Name: count, dtype: int64

In [48]:
df1['Cluster'] = df1['Cluster'].replace('WBU', 'WB-U')

Imputing the values of 'Cluster' based on 'Quality of Material' and 'Order Grade Text' as they are related and having patterns.

In [307]:
# Impute missing 'Cluster' values based on the combination of 'Werksgüte' and 'Bestellgütentext'
df1['Cluster'] = df1.groupby(['Quality of Material', 'Order Grade Text'])['Cluster'].transform(
    lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'WB-U')
)

df1['Cluster'] = df1['Cluster'].fillna('WB-U')

In [308]:
df1["Cluster"].value_counts()

Cluster
WB-U             35
WB-G             30
ELO verzinkt     10
Kaltfeinblech     7
Spaltband         2
WBU               2
Name: count, dtype: int64

In [309]:
df1["Silicon Content"].values

array([nan, nan, '0.2540', '0.2250', nan, nan, '10', nan, '0.2330',
       '0.1190', '0', '0.2650', '0.2920', '0.2520', '0.2280', '0.0700',
       '0.2200', '0.2490', '0.4630', '0.4470', '0.4470', '0.4470',
       '0.4470', '0.4470', '0.2620', '0.1640', '0.2580', '0.2350',
       '0.1870', '0.1870', '0.1870', '0.1870', '0.2450', '0.1980',
       '0.2540', '0.2490', '0.0590', '0.4370', '0.4370', '0.0120',
       '0.2760', '0.1890', nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, 'Rost flächig (UNB)',
       'Auftragsstreichung (VRK)', 'Rost flächig (DR3)',
       'lange Schale (TS)', 'Dickenschwankungen (TAN)',
       'Dickenschw. wegen Stop Tandemstr. (TAN)',
       'Dickenschwankungen (TAN)', '19', '10', '10', '10', '10', '10',
       '10', '10', '10', '0.1870', '0.2250'], dtype=object)

'Silicon Content' is a numerical column though there were strings like values found which are fixed using to_numeric() function.

In [310]:
df1['Silicon Content'] = pd.to_numeric(df1['Silicon Content'], errors='coerce')

In [311]:
df1[["Cluster", "Silicon Content"]].values

array([['WB-G', nan],
       ['WB-G', nan],
       ['WB-U', 0.254],
       ['WB-U', 0.225],
       ['Spaltband', nan],
       ['Spaltband', nan],
       ['ELO verzinkt', 10.0],
       ['WB-G', nan],
       ['WB-U', 0.233],
       ['WB-U', 0.119],
       ['WB-U', 0.0],
       ['WB-U', 0.265],
       ['WB-G', 0.292],
       ['WB-U', 0.252],
       ['WB-U', 0.228],
       ['WB-U', 0.07],
       ['WB-U', 0.22],
       ['WB-U', 0.249],
       ['WB-U', 0.463],
       ['WB-U', 0.447],
       ['WB-U', 0.447],
       ['WB-U', 0.447],
       ['WB-U', 0.447],
       ['WBU', 0.447],
       ['WBU', 0.262],
       ['WB-U', 0.164],
       ['WB-U', 0.258],
       ['WB-U', 0.235],
       ['WB-U', 0.187],
       ['WB-U', 0.187],
       ['WB-U', 0.187],
       ['WB-U', 0.187],
       ['WB-U', 0.245],
       ['WB-U', 0.198],
       ['WB-U', 0.254],
       ['WB-U', 0.249],
       ['WB-U', 0.059],
       ['WB-U', 0.437],
       ['WB-U', 0.437],
       ['WB-U', 0.012],
       ['WB-G', 0.276],
       ['WB-U',

In [314]:
df1["Silicon Content"].isnull().sum()

np.int64(38)

Because Cluster is a generalised feature now, computed and fixed over the first two columns, it will be used to impute values of further numerical columns.

In [315]:
df1['Silicon Content'] = df1.groupby('Cluster')['Silicon Content'].transform(lambda x: x.fillna(x.median()))

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [316]:
df1[df1["Silicon Content"].isnull()]["Cluster"]

4         Spaltband
5         Spaltband
68    Kaltfeinblech
69    Kaltfeinblech
70    Kaltfeinblech
71    Kaltfeinblech
72    Kaltfeinblech
73    Kaltfeinblech
74    Kaltfeinblech
Name: Cluster, dtype: object

There is a pattern between Cluster and Silicon Content and also, it is essential to do so otherwise imputing the values for such a small dataset based on overall median or mean would be misleading. Median is an ideal choice as mean is susceptible to outliers.

In [317]:
# Fill NaNs in 'Si-Gehalt' with the overall median
overall_median = df1['Silicon Content'].median()
df1['Silicon Content'] = df1.groupby('Cluster')['Silicon Content'].transform(lambda x: x.fillna(overall_median))

In [318]:
df1[["Cluster", "Manganese Content"]].values

array([['WB-G', nan],
       ['WB-G', nan],
       ['WB-U', '1.2780'],
       ['WB-U', '1.0630'],
       ['Spaltband', nan],
       ['Spaltband', nan],
       ['ELO verzinkt', 'A'],
       ['WB-G', nan],
       ['WB-U', '1.0290'],
       ['WB-U', '0.7440'],
       ['WB-U', '0'],
       ['WB-U', '1.0490'],
       ['WB-G', '0.3230'],
       ['WB-U', '0.9020'],
       ['WB-U', '1.3560'],
       ['WB-U', '1.1410'],
       ['WB-U', '0.8540'],
       ['WB-U', '1.2820'],
       ['WB-U', '1.8620'],
       ['WB-U', '1.8760'],
       ['WB-U', '1.8760'],
       ['WB-U', '1.8760'],
       ['WB-U', '1.8760'],
       ['WBU', '1.8760'],
       ['WBU', '1.8580'],
       ['WB-U', '0.6700'],
       ['WB-U', '1.8220'],
       ['WB-U', '1.8620'],
       ['WB-U', '0.6840'],
       ['WB-U', '0.6840'],
       ['WB-U', '0.6840'],
       ['WB-U', '0.6840'],
       ['WB-U', '0.4180'],
       ['WB-U', '0.2470'],
       ['WB-U', '1.8220'],
       ['WB-U', '1.8390'],
       ['WB-U', '0.9200'],
       ['WB-U', '1.8

In [319]:
df1[["Cluster", "Yield Strength", "Tensile Strength", "Elongation"]]

Unnamed: 0,Cluster,Yield Strength,Tensile Strength,Elongation
0,WB-G,,,
1,WB-G,,,
2,WB-U,0.000,0.00,0.0
3,WB-U,0.000,0.00,0.0
4,Spaltband,,,
...,...,...,...,...
81,ELO verzinkt,1420.000,456.00,500.0
82,ELO verzinkt,1370.000,457.00,270.0
83,ELO verzinkt,1350.000,459.00,320.0
84,WB-U,1.297,1.38,9.0


Following numerical columns also contains missing values, the generalised column named 'Cluster' will be used to imputer their values based on context.

In [320]:
df1.columns

Index(['Quality of Material', 'Order Grade Text',
       'Nominal Thickness (mm with decimal point)', 'Width', 'Length',
       'Weight (kg)', 'Cluster', 'Silicon Content', 'Manganese Content',
       'Phosphorus Content', 'Sulfur Content', 'Chromium Content',
       'Nickel Content', 'Molybdenum Content', 'Vanadium Content',
       'Copper Content', 'Niobium Content', 'Titanium Content',
       'Aluminum Content', 'Boron Content', 'Yield Strength',
       'Tensile Strength', 'Elongation'],
      dtype='object')

In [321]:
def fill_missing_gehalt(df, columns, cluster_column='Cluster'):
    """
    Fills missing values in specified columns based on the median of each cluster first,
    and if any values are still missing, fills with the overall median of the column.
    
    Parameters:
    - df: The pandas DataFrame
    - columns: List of column names (like ['Si-Gehalt', 'Mn-Gehalt', ...])
    - cluster_column: The column used for grouping (default is 'Cluster')
    """
    for col in columns:

        # actual numeric columns which are non-numeric by mistake, convert them to numeric
        df[col] = pd.to_numeric(df[col], errors='coerce')
        
        # First, fill NaN values with the median of each group (cluster)
        df[col] = df.groupby(cluster_column)[col].transform(lambda x: x.fillna(x.median()))
        
        # If any NaNs still remain, fill them with the overall median of the column
        overall_median = df[col].median()
        df[col] = df[col].fillna(overall_median)
    
    return df

# List of columns you mentioned
gehalt_columns = ['Manganese Content',
       'Phosphorus Content', 'Sulfur Content', 'Chromium Content',
       'Nickel Content', 'Molybdenum Content', 'Vanadium Content',
       'Copper Content', 'Niobium Content', 'Titanium Content',
       'Aluminum Content', 'Boron Content', 'Yield Strength',
       'Tensile Strength', 'Elongation']

df1[gehalt_columns] = df1[gehalt_columns].apply(lambda col: fill_missing_gehalt(df1, [col.name], cluster_column='Cluster')[col.name], axis=0)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

In [322]:
df1.isnull().sum()

Quality of Material                          0
Order Grade Text                             0
Nominal Thickness (mm with decimal point)    0
Width                                        0
Length                                       0
Weight (kg)                                  0
Cluster                                      0
Silicon Content                              0
Manganese Content                            0
Phosphorus Content                           0
Sulfur Content                               0
Chromium Content                             0
Nickel Content                               0
Molybdenum Content                           0
Vanadium Content                             0
Copper Content                               0
Niobium Content                              0
Titanium Content                             0
Aluminum Content                             0
Boron Content                                0
Yield Strength                               0
Tensile Stren

Now, the dataset is cleaned. Let's explore it again to see the updated version.

In [323]:
df1.head()

Unnamed: 0,Quality of Material,Order Grade Text,Nominal Thickness (mm with decimal point),Width,Length,Weight (kg),Cluster,Silicon Content,Manganese Content,Phosphorus Content,...,Molybdenum Content,Vanadium Content,Copper Content,Niobium Content,Titanium Content,Aluminum Content,Boron Content,Yield Strength,Tensile Strength,Elongation
0,G2UB5,SZBS800,320,856.0,787.0,16.49,WB-G,0.284,0.365,0.0105,...,0.107,10.0,165.0,9.0,36.0,430.0,1.5,400.0,0.5205,7.0
1,G2UJ5,SZBS800,339,918.0,707.0,17.16,WB-G,0.284,0.365,0.0105,...,0.107,10.0,165.0,9.0,36.0,430.0,1.5,400.0,0.5205,7.0
2,C3318,LICRO 500,452,1839.0,300.0,18.7,WB-U,0.254,1.278,0.008,...,0.009,20.0,290.0,12.0,320.0,320.0,25.0,0.0,0.0,0.0
3,C3U15,S380MC mod. 4,532,1160.0,461.0,22.011,WB-U,0.225,1.063,0.01,...,0.006,30.0,150.0,320.0,989.0,430.0,1.0,0.0,0.0,0.0
4,G3UB5,SZBE800,451,727.2,557.0,14.02,Spaltband,0.284,0.643,0.0105,...,0.107,20.0,165.0,14.0,36.0,430.0,1.5,400.0,0.5205,7.0


### Data Exploration
##### supplier_data_2.xlsx

In [154]:
df2 = pd.read_excel(dataset2_file)
df2.head()

Unnamed: 0,PRODUCT_TYPE,ORDER_ID,SITE,MATERIAL_NAME,MATERIAL_NUMBER,MATERIAL_QUALITY_NORM,SURFACE_COATING,DEFECT_NOTES,NOMINAL_THICKNESS_MM,WIDTH_MM,...,HEIGHT_MM,MASS_MIN_KG,NUMBER_OF_COILS,DELIVERY_EARLIEST,DELIVERY_LATEST,INCO_TERM,BUY_NOW_EUR_PER_TON,MIN/MAX_BID_EUR_PER_TON,CO2_PER_TON_MAX_KG,VALID_UNTIL
0,SHEET,436765,1 company gmbh,S235JR,1.0038,DIN EN 10025,,DEKL-S235JR / D2A EID,11.859,1509,...,,2091,,,,FCA,600.0,,,20/02/2025 11:00
1,SHEET,436754,1 company gmbh,S355MC,1.0976,DIN EN 10149,,,8.057,1011,...,,2411,,,,FCA,600.0,,,20/02/2025 11:00
2,SHEET,436755,1 company gmbh,,1.0976,DIN EN 10149,,DEKL-S355MC / D2A WEH,8.057,1010,...,,2251,,,,FKA,600.0,,,2025-02-20 11:00
3,SHEET,436757,1 company gmbh,S355MC,1.0976,,,DEKL-S355MC / D2A WEH,8.057,1011,...,,2401,,,,FCA,600.0,,,2025-02-20 11:00
4,SHEET,436758,1 company gmbh,S355MC,1.0976,DN EN 10149,,DEKL-S355MC / D2A WEH,8.057,1010,...,,2401,,,,FCA,600.0,,,2025-02-20 11:00


Similarly, to first dataset, this dataset also contains several missing values, with few columns corrupted completely.

In [155]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PRODUCT_TYPE             118 non-null    object 
 1   ORDER_ID                 136 non-null    int64  
 2   SITE                     136 non-null    object 
 3   MATERIAL_NAME            116 non-null    object 
 4   MATERIAL_NUMBER          91 non-null     float64
 5   MATERIAL_QUALITY_NORM    119 non-null    object 
 6   SURFACE_COATING          0 non-null      float64
 7   DEFECT_NOTES             116 non-null    object 
 8   NOMINAL_THICKNESS_MM     136 non-null    float64
 9   WIDTH_MM                 136 non-null    int64  
 10  LENGTH_MM                55 non-null     float64
 11  HEIGHT_MM                0 non-null      float64
 12  MASS_MIN_KG              136 non-null    int64  
 13  NUMBER_OF_COILS          0 non-null      float64
 14  DELIVERY_EARLIEST        0

### Data Cleaning

As I cleaned the dataset 1 column by column, I will approach the task for this dataset in a similar fashion.

In [156]:
df2["PRODUCT_TYPE"].value_counts()

PRODUCT_TYPE
COIL_STRIP     63
SHEET          43
SHET            6
COILS_STRIP     6
Name: count, dtype: int64

Replacing mispelled values.

In [157]:
product_type_mapping = {
    'COIL_STRIP': 'COIL_STRIP',
    'COILS_STRIP': 'COIL_STRIP',
    'SHEET': 'SHEET',
    'SHET': 'SHEET'
}

df2['PRODUCT_TYPE'] = df2['PRODUCT_TYPE'].map(product_type_mapping)
df2["PRODUCT_TYPE"].value_counts()

PRODUCT_TYPE
COIL_STRIP    69
SHEET         49
Name: count, dtype: int64

I found these features correlated as they are having pattern.

In [158]:
df2[["PRODUCT_TYPE", "MATERIAL_NUMBER", "MATERIAL_NAME"]].values

array([['SHEET', 1.0038, 'S235JR'],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0976, nan],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0045, 'S355JR'],
       ['SHEET', 1.0976, nan],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0976, 'S355MC'],
       ['COIL_STRIP', 1.0873, nan],
       ['COIL_STRIP', 1.0873, 'DC06'],
       ['COIL_STRIP', 1.0873, 'DC06'],
       ['COIL_STRIP', 1.0873, 'DC06'],
       [nan, 1.0873, 'DC06'],
       ['COIL_STRIP', 1.033, nan],
       ['COIL_STRIP', 1.0873, 'DC06'],
       ['COIL_STRIP', 1.0873, 'DC06'],
       [nan, 1.0873, 'DC06'],
       ['SHEET', 1.0982, 'S460MC'],
       [nan, 1.0976, 'S355MC'],
       ['SHEET', 1.0038, 'S235JR'],
       ['SHEET', 1.0038, 'S235JR'],
       ['SHEET', 1.0038, 'S235JR'],
       ['SHEET', 1.0038, 'S235JR'],
       ['SHEET', 1.0038, 'S23SJR'],
       ['SHEET', 1.0045, 'S355JR'],
       ['SHEET', 1.0045, nan],
       ['

In [159]:
df2[["PRODUCT_TYPE", "MATERIAL_NUMBER"]].value_counts()

PRODUCT_TYPE  MATERIAL_NUMBER
COIL_STRIP    1.0943             13
SHEET         1.0038             13
              1.0045             12
              1.0976              8
COIL_STRIP    1.0873              7
SHEET         1.0984              4
              1.0982              3
              1.0980              3
              1.0503              2
COIL_STRIP    1.0306              2
              1.0914              2
SHEET         1.1191              2
COIL_STRIP    1.0226              1
              1.0395              1
              1.0350              1
              1.0330              1
SHEET         1.0060              1
              1.0332              1
Name: count, dtype: int64

Checking appropriate similar columns to see the pattern and hard-encoding the missing values based on maximum / frequest class.

Due to time limitations (sickness), I have used hard-encoding here, I could do in a more generalised function.

In [160]:
df2[df2["MATERIAL_NUMBER"] == 1.0226]["PRODUCT_TYPE"]

96            NaN
119    COIL_STRIP
Name: PRODUCT_TYPE, dtype: object

Based on more frequent class, impute the values accordingly.

In [161]:
df2.loc[(df2["MATERIAL_NUMBER"] == 1.0943) & (df2["PRODUCT_TYPE"].isna()), "PRODUCT_TYPE"] = "COIL_STRIP"
df2.loc[(df2["MATERIAL_NUMBER"] == 1.0038) & (df2["PRODUCT_TYPE"].isna()), "PRODUCT_TYPE"] = "SHEET"
df2.loc[(df2["MATERIAL_NUMBER"] == 1.0976) & (df2["PRODUCT_TYPE"].isna()), "PRODUCT_TYPE"] = "SHEET"
df2.loc[(df2["MATERIAL_NUMBER"] == 1.0873) & (df2["PRODUCT_TYPE"].isna()), "PRODUCT_TYPE"] = "COIL_STRIP"
df2.loc[(df2["MATERIAL_NUMBER"] == 1.0984) & (df2["PRODUCT_TYPE"].isna()), "PRODUCT_TYPE"] = "SHEET"
df2.loc[(df2["MATERIAL_NUMBER"] == 1.0980) & (df2["PRODUCT_TYPE"].isna()), "PRODUCT_TYPE"] = "SHEET"
df2.loc[(df2["MATERIAL_NUMBER"] == 1.0226) & (df2["PRODUCT_TYPE"].isna()), "PRODUCT_TYPE"] = "COIL_STRIP"

In [162]:
df2[df2["PRODUCT_TYPE"].isnull()][["MATERIAL_NUMBER", "MATERIAL_NAME"]]

Unnamed: 0,MATERIAL_NUMBER,MATERIAL_NAME
80,,CR3
95,1.0919,HX220BD
102,1.0917,DX51D
106,,
117,,
122,1.0355,DX53D
128,,DIVERSE


In [163]:
df2[["MATERIAL_NAME", "PRODUCT_TYPE"]].value_counts()

MATERIAL_NAME  PRODUCT_TYPE
HCT780X        COIL_STRIP      12
S235JR         SHEET           11
S355MC         SHEET            8
S355JR         SHEET            8
CR4            COIL_STRIP       8
DC06           COIL_STRIP       7
CR380LA        COIL_STRIP       7
CR180BH        COIL_STRIP       7
S500MC         SHEET            5
CR5            COIL_STRIP       5
S23SJR         SHEET            4
CR210BH        COIL_STRIP       4
S460MC         SHEET            3
S420MC         SHEET            3
C45            SHEET            2
CR3            COIL_STRIP       2
CR270BH        COIL_STRIP       2
C45E           SHEET            2
DX51D          COIL_STRIP       2
HX180BD        COIL_STRIP       2
S35SJR         SHEET            2
CR420LA        COIL_STRIP       1
E335           SHEET            1
HC180B         COIL_STRIP       1
DD11           SHEET            1
DX54D          COIL_STRIP       1
Name: count, dtype: int64

In [164]:
df2[df2["MATERIAL_NAME"] == "DIVERSE"]["PRODUCT_TYPE"]

128    NaN
Name: PRODUCT_TYPE, dtype: object

##### Finding the pattern based on textual similarity between the values and their appropriate features.

idx: 80	: CR3 -> COIL_STRIP

idx: 95: HX220BD -> HC or HX or similar -> COIL_STRIP

idx: 102: DX51D -> COIL_STRIP

idx: 122: DX53D -> DX -> COIL_STRIP

DIVERSE -> Frequent

idx: 106: 117: 128: Freuqent category.

In [165]:
# Replace missing values at specific indices with appropriate categories
df2.loc[80, "PRODUCT_TYPE"] = "COIL_STRIP"
df2.loc[95, "PRODUCT_TYPE"] = "COIL_STRIP"
df2.loc[102, "PRODUCT_TYPE"] = "COIL_STRIP"
df2.loc[122, "PRODUCT_TYPE"] = "COIL_STRIP"

# Specific indices to Frequent category
most_frequent_value = df2["PRODUCT_TYPE"].mode()[0]
df2.loc[106, "PRODUCT_TYPE"] = most_frequent_value
df2.loc[117, "PRODUCT_TYPE"] = most_frequent_value
df2.loc[128, "PRODUCT_TYPE"] = most_frequent_value

# Verify if the missing values are updated correctly
print(df2[df2["PRODUCT_TYPE"].isna()])

Empty DataFrame
Columns: [PRODUCT_TYPE, ORDER_ID, SITE, MATERIAL_NAME, MATERIAL_NUMBER, MATERIAL_QUALITY_NORM, SURFACE_COATING, DEFECT_NOTES, NOMINAL_THICKNESS_MM, WIDTH_MM, LENGTH_MM, HEIGHT_MM, MASS_MIN_KG, NUMBER_OF_COILS, DELIVERY_EARLIEST, DELIVERY_LATEST, INCO_TERM, BUY_NOW_EUR_PER_TON, MIN/MAX_BID_EUR_PER_TON, CO2_PER_TON_MAX_KG, VALID_UNTIL]
Index: []

[0 rows x 21 columns]


In [166]:
df2[["PRODUCT_TYPE", "MATERIAL_NUMBER", "MATERIAL_NAME"]].values

array([['SHEET', 1.0038, 'S235JR'],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0976, nan],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0045, 'S355JR'],
       ['SHEET', 1.0976, nan],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0976, 'S355MC'],
       ['COIL_STRIP', 1.0873, nan],
       ['COIL_STRIP', 1.0873, 'DC06'],
       ['COIL_STRIP', 1.0873, 'DC06'],
       ['COIL_STRIP', 1.0873, 'DC06'],
       ['COIL_STRIP', 1.0873, 'DC06'],
       ['COIL_STRIP', 1.033, nan],
       ['COIL_STRIP', 1.0873, 'DC06'],
       ['COIL_STRIP', 1.0873, 'DC06'],
       ['COIL_STRIP', 1.0873, 'DC06'],
       ['SHEET', 1.0982, 'S460MC'],
       ['SHEET', 1.0976, 'S355MC'],
       ['SHEET', 1.0038, 'S235JR'],
       ['SHEET', 1.0038, 'S235JR'],
       ['SHEET', 1.0038, 'S235JR'],
       ['SHEET', 1.0038, 'S235JR'],
       ['SHEET', 1.0038, 'S23SJR'],
       ['SHEET', 1.0045, 'S355JR'],
       ['SHEET', 1

In [167]:
# Step 1: Fill missing values in 'MATERIAL_NUMBER' by taking the median within each 'PRODUCT_TYPE'
df2["MATERIAL_NUMBER"] = df2.groupby("PRODUCT_TYPE")["MATERIAL_NUMBER"].transform(lambda x: x.fillna(x.median()))

# Step 2: Fill missing values in 'MATERIAL_NAME' by taking the mode (most frequent value) within each 'PRODUCT_TYPE'
df2["MATERIAL_NAME"] = df2.groupby("PRODUCT_TYPE")["MATERIAL_NAME"].transform(lambda x: x.fillna(x.mode()[0]))

Deleting the columns which are completely null.

In [168]:
df2 = df2.drop(columns = ["SURFACE_COATING", "HEIGHT_MM", "NUMBER_OF_COILS", "DELIVERY_EARLIEST", "DELIVERY_LATEST", "CO2_PER_TON_MAX_KG"], axis = 1)

In [204]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PRODUCT_TYPE             136 non-null    object 
 1   ORDER_ID                 136 non-null    int64  
 2   SITE                     136 non-null    object 
 3   MATERIAL_NAME            136 non-null    object 
 4   MATERIAL_NUMBER          136 non-null    float64
 5   MATERIAL_QUALITY_NORM    136 non-null    object 
 6   DEFECT_NOTES             136 non-null    object 
 7   NOMINAL_THICKNESS_MM     136 non-null    float64
 8   WIDTH_MM                 136 non-null    int64  
 9   LENGTH_MM                136 non-null    float64
 10  MASS_MIN_KG              136 non-null    int64  
 11  INCO_TERM                136 non-null    object 
 12  BUY_NOW_EUR_PER_TON      39 non-null     float64
 13  MIN/MAX_BID_EUR_PER_TON  125 non-null    float64
 14  VALID_UNTIL              1

In [184]:
df2["MATERIAL_QUALITY_NORM"].value_counts()

MATERIAL_QUALITY_NORM
VDA 239-100             37
DIN EN 10025            25
DIN EN 10149            17
DIN EN 10338            14
DIN EN 10130             8
DIN EN 10346             6
DN EN 10025              3
DN EN 10149              2
DIN EN 10277:2018-09     2
DIN EN 10111             1
DN EN 10083              1
DN EN 10346              1
DIN EN 10268             1
Name: count, dtype: int64

In [182]:
df2["MATERIAL_QUALITY_NORM"].replace("-", np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2["MATERIAL_QUALITY_NORM"].replace("-", np.nan, inplace=True)


In [183]:
df2[["PRODUCT_TYPE", "MATERIAL_NUMBER", "MATERIAL_NAME", "MATERIAL_QUALITY_NORM"]].values

array([['SHEET', 1.0038, 'S235JR', 'DIN EN 10025'],
       ['SHEET', 1.0976, 'S355MC', 'DIN EN 10149'],
       ['SHEET', 1.0976, 'S235JR', 'DIN EN 10149'],
       ['SHEET', 1.0976, 'S355MC', nan],
       ['SHEET', 1.0976, 'S355MC', 'DN EN 10149'],
       ['SHEET', 1.0976, 'S355MC', 'DIN EN 10149'],
       ['SHEET', 1.0045, 'S355JR', 'DIN EN 10025'],
       ['SHEET', 1.0976, 'S235JR', 'DIN EN 10149'],
       ['SHEET', 1.0976, 'S355MC', 'DIN EN 10149'],
       ['SHEET', 1.0976, 'S355MC', 'DIN EN 10149'],
       ['COIL_STRIP', 1.0873, 'HCT780X', nan],
       ['COIL_STRIP', 1.0873, 'DC06', 'DIN EN 10130'],
       ['COIL_STRIP', 1.0873, 'DC06', 'DIN EN 10130'],
       ['COIL_STRIP', 1.0873, 'DC06', 'DIN EN 10130'],
       ['COIL_STRIP', 1.0873, 'DC06', 'DIN EN 10130'],
       ['COIL_STRIP', 1.033, 'HCT780X', 'DIN EN 10130'],
       ['COIL_STRIP', 1.0873, 'DC06', 'DIN EN 10130'],
       ['COIL_STRIP', 1.0873, 'DC06', 'DIN EN 10130'],
       ['COIL_STRIP', 1.0873, 'DC06', 'DIN EN 10130'],
   

In [181]:
df2[df2["MATERIAL_QUALITY_NORM"].isnull()][["PRODUCT_TYPE", "MATERIAL_NUMBER", "MATERIAL_NAME", "MATERIAL_QUALITY_NORM"]]

Unnamed: 0,PRODUCT_TYPE,MATERIAL_NUMBER,MATERIAL_NAME,MATERIAL_QUALITY_NORM
3,SHEET,1.0976,S355MC,
10,COIL_STRIP,1.0873,HCT780X,
44,COIL_STRIP,1.0943,HCT780X,
56,SHEET,1.1191,C45E,
61,SHEET,1.0984,S500MC,
62,SHEET,1.0982,S460MC,
78,COIL_STRIP,1.0914,CR180BH,
79,COIL_STRIP,1.0306,HCT780X,
87,COIL_STRIP,1.0914,CR210BH,
90,COIL_STRIP,1.0914,CR210BH,


Imputing based on most frequenting values.

In [185]:
# Step 1: Fill missing values in 'MATERIAL_QUALITY_NORM' within 'PRODUCT_TYPE' and 'MATERIAL_NAME'
df2["MATERIAL_QUALITY_NORM"] = df2.groupby(["PRODUCT_TYPE", "MATERIAL_NAME"])["MATERIAL_QUALITY_NORM"].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else "Unknown"))

# Step 2: If any NaN values remain, fill them with the mode of the entire 'MATERIAL_QUALITY_NORM' column
mode_material_quality = df2["MATERIAL_QUALITY_NORM"].mode()[0]  # Mode of the entire column
df2["MATERIAL_QUALITY_NORM"].fillna(mode_material_quality, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2["MATERIAL_QUALITY_NORM"].fillna(mode_material_quality, inplace=True)


In [201]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PRODUCT_TYPE             136 non-null    object 
 1   ORDER_ID                 136 non-null    int64  
 2   SITE                     136 non-null    object 
 3   MATERIAL_NAME            136 non-null    object 
 4   MATERIAL_NUMBER          136 non-null    float64
 5   MATERIAL_QUALITY_NORM    136 non-null    object 
 6   DEFECT_NOTES             136 non-null    object 
 7   NOMINAL_THICKNESS_MM     136 non-null    float64
 8   WIDTH_MM                 136 non-null    int64  
 9   LENGTH_MM                136 non-null    float64
 10  MASS_MIN_KG              136 non-null    int64  
 11  INCO_TERM                136 non-null    object 
 12  BUY_NOW_EUR_PER_TON      39 non-null     float64
 13  MIN/MAX_BID_EUR_PER_TON  125 non-null    float64
 14  VALID_UNTIL              1

In [188]:
df2[df2["DEFECT_NOTES"].isnull()][["PRODUCT_TYPE", "MATERIAL_NUMBER", "MATERIAL_QUALITY_NORM", "DEFECT_NOTES"]]

Unnamed: 0,PRODUCT_TYPE,MATERIAL_NUMBER,MATERIAL_QUALITY_NORM,DEFECT_NOTES
1,SHEET,1.0976,DIN EN 10149,
13,COIL_STRIP,1.0873,DIN EN 10130,
14,COIL_STRIP,1.0873,DIN EN 10130,
16,COIL_STRIP,1.0873,DIN EN 10130,
25,SHEET,1.0038,DIN EN 10025,
27,SHEET,1.0045,DIN EN 10025,
37,COIL_STRIP,1.0943,DIN EN 10338,
41,COIL_STRIP,1.0943,DIN EN 10338,
49,COIL_STRIP,1.0943,DIN EN 10338,
50,COIL_STRIP,1.0943,DIN EN 10338,


In [191]:
df2["DEFECT_NOTES"].value_counts()

DEFECT_NOTES
No Defect                               20
DEKL-S355MC  / D2A WEH                   7
DEKL-S355JR  / D2A KRA                   7
HCT780X+ Z 145 MB OL                     6
DEKL-S500MC  / D2A UUE                   5
                                        ..
CR4GI40/40-U Z 90 MB OL  / D2A DSH       1
CR210BHGI75/75-E Z 160 MC  / D3A DGN     1
CR180BHGI50/50-U Z 100 MB  / D2A SUN     1
CR5GI40/40-U Z 90 MB OL  / D2A SAO       1
phs-uncoated 1500 CR  / D2A KS           1
Name: count, Length: 75, dtype: int64

In [190]:
df2["DEFECT_NOTES"].fillna("No Defect", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2["DEFECT_NOTES"].fillna("No Defect", inplace=True)


In [193]:
df2["LENGTH_MM"].value_counts()

LENGTH_MM
3008.0    12
2506.0     8
2356.0     6
2760.0     5
1974.0     4
3000.0     3
2355.0     2
2256.0     2
2000.0     2
2907.0     1
3702.0     1
2005.0     1
3009.0     1
3007.0     1
2956.0     1
3802.0     1
2605.0     1
2957.0     1
2830.0     1
3108.0     1
Name: count, dtype: int64

In [194]:
df2[df2["LENGTH_MM"].isnull()][["PRODUCT_TYPE", "MATERIAL_NUMBER", "WIDTH_MM", "LENGTH_MM"]]

Unnamed: 0,PRODUCT_TYPE,MATERIAL_NUMBER,WIDTH_MM,LENGTH_MM
10,COIL_STRIP,1.0873,1385,
11,COIL_STRIP,1.0873,1170,
12,COIL_STRIP,1.0873,1581,
13,COIL_STRIP,1.0873,1479,
14,COIL_STRIP,1.0873,1628,
...,...,...,...,...
128,COIL_STRIP,1.0914,1130,
129,COIL_STRIP,1.0395,1282,
130,COIL_STRIP,1.0914,1451,
131,COIL_STRIP,1.0873,1385,


In [295]:
df2[["PRODUCT_TYPE", "MATERIAL_NUMBER", "WIDTH_MM", "LENGTH_MM"]].values[:5]

array([['SHEET', 1.0038, 1509.0, 3008.0],
       ['SHEET', 1.0976, 1011.0, 2355.0],
       ['SHEET', 1.0976, 1010.0, 2356.0],
       ['SHEET', 1.0976, 1011.0, 2356.0],
       ['SHEET', 1.0976, 1010.0, 2356.0]], dtype=object)

In [197]:
most_frequent_length = df2["LENGTH_MM"].mode()[0]
df2["LENGTH_MM"].fillna(most_frequent_length, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2["LENGTH_MM"].fillna(most_frequent_length, inplace=True)


In [199]:
df2["INCO_TERM"].value_counts()

INCO_TERM
FCA    93
FKA    15
Name: count, dtype: int64

I guess FKA is mispelled here, it should be FCA too. C might be mispelled to K.

In [200]:
# Replace 'FKA' with 'FCA'
df2['INCO_TERM'] = df2['INCO_TERM'].replace('FKA', 'FCA')

# Fill NaN values with "Others"
df2['INCO_TERM'] = df2['INCO_TERM'].fillna('Others')

In [203]:
df2["BUY_NOW_EUR_PER_TON"].value_counts()

BUY_NOW_EUR_PER_TON
600.0    21
640.0    18
Name: count, dtype: int64

In [294]:
df2[["PRODUCT_TYPE", "MATERIAL_NUMBER", "BUY_NOW_EUR_PER_TON"]].values[:5]

array([['SHEET', 1.0038, 600.0],
       ['SHEET', 1.0976, 600.0],
       ['SHEET', 1.0976, 600.0],
       ['SHEET', 1.0976, 600.0],
       ['SHEET', 1.0976, 600.0]], dtype=object)

In [206]:
# Fill NaN values with the mean price (600.0 + 640.0) / 2
df2["BUY_NOW_EUR_PER_TON"] = df2["BUY_NOW_EUR_PER_TON"].fillna(df2["BUY_NOW_EUR_PER_TON"].mean())

In [208]:
df2["MIN/MAX_BID_EUR_PER_TON"].value_counts()

MIN/MAX_BID_EUR_PER_TON
520.0    29
490.0    27
550.0    26
610.0    18
620.0    15
570.0    10
Name: count, dtype: int64

In [293]:
df2[["PRODUCT_TYPE", "MATERIAL_NUMBER", "BUY_NOW_EUR_PER_TON", "MIN/MAX_BID_EUR_PER_TON"]].values[:5]

array([['SHEET', 1.0038, 600.0, 520.0],
       ['SHEET', 1.0976, 600.0, 520.0],
       ['SHEET', 1.0976, 600.0, 520.0],
       ['SHEET', 1.0976, 600.0, 520.0],
       ['SHEET', 1.0976, 600.0, 520.0]], dtype=object)

In [210]:
df2['MIN/MAX_BID_EUR_PER_TON'].fillna(df2['MIN/MAX_BID_EUR_PER_TON'].mode()[0], inplace=True)

In [211]:
df2.head()

Unnamed: 0,PRODUCT_TYPE,ORDER_ID,SITE,MATERIAL_NAME,MATERIAL_NUMBER,MATERIAL_QUALITY_NORM,DEFECT_NOTES,NOMINAL_THICKNESS_MM,WIDTH_MM,LENGTH_MM,MASS_MIN_KG,INCO_TERM,BUY_NOW_EUR_PER_TON,MIN/MAX_BID_EUR_PER_TON,VALID_UNTIL
0,SHEET,436765,1 company gmbh,S235JR,1.0038,DIN EN 10025,DEKL-S235JR / D2A EID,11.859,1509,3008.0,2091,FCA,600.0,520.0,20/02/2025 11:00
1,SHEET,436754,1 company gmbh,S355MC,1.0976,DIN EN 10149,No Defect,8.057,1011,2355.0,2411,FCA,600.0,520.0,20/02/2025 11:00
2,SHEET,436755,1 company gmbh,S235JR,1.0976,DIN EN 10149,DEKL-S355MC / D2A WEH,8.057,1010,2356.0,2251,FCA,600.0,520.0,2025-02-20 11:00
3,SHEET,436757,1 company gmbh,S355MC,1.0976,DIN EN 10149,DEKL-S355MC / D2A WEH,8.057,1011,2356.0,2401,FCA,600.0,520.0,2025-02-20 11:00
4,SHEET,436758,1 company gmbh,S355MC,1.0976,DN EN 10149,DEKL-S355MC / D2A WEH,8.057,1010,2356.0,2401,FCA,600.0,520.0,2025-02-20 11:00


### Join the datasets

In [220]:
df1.head().columns

Index(['Quality of Material', 'Order Grade Text',
       'Nominal Thickness (mm with decimal point)', 'Width', 'Length',
       'Weight (kg)', 'Cluster', 'Silicon Content', 'Manganese Content',
       'Phosphorus Content', 'Sulfur Content', 'Chromium Content',
       'Nickel Content', 'Molybdenum Content', 'Vanadium Content',
       'Copper Content', 'Niobium Content', 'Titanium Content',
       'Aluminum Content', 'Boron Content', 'Yield Strength',
       'Tensile Strength', 'Elongation'],
      dtype='object')

In [221]:
df2.head().columns

Index(['PRODUCT_TYPE', 'ORDER_ID', 'SITE', 'MATERIAL_NAME', 'MATERIAL_NUMBER',
       'MATERIAL_QUALITY_NORM', 'DEFECT_NOTES', 'NOMINAL_THICKNESS_MM',
       'WIDTH_MM', 'LENGTH_MM', 'MASS_MIN_KG', 'INCO_TERM',
       'BUY_NOW_EUR_PER_TON', 'MIN/MAX_BID_EUR_PER_TON', 'VALID_UNTIL'],
      dtype='object')

In [222]:
df1.columns.str.strip()

Index(['Quality of Material', 'Order Grade Text',
       'Nominal Thickness (mm with decimal point)', 'Width', 'Length',
       'Weight (kg)', 'Cluster', 'Silicon Content', 'Manganese Content',
       'Phosphorus Content', 'Sulfur Content', 'Chromium Content',
       'Nickel Content', 'Molybdenum Content', 'Vanadium Content',
       'Copper Content', 'Niobium Content', 'Titanium Content',
       'Aluminum Content', 'Boron Content', 'Yield Strength',
       'Tensile Strength', 'Elongation'],
      dtype='object')

In [246]:
df2["WIDTH_MM"]

0      1509.0
1      1011.0
2      1010.0
3      1011.0
4      1010.0
        ...  
131    1385.0
132    1509.0
133    1432.0
134    1507.0
135    1256.0
Name: WIDTH_MM, Length: 136, dtype: float64

In [235]:
df1['Width'] = df1['Width'].replace({',': ''}, regex=True)  # Remove commas
df1['Width'] = pd.to_numeric(df1['Width'], errors='coerce')

In [243]:
df1.columns = df1.columns.str.strip()
df2.columns = df2.columns.str.strip()

df1["Width"] = df1["Width"].astype(np.float16)
df2["Width_MM"] = df2["WIDTH_MM"].astype(np.float16)

In [244]:
df1.dropna(subset=['Nominal Thickness (mm with decimal point)', 'Width'], inplace=True)
df2.dropna(subset=['NOMINAL_THICKNESS_MM', 'WIDTH_MM'], inplace=True)

In [251]:
df1.columns, df2.columns

(Index(['Quality of Material', 'Order Grade Text',
        'Nominal Thickness (mm with decimal point)', 'Width', 'Length',
        'Weight (kg)', 'Cluster', 'Silicon Content', 'Manganese Content',
        'Phosphorus Content', 'Sulfur Content', 'Chromium Content',
        'Nickel Content', 'Molybdenum Content', 'Vanadium Content',
        'Copper Content', 'Niobium Content', 'Titanium Content',
        'Aluminum Content', 'Boron Content', 'Yield Strength',
        'Tensile Strength', 'Elongation'],
       dtype='object'),
 Index(['PRODUCT_TYPE', 'ORDER_ID', 'SITE', 'MATERIAL_NAME', 'MATERIAL_NUMBER',
        'MATERIAL_QUALITY_NORM', 'DEFECT_NOTES', 'NOMINAL_THICKNESS_MM',
        'WIDTH_MM', 'LENGTH_MM', 'MASS_MIN_KG', 'INCO_TERM',
        'BUY_NOW_EUR_PER_TON', 'MIN/MAX_BID_EUR_PER_TON', 'VALID_UNTIL',
        'Width_MM'],
       dtype='object'))

In [252]:
# Selecting and renaming columns for df1
df1_selected = df1[['Order Grade Text', 'Nominal Thickness (mm with decimal point)', 'Width', 'Weight (kg)']]
df1_selected.rename(columns={'Order Grade Text': 'Grade', 
                             'Nominal Thickness (mm with decimal point)': 'Thickness (mm)', 
                             'Width': 'Width (mm)', 
                             'Weight (kg)': 'Weight (kg)'}, inplace=True)

# Selecting and renaming columns for df2
df2_selected = df2[['PRODUCT_TYPE', 'MATERIAL_NAME', 'MATERIAL_NUMBER', 'NOMINAL_THICKNESS_MM', 'WIDTH_MM', 'MASS_MIN_KG', 'BUY_NOW_EUR_PER_TON', 'VALID_UNTIL']]
df2_selected.rename(columns={'MATERIAL_NAME': 'Material Name', 
                             'MATERIAL_NUMBER': 'Material ID', 
                             'NOMINAL_THICKNESS_MM': 'Thickness (mm)', 
                             'WIDTH_MM': 'Width (mm)', 
                             'MASS_MIN_KG': 'Weight (kg)', 
                             'BUY_NOW_EUR_PER_TON': 'Price (EUR/ton)', 
                             'VALID_UNTIL': 'Validity'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_selected.rename(columns={'Order Grade Text': 'Grade',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_selected.rename(columns={'MATERIAL_NAME': 'Material Name',


In [255]:
df2_selected.columns

Index(['PRODUCT_TYPE', 'Material Name', 'Material ID', 'Thickness (mm)',
       'Width (mm)', 'Weight (kg)', 'Price (EUR/ton)', 'Validity'],
      dtype='object')

In [260]:
# Convert columns in df1_selected to float64
df1_selected['Thickness (mm)'] = df1_selected['Thickness (mm)'].astype('float64')
df1_selected['Width (mm)'] = df1_selected['Width (mm)'].astype('float64')
df1_selected['Weight (kg)'] = df1_selected['Weight (kg)'].astype('float64')

# Convert columns in df2_selected to float64
df2_selected['Thickness (mm)'] = df2_selected['Thickness (mm)'].astype('float64')
df2_selected['Width (mm)'] = df2_selected['Width (mm)'].astype('float64')
df2_selected['Weight (kg)'] = df2_selected['Weight (kg)'].astype('float64')

# Check if the data types have been correctly updated
print(df1_selected.dtypes)
print(df2_selected.dtypes)

Grade              object
Thickness (mm)    float64
Width (mm)        float64
Weight (kg)       float64
dtype: object
PRODUCT_TYPE        object
Material Name       object
Material ID        float64
Thickness (mm)     float64
Width (mm)         float64
Weight (kg)        float64
Price (EUR/ton)    float64
Validity            object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_selected['Thickness (mm)'] = df1_selected['Thickness (mm)'].astype('float64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_selected['Width (mm)'] = df1_selected['Width (mm)'].astype('float64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_selected['Weight (kg)'] = df1_selected['Weigh

### Outer Join
##### to merge the datasets based on similar consistent columns.

In [277]:
# Perform the merge
merged_df = pd.merge(df1_selected, df2_selected, 
                     how='outer', 
                     on=['Thickness (mm)', 'Width (mm)', 'Weight (kg)'])

# Check the resulting merged dataframe
print(merged_df.head())

  Grade  Thickness (mm)  Width (mm)  Weight (kg) PRODUCT_TYPE Material Name  \
0   NaN           0.640      1421.0       3080.0   COIL_STRIP           CR3   
1   NaN           0.652      1170.0       7031.0   COIL_STRIP          DC06   
2   NaN           0.656      1450.0       2241.0   COIL_STRIP         DX54D   
3   NaN           0.656      1513.0       5880.0   COIL_STRIP         DX53D   
4   NaN           0.659      1451.0       5882.0   COIL_STRIP       HCT780X   

   Material ID  Price (EUR/ton)          Validity  
0       1.0914       618.461538  2025-02-21 11:00  
1       1.0873       600.000000  2025-02-20 11:00  
2       1.0306       618.461538  2025-02-21 11:00  
3       1.0355       618.461538  2025-02-21 11:00  
4       1.0914       618.461538  2025-02-20 16:00  


### Data Exploration

In [324]:
merged_df.isnull().sum()

Grade              0
Thickness (mm)     0
Width (mm)         0
Weight (kg)        0
PRODUCT_TYPE       0
Material Name      0
Material ID        0
Price (EUR/ton)    0
Validity           0
dtype: int64

In [325]:
merged_df.head(5)

Unnamed: 0,Grade,Thickness (mm),Width (mm),Weight (kg),PRODUCT_TYPE,Material Name,Material ID,Price (EUR/ton),Validity
0,Others,0.64,1421.0,3080.0,COIL_STRIP,CR3,1.0914,618.461538,2025-02-21 11:00
1,Others,0.652,1170.0,7031.0,COIL_STRIP,DC06,1.0873,600.0,2025-02-20 11:00
2,Others,0.656,1450.0,2241.0,COIL_STRIP,DX54D,1.0306,618.461538,2025-02-21 11:00
3,Others,0.656,1513.0,5880.0,COIL_STRIP,DX53D,1.0355,618.461538,2025-02-21 11:00
4,Others,0.659,1451.0,5882.0,COIL_STRIP,HCT780X,1.0914,618.461538,2025-02-20 16:00


In [280]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222 entries, 0 to 221
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Grade            86 non-null     object 
 1   Thickness (mm)   222 non-null    float64
 2   Width (mm)       222 non-null    float64
 3   Weight (kg)      222 non-null    float64
 4   PRODUCT_TYPE     136 non-null    object 
 5   Material Name    136 non-null    object 
 6   Material ID      136 non-null    float64
 7   Price (EUR/ton)  136 non-null    float64
 8   Validity         136 non-null    object 
dtypes: float64(5), object(4)
memory usage: 15.7+ KB


In [287]:
merged_df[["Thickness (mm)", "Width (mm)", "Weight (kg)", "PRODUCT_TYPE"]].values[:5]

array([[0.64, 1421.0, 3080.0, 'COIL_STRIP'],
       [0.652, 1170.0, 7031.0, 'COIL_STRIP'],
       [0.656, 1450.0, 2241.0, 'COIL_STRIP'],
       [0.656, 1513.0, 5880.0, 'COIL_STRIP'],
       [0.659, 1451.0, 5882.0, 'COIL_STRIP']], dtype=object)

Again, based on similar values from another column, impputing the values for column 'Grade'.

In [282]:
merged_df['Grade'] = merged_df.groupby('Thickness (mm)')['Grade'].transform(lambda x: x.mode()[0] if x.mode().size > 0 else 'Others')

In [283]:
merged_df["Grade"].value_counts()

Grade
Others             153
42CrMo4              9
HR660Y760T-CP        8
SZBS800              8
DBL 4525.30          7
C2WS8                4
2A Lager             4
50CrMo4              3
CR210BH              2
D7G                  2
58CrV4               2
DP600-MW06           2
S380MC mod. 4        2
LICRO 500            2
51CrV4               2
CS Type B            1
HC460LA              1
CR440Y780T-DP        1
RobuSal®800          1
SZBE800              1
HX180YD              1
SZBS800xpand®50      1
CR330Y590T-DP        1
Grade D6A            1
CR340LA              1
CR300LA              1
G1BX5                1
Name: count, dtype: int64

In [290]:
# Impute missing values for the specified columns by using the most frequent value within each group
columns_to_impute = ['PRODUCT_TYPE', 'Material Name', 'Material ID', 'Price (EUR/ton)', 'Validity']

for column in columns_to_impute:
    # Use transform() to ensure the result has the same index as the original DataFrame
    merged_df[column] = merged_df.groupby(['Thickness (mm)', 'Width (mm)', 'Weight (kg)'])[column].transform(
        lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Unknown') 
    )

# Check the resulting merged dataframe to confirm imputation
print(merged_df.isnull().sum())

Grade              0
Thickness (mm)     0
Width (mm)         0
Weight (kg)        0
PRODUCT_TYPE       0
Material Name      0
Material ID        0
Price (EUR/ton)    0
Validity           0
dtype: int64


In [291]:
merged_df.head()

Unnamed: 0,Grade,Thickness (mm),Width (mm),Weight (kg),PRODUCT_TYPE,Material Name,Material ID,Price (EUR/ton),Validity
0,Others,0.64,1421.0,3080.0,COIL_STRIP,CR3,1.0914,618.461538,2025-02-21 11:00
1,Others,0.652,1170.0,7031.0,COIL_STRIP,DC06,1.0873,600.0,2025-02-20 11:00
2,Others,0.656,1450.0,2241.0,COIL_STRIP,DX54D,1.0306,618.461538,2025-02-21 11:00
3,Others,0.656,1513.0,5880.0,COIL_STRIP,DX53D,1.0355,618.461538,2025-02-21 11:00
4,Others,0.659,1451.0,5882.0,COIL_STRIP,HCT780X,1.0914,618.461538,2025-02-20 16:00


In [None]:
merged_df.to_csv("full_inventory.csv", index = False)     # to save the dataset.