# Scenario B

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw inputs: the RFQ records (comma-separated) and the
# reference grade properties (tab-separated).
rfq_data = pd.read_csv('./data/rfq.csv')
reference_prop_data = pd.read_csv(
    './data/reference_properties.tsv',
    sep='\t',
)

In [3]:
# Count the missing values in each column of the RFQ data.
rfq_data.isna().sum()

id                         0
grade                     59
grade_suffix            1000
coating                  682
finish                   335
surface_type             865
surface_protection       713
form                      73
thickness_min            167
thickness_max            166
width_min                461
width_max                348
length_min               869
height_min               868
height_max               998
weight_min               607
weight_max               369
inner_diameter_min       823
inner_diameter_max       822
outer_diameter_min       977
outer_diameter_max       980
yield_strength_min       964
yield_strength_max       984
tensile_strength_min     951
tensile_strength_max     949
dtype: int64

In [4]:
# Count the missing values in each column of the reference properties data.
reference_prop_data.isna().sum()

Grade/Material                         0
UNS_No                               175
Steel_No                             175
Standards                              0
Carbon (C)                             0
Manganese (Mn)                         0
Silicon (Si)                          41
Sulfur (S)                            12
Phosphorus (P)                        12
Chromium (Cr)                        133
Nickel (Ni)                          159
Molybdenum (Mo)                      145
Vanadium (V)                         150
Tungsten (W)                         175
Cobalt (Co)                          175
Copper (Cu)                          174
Aluminum (Al)                         81
Titanium (Ti)                        146
Niobium (Nb)                         149
Boron (B)                            167
Nitrogen (N)                         158
Tensile strength (Rm)                  0
Yield strength (Re or Rp0.2)          11
Elongation (A%)                       21
Reduction of are

In [5]:
# Find the columns whose NaN count exceeds 60% of the row count, per dataset.
rfq_na_counts = rfq_data.isna().sum()
rfq_drop_cols = rfq_na_counts[rfq_na_counts > 0.6 * len(rfq_data)].index.tolist()

reference_na_counts = reference_prop_data.isna().sum()
reference_prop_data_drop_cols = reference_na_counts[
    reference_na_counts > 0.6 * len(reference_prop_data)
].index.tolist()

# Build the cleaned frames without those sparse columns; the raw frames are left as loaded.
rfq_data_cleaned = rfq_data.drop(columns=rfq_drop_cols)
reference_prop_data_cleaned = reference_prop_data.drop(columns=reference_prop_data_drop_cols)

### Task B.1

In [6]:
# Normalise header casing in both frames. Note that capitalize() upper-cases the
# first character AND lower-cases the rest (e.g. 'Carbon (C)' -> 'Carbon (c)').
rfq_data_cleaned.columns = [name.capitalize() for name in rfq_data_cleaned.columns]
reference_prop_data_cleaned.columns = [
    name.capitalize() for name in reference_prop_data_cleaned.columns
]

- After capitalizing the first letter of every column name in both datasets (which also lower-cases the remaining characters) to keep a consistent naming standard, we rename the `Grade/material` column to `Grade` so that the two tables can be joined on it.

In [7]:
# Give the join key the same name in both frames: 'Grade/material' becomes 'Grade'.
reference_prop_data_cleaned = reference_prop_data_cleaned.rename(
    columns={'Grade/material': 'Grade'}
)

In [8]:
# Inspect the reference frame after the column rename.
reference_prop_data_cleaned

Unnamed: 0,Grade,Standards,Carbon (c),Manganese (mn),Silicon (si),Sulfur (s),Phosphorus (p),Aluminum (al),Tensile strength (rm),Yield strength (re or rp0.2),Elongation (a%),Source_pages,Application,Category
0,S235JR,EN 10025-2:2019,≤0.17,≤1.40,≤0.40,≤0.035,≤0.035,,360-510 MPa,≥235 MPa,≥26%,Standard Specifications,General structural steels,Structural Steel
1,S275JR,EN 10025-2:2019,≤0.21,≤1.50,≤0.40,≤0.035,≤0.035,,410-560 MPa,≥275 MPa,≥23%,Standard Specifications,General structural steels,Structural Steel
2,S355JR,EN 10025-2:2019,≤0.24,≤1.60,≤0.55,≤0.035,≤0.035,,470-630 MPa,≥355 MPa,≥22%,Standard Specifications,General structural steels,Structural Steel
3,S420M,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,,520-680 MPa,≥420 MPa,≥19%,Standard Specifications,Thermomechanically rolled steels,High Strength Steel
4,S460M,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,,540-720 MPa,≥460 MPa,≥17%,Standard Specifications,Thermomechanically rolled steels,High Strength Steel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,HX300LAD,Automotive specification,≤0.12,0.60-1.00,≤0.50,≤0.015,≤0.025,0.015-0.070,380-480 MPa,≥300 MPa,≥27%,Standard Specifications,Automotive HSLA with improved drawability,HSLA Steel
171,HX340LAD,Automotive specification,≤0.12,0.70-1.20,≤0.50,≤0.015,≤0.025,0.015-0.070,410-510 MPa,≥340 MPa,≥26%,Standard Specifications,Automotive HSLA with improved drawability,HSLA Steel
172,HX380,Automotive specification,≤0.12,0.70-1.40,≤0.50,≤0.015,≤0.025,0.015-0.070,450-580 MPa,≥380 MPa,≥23%,Standard Specifications,Automotive high strength steel,HSLA Steel
173,HX420,Automotive specification,≤0.12,0.70-1.60,≤0.50,≤0.015,≤0.025,0.015-0.070,480-620 MPa,≥420 MPa,≥21%,Standard Specifications,Automotive high strength steel,HSLA Steel


In [9]:
# Upper-case the Grade values in both frames so the join key compares consistently.
for frame in (rfq_data_cleaned, reference_prop_data_cleaned):
    frame['Grade'] = frame['Grade'].str.upper()

In [10]:
# RFQ rows without a grade get the 'Unknown' placeholder.
missing_grade = rfq_data_cleaned['Grade'].isna()
rfq_data_cleaned.loc[missing_grade, 'Grade'] = 'Unknown'

In [11]:
# Inspect the reference frame again after the Grade values were upper-cased.
reference_prop_data_cleaned

Unnamed: 0,Grade,Standards,Carbon (c),Manganese (mn),Silicon (si),Sulfur (s),Phosphorus (p),Aluminum (al),Tensile strength (rm),Yield strength (re or rp0.2),Elongation (a%),Source_pages,Application,Category
0,S235JR,EN 10025-2:2019,≤0.17,≤1.40,≤0.40,≤0.035,≤0.035,,360-510 MPa,≥235 MPa,≥26%,Standard Specifications,General structural steels,Structural Steel
1,S275JR,EN 10025-2:2019,≤0.21,≤1.50,≤0.40,≤0.035,≤0.035,,410-560 MPa,≥275 MPa,≥23%,Standard Specifications,General structural steels,Structural Steel
2,S355JR,EN 10025-2:2019,≤0.24,≤1.60,≤0.55,≤0.035,≤0.035,,470-630 MPa,≥355 MPa,≥22%,Standard Specifications,General structural steels,Structural Steel
3,S420M,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,,520-680 MPa,≥420 MPa,≥19%,Standard Specifications,Thermomechanically rolled steels,High Strength Steel
4,S460M,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,,540-720 MPa,≥460 MPa,≥17%,Standard Specifications,Thermomechanically rolled steels,High Strength Steel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,HX300LAD,Automotive specification,≤0.12,0.60-1.00,≤0.50,≤0.015,≤0.025,0.015-0.070,380-480 MPa,≥300 MPa,≥27%,Standard Specifications,Automotive HSLA with improved drawability,HSLA Steel
171,HX340LAD,Automotive specification,≤0.12,0.70-1.20,≤0.50,≤0.015,≤0.025,0.015-0.070,410-510 MPa,≥340 MPa,≥26%,Standard Specifications,Automotive HSLA with improved drawability,HSLA Steel
172,HX380,Automotive specification,≤0.12,0.70-1.40,≤0.50,≤0.015,≤0.025,0.015-0.070,450-580 MPa,≥380 MPa,≥23%,Standard Specifications,Automotive high strength steel,HSLA Steel
173,HX420,Automotive specification,≤0.12,0.70-1.60,≤0.50,≤0.015,≤0.025,0.015-0.070,480-620 MPa,≥420 MPa,≥21%,Standard Specifications,Automotive high strength steel,HSLA Steel


- Now, we will be parsing range strings into numeric min, max and mid values.

In [12]:
# Reference-property columns whose cells are range/limit strings (e.g. '≤0.17',
# '1.00-1.70', '≥26%') and that are expanded into _min/_max/_mid columns further down.
# NOTE(review): 'Tensile strength (rm)' and 'Yield strength (re or rp0.2)' also hold
# range strings such as '360-510 MPa' and the parser strips 'MPa', yet they are not
# listed here -- confirm whether leaving them unparsed is intentional.

elements_cols = [
    'Carbon (c)',
    'Manganese (mn)',
    'Silicon (si)',
    'Sulfur (s)',
    'Phosphorus (p)',
    'Aluminum (al)',
    'Elongation (a%)'
]
# Characters that mark a cell as a range/limit specification.
range_symbols = ['-', '≤', '<', '>', '≥', '%']

In [13]:
# Replace missing specification cells with the 'Unknown' sentinel, which the
# range parser treats as "no information".
reference_prop_data_cleaned = reference_prop_data_cleaned.fillna(
    {col: 'Unknown' for col in elements_cols}
)

In [14]:
# Creating min, max and mid columns for each of the elements columns

def create_min_max_mid(value: str):
    """Parse a textual specification limit into a ``(min, max, mid)`` tuple.

    Recognised forms (an ``MPa`` unit and/or a ``%`` sign is stripped first):

    * ``"a-b"``            -> ``(a, b, (a + b) / 2)``
    * ``"≤x"`` or ``"<x"`` -> ``(None, x, None)``
    * ``"≥x"`` or ``">x"`` -> ``(x, None, None)``

    A value carrying a ``%`` sign is converted to a fraction (divided by 100)
    in every form, so ``"≥26%"`` and ``"20-30%"`` end up on the same scale.

    Anything else -- ``'Unknown'``, an empty string, a bare number without a
    bound operator, or a non-string such as ``None``/``NaN`` -- yields
    ``(None, None, None)``. The function always returns a 3-tuple so the
    result can be expanded into exactly three columns.

    Args:
        value: The raw cell text, e.g. ``'≤0.17'`` or ``'360-510 MPa'``.

    Returns:
        tuple: ``(min, max, mid)`` where each item is a ``float`` or ``None``.

    Raises:
        ValueError: If a recognised form contains text that is not a number,
            or a range does not split into exactly two parts.
    """
    empty = (None, None, None)

    # Missing cells may arrive as None/NaN when the caller has not filled them.
    if not isinstance(value, str):
        return empty

    text = value.strip()
    if not text or text == 'Unknown':
        return empty

    # Remember whether the value was a percentage before the sign is stripped,
    # so the same scaling is applied to ranges and to one-sided bounds.
    scale = 100.0 if '%' in text else 1.0
    text = text.replace('MPa', '').replace('%', '').strip()

    def to_number(token):
        return float(token) / scale

    if '-' in text:
        low_text, high_text = text.split('-')
        low, high = to_number(low_text), to_number(high_text)
        return low, high, (low + high) / 2

    if '≤' in text or '<' in text:
        return None, to_number(text.replace('≤', '').replace('<', '')), None

    if '≥' in text or '>' in text:
        return to_number(text.replace('≥', '').replace('>', '')), None, None

    # No bound operator present (e.g. '26%' or '0.15'): not a range specification.
    return empty
        
def create_min_max_mid_columns(df, cols):
    """Add ``<col>_min``, ``<col>_max`` and ``<col>_mid`` columns for each name in ``cols``.

    Every cell of ``df[col]`` is passed through ``create_min_max_mid`` and the
    values it returns are stored in the three new columns. ``df`` is modified
    in place and is also returned.
    """
    for source_col in cols:
        target_cols = [f'{source_col}_{part}' for part in ('min', 'max', 'mid')]
        # One Series per cell, so apply() expands the parsed tuple into three columns.
        expanded = df[source_col].apply(lambda cell: pd.Series(create_min_max_mid(cell)))
        df[target_cols] = expanded
    return df


In [15]:
# Expand every range column into its _min/_max/_mid columns, then discard the
# original string columns now that their content has been parsed.
reference_prop_data_cleaned = create_min_max_mid_columns(
    reference_prop_data_cleaned, elements_cols
).drop(columns=elements_cols)

- Next, we join the two datasets on the `Grade` column, keeping every RFQ row (right join).

In [None]:
# Right join on Grade: every RFQ row is kept, and reference properties are
# attached where the grade is found in the reference table.
merged_data = pd.merge(
    reference_prop_data_cleaned,
    rfq_data_cleaned,
    how='right',
    on='Grade',
    suffixes=('_ref', '_rfq'),
)

In [18]:
# Preview the merged frame. reset_index() returns a new frame with the old index
# added as an 'index' column; the result is not assigned, so merged_data is unchanged.
merged_data.reset_index()

Unnamed: 0,index,Grade,Standards,Tensile strength (rm),Yield strength (re or rp0.2),Source_pages,Application,Category,Carbon (c)_min,Carbon (c)_max,...,Elongation (a%)_max,Elongation (a%)_mid,Id,Finish,Form,Thickness_min,Thickness_max,Width_min,Width_max,Weight_max
0,0,S700MC,EN 10149-2:2013,750-950 MPa,≥700 MPa,Standard Specifications,"Cold forming, automotive, high strength applic...",Microalloyed Steel,,0.12,...,,,8aff426d-b8c0-43aa-ad26-835ef4de6129,Oiled,Coils,6.00,6.0,600.0,1520.0,25000.0
1,1,S250GD,EN 10346:2015,330-510 MPa,≥250 MPa,Standard Specifications,Structural galvanized steel,Galvanized Steel,,0.25,...,,,37e624be-b125-464f-85b6-1838530193ef,Hot-dip zinc magnesium (+ZM),Slit Coils,1.50,1.5,327.0,327.0,
2,2,DX51D,EN 10346:2015,270-500 MPa,≤350 MPa,Standard Specifications,Galvanized steel for forming,Galvanized Steel,,0.12,...,,,b8257184-6307-46ab-b06e-d979336d1263,Hot-dip Galvanized (+Z/+GI),Coils,0.40,0.4,1000.0,1500.0,
3,3,S235,EN 10025-2:2019,360-510 MPa,≥235 MPa,Standard Specifications,General structural steel,Structural Steel,,0.17,...,,,63140d1f-dda8-40fe-8931-bcaba65d5772,,Round Tubes,1.50,1.5,,,53800.0
4,4,S235,EN 10025-2:2019,360-510 MPa,≥235 MPa,Standard Specifications,General structural steel,Structural Steel,,0.17,...,,,11cffc57-44be-4d79-bfd5-97482be566d3,,Round Tubes,1.50,1.5,,,14500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1000,DX51D,EN 10346:2015,270-500 MPa,≤350 MPa,Standard Specifications,Galvanized steel for forming,Galvanized Steel,,0.12,...,,,7a965ba4-be4e-4833-a723-1f98ed1f24ac,Hot-dip Galvanized (+Z/+GI),Coils,0.55,0.6,,,
1001,1001,DD11,EN 10111:2008,270-410 MPa,≤380 MPa,Standard Specifications,General cold forming,Cold Rolled Steel,,0.12,...,,,82fc492f-0adc-49bd-8458-69bb1f0f318e,Hot Rolled,Coils,,,,1520.0,25000.0
1002,1002,S550MC,EN 10149-2:2013,600-760 MPa,≥550 MPa,Standard Specifications,Cold forming microalloyed steel,Microalloyed Steel,,0.12,...,,,d3db6f2f-6218-4612-8a63-c26d4ac69658,Hot Rolled,Coils,3.40,3.6,,1520.0,35000.0
1003,1003,DD11,EN 10111:2008,270-410 MPa,≤380 MPa,Standard Specifications,General cold forming,Cold Rolled Steel,,0.12,...,,,90b02a62-3da9-4648-948c-52627d086a50,Hot Rolled,Coils,2.40,2.6,600.0,1520.0,25000.0


In [None]:
# Drop the merged columns whose NaN count exceeds 60% of the row count.
# NOTE(review): the result is stored in merged_data_cleaned, but the cells that
# follow in this section keep operating on merged_data, so this drop has no
# effect on them -- confirm which frame the later steps are meant to use.
merged_na_counts = merged_data.isna().sum()
merged_data_drop_cols = merged_na_counts[
    merged_na_counts > 0.6 * len(merged_data)
].index.tolist()
merged_data_cleaned = merged_data.drop(columns=merged_data_drop_cols)

In [20]:
# Split the merged columns by dtype so each group can be imputed differently.
merged_data_obj_cols = merged_data.select_dtypes(include='object').columns
merged_data_num_cols = merged_data.select_dtypes(include='number').columns

In [None]:
# Impute in two passes: numeric gaps get their column mean, text gaps get 'Unknown'.
numeric_means = merged_data[merged_data_num_cols].mean()
merged_data[merged_data_num_cols] = merged_data[merged_data_num_cols].fillna(numeric_means)
merged_data[merged_data_obj_cols] = merged_data[merged_data_obj_cols].fillna('Unknown')

# Finally remove rows that are exact duplicates of an earlier row.
merged_data = merged_data.drop_duplicates()

### Task B.2