In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Raw inputs: the RFQ table is comma-separated, the reference properties table is tab-separated.
rfq_data = pd.read_csv(
    './data/rfq.csv',
)
reference_properties_dataset = pd.read_csv(
    './data/reference_properties.tsv',
    delimiter='\t',  # `delimiter` is the read_csv alias of `sep`
)

# Task B.1

In [3]:
# Summary statistics of the RFQ columns, with the default quartiles spelled out.
rfq_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,grade_suffix,thickness_min,thickness_max,width_min,width_max,length_min,height_min,height_max,weight_min,weight_max,inner_diameter_min,inner_diameter_max,outer_diameter_min,outer_diameter_max,yield_strength_min,yield_strength_max,tensile_strength_min,tensile_strength_max
count,0.0,833.0,834.0,539.0,652.0,131.0,132.0,2.0,393.0,631.0,177.0,178.0,23.0,20.0,36.0,16.0,49.0,51.0
mean,,5.752977,5.96693,963.207421,1276.79908,5176.641221,5278.901515,80.0,113075.5,83448.72,607.559322,609.820225,769.104348,1036.985,308.194444,327.5,440.256122,550.15
std,,15.100137,15.10109,614.141337,774.410463,9920.773435,10405.449859,0.0,1264805.0,998640.8,31.193096,14.90047,1389.261845,1453.795719,87.130057,36.055513,135.127971,158.594822
min,,0.01,0.11,2.9,3.0,50.0,100.0,80.0,20.0,1000.0,260.0,508.0,20.0,193.7,220.0,280.0,2.55,2.65
25%,,1.2,1.3,600.0,1103.75,3000.0,3000.0,80.0,10000.0,20500.0,610.0,610.0,401.85,610.0,260.0,300.0,350.0,420.0
50%,,2.0,2.0,1000.0,1500.0,3000.0,3000.0,80.0,20000.0,25000.0,610.0,610.0,610.0,610.0,260.0,330.0,450.0,560.0
75%,,4.0,4.0,1500.0,1520.0,6000.0,6000.0,80.0,40000.0,25000.0,610.0,610.0,610.0,610.0,355.0,350.0,540.0,640.0
max,,190.0,190.0,3700.0,8000.0,114000.0,120000.0,80.0,25000000.0,25000000.0,760.0,760.0,7016.0,7016.0,700.0,380.0,760.0,965.0


In [4]:
# Number of missing values per RFQ column.
rfq_data.isnull().sum()

id                         0
grade                     59
grade_suffix            1000
coating                  682
finish                   335
surface_type             865
surface_protection       713
form                      73
thickness_min            167
thickness_max            166
width_min                461
width_max                348
length_min               869
height_min               868
height_max               998
weight_min               607
weight_max               369
inner_diameter_min       823
inner_diameter_max       822
outer_diameter_min       977
outer_diameter_max       980
yield_strength_min       964
yield_strength_max       984
tensile_strength_min     951
tensile_strength_max     949
dtype: int64

In [5]:
# Drop 'grade_suffix' (1000 missing values in the isna() summary above).
# Rebinding with errors='ignore' keeps the cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
rfq_data = rfq_data.drop(columns=['grade_suffix'], errors='ignore')

In [6]:
# Replace missing grades with the placeholder 'Unknown'.
# Assign the result back instead of calling fillna(inplace=True) on the selected column:
# the chained in-place form operates on a temporary copy and stops working in pandas 3.0.
rfq_data['grade'] = rfq_data['grade'].fillna('Unknown')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rfq_data['grade'].fillna('Unknown', inplace=True)


In [7]:
def parse_range_string(value):
    """
    Parse various range string formats into min, max, mid values.

    Before matching, the input is converted to a string, all spaces are
    removed, commas are read as decimal points and the text is lower-cased,
    so keyword forms such as "Max 300" and "max 300" are treated alike.
    Only unsigned numbers are extracted; a leading minus sign is not
    interpreted as part of a number.

    Handles formats like:
    - "2.5-4.0" -> min=2.5, max=4.0, mid=3.25
    - "≤0.05" -> min=0, max=0.05, mid=0.025
    - "≥1000" -> min=1000, max=inf, mid=1000
    - ">500" -> min=500, max=inf, mid=500
    - "<0.1" -> min=0, max=0.1, mid=0.05
    - "1.5" -> min=1.5, max=1.5, mid=1.5
    - "0.02-0.08" -> min=0.02, max=0.08, mid=0.05
    - "max 300" / "Max 300" -> min=0, max=300, mid=150
    - "min 50" / "MIN 50" -> min=50, max=inf, mid=50
    - "Balance" or "Bal" -> min=nan, max=nan, mid=nan

    Parameters:
    value: the raw cell value (string, number, or missing)

    Returns:
    dict with 'min', 'max', 'mid' keys; all three are NaN when the value is
    missing, blank, a "balance"-style keyword, or contains no number
    """

    def _bounds(min_val=np.nan, max_val=np.nan, mid_val=np.nan):
        # Build a fresh dict on every call so results are never shared between rows.
        return {'min': min_val, 'max': max_val, 'mid': mid_val}

    if pd.isna(value) or str(value).strip() == '':
        return _bounds()

    # Normalise once; lower-casing makes every keyword check below case-insensitive.
    value = str(value).strip().replace(' ', '').replace(',', '.').lower()

    # Handle special cases
    if value in ['balance', 'bal', 'remainder', 'rem']:
        return _bounds()

    # Extract unsigned decimal numbers from the string
    try:
        nums = [float(n) for n in re.findall(r'\d+\.?\d*', value)]
    except ValueError:
        # Only a failed float conversion is expected here; anything else should surface.
        return _bounds()

    if not nums:
        return _bounds()

    # Range with dash: "2.5-4.0" (only the first two numbers are used)
    if '-' in value and len(nums) >= 2:
        min_val, max_val = sorted(nums[:2])
        return _bounds(min_val, max_val, (min_val + max_val) / 2)

    # Less than or equal: "≤0.05", "<=0.05", "max 0.05"
    if any(symbol in value for symbol in ['≤', '<=', 'max']):
        return _bounds(0.0, nums[0], nums[0] / 2)

    # Greater than or equal: "≥1000", ">=1000", "min 1000"
    if any(symbol in value for symbol in ['≥', '>=', 'min']):
        return _bounds(nums[0], np.inf, nums[0])

    # Less than: "<0.1"
    if '<' in value and '=' not in value:
        return _bounds(0.0, nums[0], nums[0] / 2)

    # Greater than: ">500"
    if '>' in value and '=' not in value:
        return _bounds(nums[0], np.inf, nums[0])

    # Single number: "1.5"
    if len(nums) == 1:
        return _bounds(nums[0], nums[0], nums[0])

    # Multiple numbers without clear pattern - take the overall range
    min_val, max_val = min(nums), max(nums)
    return _bounds(min_val, max_val, (min_val + max_val) / 2)

In [8]:
def parse_dataset_ranges(df, range_columns=None, include_mid=True):
    """
    Add numeric min/max (and optionally mid) columns for range-string columns.

    Every selected column is run through parse_range_string and the parsed
    bounds are stored as "<column>_min", "<column>_max" and, when requested,
    "<column>_mid" on a copy of the input frame. Progress and a three-value
    sample of each result are printed.

    Parameters:
    df: DataFrame to read from (it is copied, not modified)
    range_columns: list of column names to parse; None auto-detects object
        columns whose first ten non-null values contain one of
        '-', '≤', '≥', '<', '>'
    include_mid: whether to create the mid-point columns

    Returns:
    A copy of df with the additional parsed columns
    """

    range_markers = ['-', '≤', '≥', '<', '>']
    df_result = df.copy()

    # No explicit selection: scan the string columns for range-looking values
    if range_columns is None:
        range_columns = []
        for name in df.columns:
            if df[name].dtype != 'object':
                continue
            preview = df[name].dropna().head(10).astype(str)
            looks_like_range = False
            for text in preview:
                if any(marker in str(text) for marker in range_markers):
                    looks_like_range = True
                    break
            if looks_like_range:
                range_columns.append(name)

        print(f"Auto-detected range columns: {range_columns}")

    for col in range_columns:
        # Unknown names are reported and skipped rather than raising
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in dataset")
            continue

        print(f"Parsing column: {col}")

        parsed_entries = df[col].apply(parse_range_string)

        # Lists are assigned positionally, one value per input row
        lower_bounds = [entry['min'] for entry in parsed_entries]
        upper_bounds = [entry['max'] for entry in parsed_entries]
        df_result[f"{col}_min"] = lower_bounds
        df_result[f"{col}_max"] = upper_bounds
        if include_mid:
            midpoints = [entry['mid'] for entry in parsed_entries]
            df_result[f"{col}_mid"] = midpoints

        # Report a small sample of the raw and parsed values
        original_sample = df[col].dropna().head(3).tolist()
        min_sample = df_result[f'{col}_min'].dropna().head(3).tolist()
        max_sample = df_result[f'{col}_max'].dropna().head(3).tolist()
        print(f"  Parsed {col}:")
        print(f"    Original sample: {original_sample}")
        print(f"    Min sample: {min_sample}")
        print(f"    Max sample: {max_sample}")
        if include_mid:
            mid_sample = df_result[f'{col}_mid'].dropna().head(3).tolist()
            print(f"    Mid sample: {mid_sample}")
        print()

    return df_result

In [18]:
# Chemical-composition columns that may hold range strings
chemistry_columns = [
    'Carbon (C)',
    'Manganese (Mn)',
    'Silicon (Si)',
    'Sulfur (S)',
    'Phosphorus (P)',
    'Chromium (Cr)',
    'Nickel (Ni)',
    'Molybdenum (Mo)',
    'Aluminum (Al)',
    'Titanium (Ti)',
    'Vanadium (V)',
]

# Other potential range columns
other_range_columns = [
    'Tensile strength (Rm)',
    'Yield strength (Re or Rp0.2)',
    'Elongation (A%)',
    'Hardness (HB, HV, HRC)',
]

# Combine both groups and keep only the names present in the reference dataset
potential_columns = [*chemistry_columns, *other_range_columns]
existing_columns = [
    name
    for name in potential_columns
    if name in reference_properties_dataset.columns
]

print(f"Found potential range columns: {existing_columns}")

# Parse the ranges into min/max/mid columns on a copy of the reference dataset
reference_properties_dataset_parsed = parse_dataset_ranges(
    reference_properties_dataset,
    existing_columns,
    include_mid=True,
)

# Columns that exist only in the parsed copy, in their order of creation
new_columns = [
    name
    for name in reference_properties_dataset_parsed.columns
    if name not in reference_properties_dataset.columns
]


Found potential range columns: ['Carbon (C)', 'Manganese (Mn)', 'Silicon (Si)', 'Sulfur (S)', 'Phosphorus (P)', 'Chromium (Cr)', 'Nickel (Ni)', 'Molybdenum (Mo)', 'Aluminum (Al)', 'Titanium (Ti)', 'Vanadium (V)', 'Tensile strength (Rm)', 'Yield strength (Re or Rp0.2)', 'Elongation (A%)', 'Hardness (HB, HV, HRC)']
Parsing column: Carbon (C)
  Parsed Carbon (C):
    Original sample: ['≤0.17', '≤0.21', '≤0.24']
    Min sample: [0.0, 0.0, 0.0]
    Max sample: [0.17, 0.21, 0.24]
    Mid sample: [0.085, 0.105, 0.12]

Parsing column: Manganese (Mn)
  Parsed Manganese (Mn):
    Original sample: ['≤1.40', '≤1.50', '≤1.60']
    Min sample: [0.0, 0.0, 0.0]
    Max sample: [1.4, 1.5, 1.6]
    Mid sample: [0.7, 0.75, 0.8]

Parsing column: Silicon (Si)
  Parsed Silicon (Si):
    Original sample: ['≤0.40', '≤0.40', '≤0.55']
    Min sample: [0.0, 0.0, 0.0]
    Max sample: [0.4, 0.4, 0.55]
    Mid sample: [0.2, 0.2, 0.275]

Parsing column: Sulfur (S)
  Parsed Sulfur (S):
    Original sample: ['≤0.035',

In [None]:
# Show the min/max/mid column names that parsing added.
# (The previous content of this cell was an unfinished assignment, which is a SyntaxError
# and stops a full re-run; the parsed columns already live in
# reference_properties_dataset_parsed, which the join further down uses.)
new_columns

['Carbon (C)_min',
 'Carbon (C)_max',
 'Carbon (C)_mid',
 'Manganese (Mn)_min',
 'Manganese (Mn)_max',
 'Manganese (Mn)_mid',
 'Silicon (Si)_min',
 'Silicon (Si)_max',
 'Silicon (Si)_mid',
 'Sulfur (S)_min',
 'Sulfur (S)_max',
 'Sulfur (S)_mid',
 'Phosphorus (P)_min',
 'Phosphorus (P)_max',
 'Phosphorus (P)_mid',
 'Chromium (Cr)_min',
 'Chromium (Cr)_max',
 'Chromium (Cr)_mid',
 'Nickel (Ni)_min',
 'Nickel (Ni)_max',
 'Nickel (Ni)_mid',
 'Molybdenum (Mo)_min',
 'Molybdenum (Mo)_max',
 'Molybdenum (Mo)_mid',
 'Aluminum (Al)_min',
 'Aluminum (Al)_max',
 'Aluminum (Al)_mid',
 'Titanium (Ti)_min',
 'Titanium (Ti)_max',
 'Titanium (Ti)_mid',
 'Vanadium (V)_min',
 'Vanadium (V)_max',
 'Vanadium (V)_mid',
 'Tensile strength (Rm)_min',
 'Tensile strength (Rm)_max',
 'Tensile strength (Rm)_mid',
 'Yield strength (Re or Rp0.2)_min',
 'Yield strength (Re or Rp0.2)_max',
 'Yield strength (Re or Rp0.2)_mid',
 'Elongation (A%)_min',
 'Elongation (A%)_max',
 'Elongation (A%)_mid',
 'Hardness (HB, HV,

In [20]:
# Preview the grades in upper case (display only; rfq_data is not modified).
# The vectorised string accessor passes missing values through as NaN instead of
# raising AttributeError the way `x.upper()` does on a non-string.
rfq_data['grade'].str.upper()

0      S700MC
1      S250GD
2       DX51D
3        S235
4        S235
        ...  
995     DX51D
996      DD11
997    S550MC
998      DD11
999      DD12
Name: grade, Length: 1000, dtype: object

In [23]:
# Left-join the parsed reference properties onto the RFQ rows: the RFQ 'grade' value is
# matched against the reference 'Grade/Material' index; overlapping reference column
# names get the '_ref' suffix.
joined_dataset = rfq_data.join(
    reference_properties_dataset_parsed.set_index('Grade/Material'),
    on='grade',
    how='left',
    rsuffix='_ref',
)

In [24]:
# Spot-check one parsed reference column after the join (NaN where no reference row matched
# or the reference value was missing).
joined_dataset.loc[:, 'Aluminum (Al)_mid']

0      0.015
1      0.010
2      0.010
3        NaN
4        NaN
       ...  
995    0.010
996      NaN
997    0.015
998      NaN
999      NaN
Name: Aluminum (Al)_mid, Length: 1000, dtype: float64

# Task B.2

In [None]:
# Build one closed-interval column "<base>_interval" for every "<base>_min" / "<base>_max"
# pair in joined_dataset. When only one of the two bound columns exists, it is used for
# both ends. Rows where either bound is missing get the sentinel interval [-1, -1].
processed_bases = set()
for col in joined_dataset.columns:
    if not col.endswith(('_min', '_max')):
        continue

    # Strip only the trailing suffix ('_min' and '_max' are both 4 characters), so a
    # '_min' / '_max' occurring elsewhere in the column name is left intact.
    base = col[:-4]

    # Each base is reachable from both its _min and its _max column; handle it once.
    if base in processed_bases:
        continue
    processed_bases.add(base)

    min_col = f"{base}_min"
    max_col = f"{base}_max"
    has_min = min_col in joined_dataset.columns
    has_max = max_col in joined_dataset.columns

    # At least one of the two exists, because `col` itself is one of them.
    min_vals = joined_dataset[min_col if has_min else max_col].copy()
    max_vals = joined_dataset[max_col if has_max else min_col].copy()

    # A row with either bound missing is marked with -1 on both ends.
    mask = min_vals.isna() | max_vals.isna()
    min_vals[mask] = -1
    max_vals[mask] = -1

    # Ensure min <= max; the right-hand side is evaluated first, so the swap is safe.
    swapped = min_vals > max_vals
    min_vals[swapped], max_vals[swapped] = max_vals[swapped], min_vals[swapped]

    interval_col = f"{base}_interval"
    joined_dataset[interval_col] = pd.IntervalIndex.from_arrays(min_vals, max_vals, closed='both')

In [None]:
# Confirm the new column is stored with an interval dtype.
joined_dataset.dtypes['inner_diameter_interval']

interval[float64, both]

In [26]:
# Frequency of each coating value (missing values are not counted).
joined_dataset.loc[:, 'coating'].value_counts()

coating
Z275       40
Z100       33
ZE25/25    27
Z140       25
GI50/50    19
           ..
ZE20/20     1
AZ          1
ZM060       1
ZF          1
Z150        1
Name: count, Length: 62, dtype: int64

In [40]:
# Count of rows with no surface_type.
joined_dataset['surface_type'].isnull().sum()

865

Unnamed: 0,Grade/Material,UNS_No,Steel_No,Standards,Carbon (C),Manganese (Mn),Silicon (Si),Sulfur (S),Phosphorus (P),Chromium (Cr),...,Reduction of area (Z%),"Hardness (HB, HV, HRC)",Impact toughness (Charpy V-notch),Fatigue limit,Creep resistance,Source_Pages,Application,Category,Nb + V + Ti (Others),Coating
0,S235JR,,,EN 10025-2:2019,≤0.17,≤1.40,≤0.40,≤0.035,≤0.035,,...,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
1,S275JR,,,EN 10025-2:2019,≤0.21,≤1.50,≤0.40,≤0.035,≤0.035,,...,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
2,S355JR,,,EN 10025-2:2019,≤0.24,≤1.60,≤0.55,≤0.035,≤0.035,,...,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
3,S420M,,,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,≤0.30,...,,,27J at -20°C,,,Standard Specifications,Thermomechanically rolled steels,High Strength Steel,,
4,S460M,,,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,≤0.30,...,,,27J at -20°C,,,Standard Specifications,Thermomechanically rolled steels,High Strength Steel,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,HX300LAD,,,Automotive specification,≤0.12,0.60-1.00,≤0.50,≤0.015,≤0.025,,...,,,,,,Standard Specifications,Automotive HSLA with improved drawability,HSLA Steel,,
171,HX340LAD,,,Automotive specification,≤0.12,0.70-1.20,≤0.50,≤0.015,≤0.025,,...,,,,,,Standard Specifications,Automotive HSLA with improved drawability,HSLA Steel,,
172,HX380,,,Automotive specification,≤0.12,0.70-1.40,≤0.50,≤0.015,≤0.025,,...,,,,,,Standard Specifications,Automotive high strength steel,HSLA Steel,,
173,HX420,,,Automotive specification,≤0.12,0.70-1.60,≤0.50,≤0.015,≤0.025,,...,,,,,,Standard Specifications,Automotive high strength steel,HSLA Steel,,
