# Scenario B

In [1]:
# Importing the libraries

import pandas as pd
import numpy as np

In [2]:
# Reading in the data

rfq_data = pd.read_csv('./data/rfq.csv')
reference_prop_data = pd.read_csv('./data/reference_properties.tsv', sep='\t')

In [3]:
# Checking for missing values

rfq_data.isna().sum()

id                         0
grade                     59
grade_suffix            1000
coating                  682
finish                   335
surface_type             865
surface_protection       713
form                      73
thickness_min            167
thickness_max            166
width_min                461
width_max                348
length_min               869
height_min               868
height_max               998
weight_min               607
weight_max               369
inner_diameter_min       823
inner_diameter_max       822
outer_diameter_min       977
outer_diameter_max       980
yield_strength_min       964
yield_strength_max       984
tensile_strength_min     951
tensile_strength_max     949
dtype: int64

In [4]:
reference_prop_data.isna().sum()

Grade/Material                         0
UNS_No                               175
Steel_No                             175
Standards                              0
Carbon (C)                             0
Manganese (Mn)                         0
Silicon (Si)                          41
Sulfur (S)                            12
Phosphorus (P)                        12
Chromium (Cr)                        133
Nickel (Ni)                          159
Molybdenum (Mo)                      145
Vanadium (V)                         150
Tungsten (W)                         175
Cobalt (Co)                          175
Copper (Cu)                          174
Aluminum (Al)                         81
Titanium (Ti)                        146
Niobium (Nb)                         149
Boron (B)                            167
Nitrogen (N)                         158
Tensile strength (Rm)                  0
Yield strength (Re or Rp0.2)          11
Elongation (A%)                       21
Reduction of are

- We will be dropping all columns with more than 70% missing values, as we feel that their contribution would be negligible.

In [5]:
# Dropping all columns in both of the datasets with more than 70% missing values

rfq_drop_cols = [col for col in rfq_data.columns if rfq_data[col].isna().sum() > 0.7 * len(rfq_data)]
reference_prop_data_drop_cols = [col for col in reference_prop_data.columns if reference_prop_data[col].isna().sum() > 0.7 * len(reference_prop_data)]

rfq_data_cleaned = rfq_data.drop(columns=rfq_drop_cols)
reference_prop_data_cleaned = reference_prop_data.drop(columns=reference_prop_data_drop_cols)

### Task B.1

In [6]:
# Capitalize first letter of each column name

rfq_data_cleaned.columns = rfq_data_cleaned.columns.str.capitalize()
reference_prop_data_cleaned.columns = reference_prop_data_cleaned.columns.str.capitalize()

- After capitilizing every first letter of each column in both datasets to maintain a consistent naming standard, we will be renaming the `Grade/material` column to `Grade` for joining both of the tables. 

In [7]:
# Rename 'Grade/material' to 'Grade' for easier merging

reference_prop_data_cleaned.rename(columns={'Grade/material': 'Grade'}, inplace=True)

In [8]:
reference_prop_data_cleaned

Unnamed: 0,Grade,Standards,Carbon (c),Manganese (mn),Silicon (si),Sulfur (s),Phosphorus (p),Aluminum (al),Tensile strength (rm),Yield strength (re or rp0.2),Elongation (a%),Source_pages,Application,Category
0,S235JR,EN 10025-2:2019,≤0.17,≤1.40,≤0.40,≤0.035,≤0.035,,360-510 MPa,≥235 MPa,≥26%,Standard Specifications,General structural steels,Structural Steel
1,S275JR,EN 10025-2:2019,≤0.21,≤1.50,≤0.40,≤0.035,≤0.035,,410-560 MPa,≥275 MPa,≥23%,Standard Specifications,General structural steels,Structural Steel
2,S355JR,EN 10025-2:2019,≤0.24,≤1.60,≤0.55,≤0.035,≤0.035,,470-630 MPa,≥355 MPa,≥22%,Standard Specifications,General structural steels,Structural Steel
3,S420M,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,,520-680 MPa,≥420 MPa,≥19%,Standard Specifications,Thermomechanically rolled steels,High Strength Steel
4,S460M,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,,540-720 MPa,≥460 MPa,≥17%,Standard Specifications,Thermomechanically rolled steels,High Strength Steel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,HX300LAD,Automotive specification,≤0.12,0.60-1.00,≤0.50,≤0.015,≤0.025,0.015-0.070,380-480 MPa,≥300 MPa,≥27%,Standard Specifications,Automotive HSLA with improved drawability,HSLA Steel
171,HX340LAD,Automotive specification,≤0.12,0.70-1.20,≤0.50,≤0.015,≤0.025,0.015-0.070,410-510 MPa,≥340 MPa,≥26%,Standard Specifications,Automotive HSLA with improved drawability,HSLA Steel
172,HX380,Automotive specification,≤0.12,0.70-1.40,≤0.50,≤0.015,≤0.025,0.015-0.070,450-580 MPa,≥380 MPa,≥23%,Standard Specifications,Automotive high strength steel,HSLA Steel
173,HX420,Automotive specification,≤0.12,0.70-1.60,≤0.50,≤0.015,≤0.025,0.015-0.070,480-620 MPa,≥420 MPa,≥21%,Standard Specifications,Automotive high strength steel,HSLA Steel


In [9]:
# Setting all the values in Grade column for both datasets to uppercase to maintain consistency

rfq_data_cleaned['Grade'] = rfq_data_cleaned['Grade'].str.upper()
reference_prop_data_cleaned['Grade'] = reference_prop_data_cleaned['Grade'].str.upper()

In [10]:
# Fill NaN values in Grade column in rfq dataset with 'Unknown'

rfq_data_cleaned['Grade'] = rfq_data_cleaned['Grade'].fillna('Unknown')

In [11]:
reference_prop_data_cleaned

Unnamed: 0,Grade,Standards,Carbon (c),Manganese (mn),Silicon (si),Sulfur (s),Phosphorus (p),Aluminum (al),Tensile strength (rm),Yield strength (re or rp0.2),Elongation (a%),Source_pages,Application,Category
0,S235JR,EN 10025-2:2019,≤0.17,≤1.40,≤0.40,≤0.035,≤0.035,,360-510 MPa,≥235 MPa,≥26%,Standard Specifications,General structural steels,Structural Steel
1,S275JR,EN 10025-2:2019,≤0.21,≤1.50,≤0.40,≤0.035,≤0.035,,410-560 MPa,≥275 MPa,≥23%,Standard Specifications,General structural steels,Structural Steel
2,S355JR,EN 10025-2:2019,≤0.24,≤1.60,≤0.55,≤0.035,≤0.035,,470-630 MPa,≥355 MPa,≥22%,Standard Specifications,General structural steels,Structural Steel
3,S420M,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,,520-680 MPa,≥420 MPa,≥19%,Standard Specifications,Thermomechanically rolled steels,High Strength Steel
4,S460M,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,,540-720 MPa,≥460 MPa,≥17%,Standard Specifications,Thermomechanically rolled steels,High Strength Steel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,HX300LAD,Automotive specification,≤0.12,0.60-1.00,≤0.50,≤0.015,≤0.025,0.015-0.070,380-480 MPa,≥300 MPa,≥27%,Standard Specifications,Automotive HSLA with improved drawability,HSLA Steel
171,HX340LAD,Automotive specification,≤0.12,0.70-1.20,≤0.50,≤0.015,≤0.025,0.015-0.070,410-510 MPa,≥340 MPa,≥26%,Standard Specifications,Automotive HSLA with improved drawability,HSLA Steel
172,HX380,Automotive specification,≤0.12,0.70-1.40,≤0.50,≤0.015,≤0.025,0.015-0.070,450-580 MPa,≥380 MPa,≥23%,Standard Specifications,Automotive high strength steel,HSLA Steel
173,HX420,Automotive specification,≤0.12,0.70-1.60,≤0.50,≤0.015,≤0.025,0.015-0.070,480-620 MPa,≥420 MPa,≥21%,Standard Specifications,Automotive high strength steel,HSLA Steel


- Now, we will be parsing range strings into numeric min, max and mid values.

In [12]:
# Declaring the columns which have range values in reference properties dataset

elements_cols = [
    'Carbon (c)',
    'Manganese (mn)',
    'Silicon (si)',
    'Sulfur (s)',
    'Phosphorus (p)',
    'Aluminum (al)',
    'Elongation (a%)'
]
range_symbols = ['-', '≤', '<', '>', '≥', '%']

In [13]:
reference_prop_data_cleaned[elements_cols] = reference_prop_data_cleaned[elements_cols].fillna('Unknown')

In [14]:
# Creating min, max and mid columns for each of the elements columns

def create_min_max_mid(value: str):
    if any(element in value for element in range_symbols) and value != 'Unknown':
        if '-' in value:
            min_val, max_val = value.replace("MPa", '').replace('%', '').split('-')
            return float(min_val), float(max_val), (float(min_val) + float(max_val)) / 2
        elif '≤' in value or '<' in value:
            if '%' in value:
                value = value.replace('%', '')
                max_val = float(value.replace('≤', '').replace('<', ''))
                return None, max_val / 100, None
            else:
                value = value.replace("MPa", '').replace('≤', '').replace('<', '')
                max_val = float(value)
                return None, max_val, None
        elif '≥' in value or '>' in value:
            if '%' in value:
                value = value.replace('%', '')
                min_val = float(value.replace('≥', '').replace('>', ''))
                return min_val / 100, None, None
            else:
                value = value.replace("MPa", '').replace('≥', '').replace('>', '')
                min_val = float(value)
                return min_val, None, None
    else:
        return None, None, None
        
def create_min_max_mid_columns(df, cols):
    for col in cols:
        min_col = f'{col}_min'
        max_col = f'{col}_max'
        mid_col = f'{col}_mid'
        
        df[[min_col, max_col, mid_col]] = df[col].apply(lambda x: pd.Series(create_min_max_mid(x)))
    return df


In [15]:
# Creating min, max and mid columns for each of the elements columns in reference properties dataset

reference_prop_data_cleaned = create_min_max_mid_columns(reference_prop_data_cleaned, elements_cols)
reference_prop_data_cleaned.drop(columns=elements_cols, inplace=True)

- Joining both of the datasets with the `Grade` column.

In [16]:
# Merging both of the datasets with the `Grade` column.

merged_data = reference_prop_data_cleaned.merge(
    rfq_data_cleaned, 
    on='Grade', 
    how='right', 
    suffixes=('_ref', '_rfq')
)

In [17]:
merged_data.reset_index()

Unnamed: 0,index,Grade,Standards,Tensile strength (rm),Yield strength (re or rp0.2),Source_pages,Application,Category,Carbon (c)_min,Carbon (c)_max,...,Id,Coating,Finish,Form,Thickness_min,Thickness_max,Width_min,Width_max,Weight_min,Weight_max
0,0,S700MC,EN 10149-2:2013,750-950 MPa,≥700 MPa,Standard Specifications,"Cold forming, automotive, high strength applic...",Microalloyed Steel,,0.12,...,8aff426d-b8c0-43aa-ad26-835ef4de6129,,Oiled,Coils,6.00,6.0,600.0,1520.0,15000.0,25000.0
1,1,S250GD,EN 10346:2015,330-510 MPa,≥250 MPa,Standard Specifications,Structural galvanized steel,Galvanized Steel,,0.25,...,37e624be-b125-464f-85b6-1838530193ef,ZM310,Hot-dip zinc magnesium (+ZM),Slit Coils,1.50,1.5,327.0,327.0,,
2,2,DX51D,EN 10346:2015,270-500 MPa,≤350 MPa,Standard Specifications,Galvanized steel for forming,Galvanized Steel,,0.12,...,b8257184-6307-46ab-b06e-d979336d1263,Z100,Hot-dip Galvanized (+Z/+GI),Coils,0.40,0.4,1000.0,1500.0,,
3,3,S235,EN 10025-2:2019,360-510 MPa,≥235 MPa,Standard Specifications,General structural steel,Structural Steel,,0.17,...,63140d1f-dda8-40fe-8931-bcaba65d5772,,,Round Tubes,1.50,1.5,,,53800.0,53800.0
4,4,S235,EN 10025-2:2019,360-510 MPa,≥235 MPa,Standard Specifications,General structural steel,Structural Steel,,0.17,...,11cffc57-44be-4d79-bfd5-97482be566d3,,,Round Tubes,1.50,1.5,,,14500.0,14500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1000,DX51D,EN 10346:2015,270-500 MPa,≤350 MPa,Standard Specifications,Galvanized steel for forming,Galvanized Steel,,0.12,...,7a965ba4-be4e-4833-a723-1f98ed1f24ac,Z080,Hot-dip Galvanized (+Z/+GI),Coils,0.55,0.6,,,,
1001,1001,DD11,EN 10111:2008,270-410 MPa,≤380 MPa,Standard Specifications,General cold forming,Cold Rolled Steel,,0.12,...,82fc492f-0adc-49bd-8458-69bb1f0f318e,,Hot Rolled,Coils,,,,1520.0,,25000.0
1002,1002,S550MC,EN 10149-2:2013,600-760 MPa,≥550 MPa,Standard Specifications,Cold forming microalloyed steel,Microalloyed Steel,,0.12,...,d3db6f2f-6218-4612-8a63-c26d4ac69658,,Hot Rolled,Coils,3.40,3.6,,1520.0,25000.0,35000.0
1003,1003,DD11,EN 10111:2008,270-410 MPa,≤380 MPa,Standard Specifications,General cold forming,Cold Rolled Steel,,0.12,...,90b02a62-3da9-4648-948c-52627d086a50,,Hot Rolled,Coils,2.40,2.6,600.0,1520.0,20000.0,25000.0


In [18]:
# Dropping columns with more than 60% NaN values

merged_data_drop_cols = [col for col in merged_data.columns if merged_data[col].isna().sum() > 0.7 * len(merged_data)]
merged_data_cleaned = merged_data.drop(columns=merged_data_drop_cols)

In [19]:
merged_data_obj_cols = merged_data_cleaned.select_dtypes(include=['object']).columns
merged_data_num_cols = merged_data_cleaned.select_dtypes(include=['number']).columns

In [20]:
# Imputing missing values in numerical columns with mean and categorical columns with 'Unknown'

merged_data_cleaned[merged_data_num_cols] = merged_data_cleaned[merged_data_num_cols].fillna(merged_data_cleaned[merged_data_num_cols].mean())
merged_data_cleaned[merged_data_obj_cols] = merged_data_cleaned[merged_data_obj_cols].fillna('Unknown')
merged_data_cleaned.drop_duplicates(inplace=True)

### Task B.2

In [21]:
merged_data_cleaned

Unnamed: 0,Grade,Standards,Tensile strength (rm),Yield strength (re or rp0.2),Source_pages,Application,Category,Carbon (c)_max,Manganese (mn)_max,Silicon (si)_max,...,Id,Coating,Finish,Form,Thickness_min,Thickness_max,Width_min,Width_max,Weight_min,Weight_max
0,S700MC,EN 10149-2:2013,750-950 MPa,≥700 MPa,Standard Specifications,"Cold forming, automotive, high strength applic...",Microalloyed Steel,0.12,2.10,0.600000,...,8aff426d-b8c0-43aa-ad26-835ef4de6129,Unknown,Oiled,Coils,6.000000,6.000000,600.000000,1520.000000,15000.000000,25000.000000
1,S250GD,EN 10346:2015,330-510 MPa,≥250 MPa,Standard Specifications,Structural galvanized steel,Galvanized Steel,0.25,1.20,0.480295,...,37e624be-b125-464f-85b6-1838530193ef,ZM310,Hot-dip zinc magnesium (+ZM),Slit Coils,1.500000,1.500000,327.000000,327.000000,112685.220709,83115.184535
2,DX51D,EN 10346:2015,270-500 MPa,≤350 MPa,Standard Specifications,Galvanized steel for forming,Galvanized Steel,0.12,0.60,0.480295,...,b8257184-6307-46ab-b06e-d979336d1263,Z100,Hot-dip Galvanized (+Z/+GI),Coils,0.400000,0.400000,1000.000000,1500.000000,112685.220709,83115.184535
3,S235,EN 10025-2:2019,360-510 MPa,≥235 MPa,Standard Specifications,General structural steel,Structural Steel,0.17,1.40,0.400000,...,63140d1f-dda8-40fe-8931-bcaba65d5772,Unknown,Unknown,Round Tubes,1.500000,1.500000,963.121627,1277.542813,53800.000000,53800.000000
4,S235,EN 10025-2:2019,360-510 MPa,≥235 MPa,Standard Specifications,General structural steel,Structural Steel,0.17,1.40,0.400000,...,11cffc57-44be-4d79-bfd5-97482be566d3,Unknown,Unknown,Round Tubes,1.500000,1.500000,963.121627,1277.542813,14500.000000,14500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,DX51D,EN 10346:2015,270-500 MPa,≤350 MPa,Standard Specifications,Galvanized steel for forming,Galvanized Steel,0.12,0.60,0.480295,...,7a965ba4-be4e-4833-a723-1f98ed1f24ac,Z080,Hot-dip Galvanized (+Z/+GI),Coils,0.550000,0.600000,963.121627,1277.542813,112685.220709,83115.184535
1001,DD11,EN 10111:2008,270-410 MPa,≤380 MPa,Standard Specifications,General cold forming,Cold Rolled Steel,0.12,0.60,0.480295,...,82fc492f-0adc-49bd-8458-69bb1f0f318e,Unknown,Hot Rolled,Coils,5.734683,5.947151,963.121627,1520.000000,112685.220709,25000.000000
1002,S550MC,EN 10149-2:2013,600-760 MPa,≥550 MPa,Standard Specifications,Cold forming microalloyed steel,Microalloyed Steel,0.12,1.80,0.500000,...,d3db6f2f-6218-4612-8a63-c26d4ac69658,Unknown,Hot Rolled,Coils,3.400000,3.600000,963.121627,1520.000000,25000.000000,35000.000000
1003,DD11,EN 10111:2008,270-410 MPa,≤380 MPa,Standard Specifications,General cold forming,Cold Rolled Steel,0.12,0.60,0.480295,...,90b02a62-3da9-4648-948c-52627d086a50,Unknown,Hot Rolled,Coils,2.400000,2.600000,600.000000,1520.000000,20000.000000,25000.000000


### Task B.2

- We will be considering the following dimension columns -`Width`, `Thickness` and `Weight`.

In [22]:
# Assigning the dimension columns

dimension_cols = [
    'Width', 'Thickness', 'Weight'
]

In [23]:
# Creating min and max for all dimension columns

for dim_col in dimension_cols:
    min_col = f"{dim_col}_min"
    max_col = f"{dim_col}_max"

    if min_col in merged_data_cleaned.columns and max_col in merged_data_cleaned.columns:
        print(f"Creating mid and range columns for {dim_col}")
        merged_data_cleaned[f"{dim_col}_mid"] = merged_data_cleaned[[min_col, max_col]].mean(axis=1)
        merged_data_cleaned[f"{dim_col}_range"] = merged_data_cleaned[max_col] - merged_data_cleaned[min_col]

    

Creating mid and range columns for Width
Creating mid and range columns for Thickness
Creating mid and range columns for Weight


- For categorical columns, we could only consider `Finish`, `Form` and `Coating` as `surface_type` had a lot value of missing.

In [24]:
categorical_cols = [
    'Finish', 'Form', 'Coating'
]
for col in categorical_cols:
    assert col in merged_data_cleaned.columns

In [25]:
merged_data_cleaned = create_min_max_mid_columns(merged_data_cleaned, ['Tensile strength (rm)', 'Yield strength (re or rp0.2)'])

In [26]:
merged_data_cleaned['Yield strength (re or rp0.2)_mid']

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
1000   NaN
1001   NaN
1002   NaN
1003   NaN
1004   NaN
Name: Yield strength (re or rp0.2)_mid, Length: 1003, dtype: float64

- We will be dropping the `Yield strength (re or rp0.2)_mid` column as it is full of `NaN` values.

In [27]:
merged_data_cleaned.drop(columns=['Yield strength (re or rp0.2)_mid'], inplace=True)

### Task B.3

- We assign the weights for dimension overlap, categorical match and grade similarities as `0.3`,`0.3` and `0.4`, giving a bit higher weightage to the grade as it might be more important.

In [28]:
# Creating aggregated similarity score function. Weights are taken randomly.

def aggregated_similarity(df, dim_cols, cat_cols, grade_col='Grade'):
    dim_overlap_wt = 0.3
    cat_match_wt = 0.3
    grade_sim_wt = 0.4
    
    def calculate_dimensional_overlap(row1, row2, dim_cols):
        overlaps = []
        
        for col in dim_cols:
            min_col = f"{col}_min"
            max_col = f"{col}_max"
            
            if min_col in df.columns and max_col in df.columns:
                range1_min = row1[min_col]
                range1_max = row1[max_col]
                range2_min = row2[min_col]
                range2_max = row2[max_col]
                
                if pd.isna(range1_min) or pd.isna(range1_max) or pd.isna(range2_min) or pd.isna(range2_max):
                    continue
                
                overlap_start = max(range1_min, range2_min)
                overlap_end = min(range1_max, range2_max)
                
                if overlap_start <= overlap_end:
                    overlap_size = overlap_end - overlap_start
                    total_range = max(range1_max, range2_max) - min(range1_min, range2_min)
                    overlap_ratio = overlap_size / total_range if total_range > 0 else 0
                    overlaps.append(overlap_ratio)
        
        return np.mean(overlaps) if overlaps else 0
    
    def calculate_similarity(row_1, row_2, cat_cols):
        matches = []
        for col in cat_cols:
            matches.append(1 if row_1[col] == row_2[col] else 0)

        return np.mean(matches) if matches else 0    

    def calculate_grade_similarity(grade1, grade2):
        if pd.isna(grade1) or pd.isna(grade2):
            return 0
        
        if grade1 == grade2:
            return 1.0
        
        # Extract base grade (e.g., "S235" from "S235JR")
        def extract_base_grade(grade):
            import re
            match = re.match(r'^([A-Z]+\d+)', str(grade))
            return match.group(1) if match else str(grade)
        
        base1 = extract_base_grade(grade1)
        base2 = extract_base_grade(grade2)
        
        if base1 == base2:
            return 0.8  # Same base grade, different suffix
        
        # Check if same family (e.g., both start with 'S')
        family1 = str(grade1)[0] if len(str(grade1)) > 0 else ''
        family2 = str(grade2)[0] if len(str(grade2)) > 0 else ''
        
        if family1 == family2 and family1.isalpha():
            return 0.3  # Same family
        
        return 0
    
    # Calculate similarity matrix
    n_rows = len(df)
    similarity_matrix = np.zeros((n_rows, n_rows))
    
    for i in range(n_rows):
        for j in range(n_rows):
            if i != j:
                row1 = df.iloc[i]
                row2 = df.iloc[j]
               
                is_exact_duplicate = all(
                    row1[col] == row2[col] 
                    for col in ['Grade', 'Category'] + cat_cols + dim_cols
                    if col in df.columns and pd.notna(row1[col]) and pd.notna(row2[col])
                )
                
                if is_exact_duplicate:
                    similarity_matrix[i, j] = 0
                    continue
                
                dim_overlap = calculate_dimensional_overlap(row1, row2, dim_cols)
                cat_matches = calculate_similarity(row1, row2, cat_cols)
                grade_sim = calculate_grade_similarity(row1[grade_col], row2[grade_col])
                
                total_similarity = (
                    dim_overlap * dim_overlap_wt +
                    cat_matches * cat_match_wt +
                    grade_sim * grade_sim_wt
                )
                
                similarity_matrix[i, j] = total_similarity
    
    return similarity_matrix

def find_top3_similar_rfqs(df, similarity_matrix):
    results = []
    
    for i in range(len(df)):
        similarities = similarity_matrix[i, :]
        
        valid_indices = [(j, score) for j, score in enumerate(similarities) 
                        if j != i and score > 0]
        
        top_similar = sorted(valid_indices, key=lambda x: x[1], reverse=True)[:3]
        
        result_row = {
            'original_index': i,
            'original_id': df.iloc[i].get('Id', f'RFQ_{i}'),
            'original_grade': df.iloc[i].get('Grade', 'Unknown'),
            'original_category': df.iloc[i].get('Category', 'Unknown'),
            'original_application': df.iloc[i].get('Application', 'Unknown')
        }
        
        for rank, (similar_idx, score) in enumerate(top_similar, 1):
            result_row[f'similar_{rank}_index'] = similar_idx
            result_row[f'similar_{rank}_id'] = df.iloc[similar_idx].get('Id', f'RFQ_{similar_idx}')
            result_row[f'similar_{rank}_score'] = round(score, 4)
            result_row[f'similar_{rank}_grade'] = df.iloc[similar_idx].get('Grade', 'Unknown')
            result_row[f'similar_{rank}_category'] = df.iloc[similar_idx].get('Category', 'Unknown')
        
        for rank in range(len(top_similar) + 1, 4):
            result_row[f'similar_{rank}_index'] = None
            result_row[f'similar_{rank}_id'] = None
            result_row[f'similar_{rank}_score'] = None
            result_row[f'similar_{rank}_grade'] = None
            result_row[f'similar_{rank}_category'] = None
        
        results.append(result_row)
    
    return pd.DataFrame(results)


In [29]:
def analyze_rfq_similarity(df, dim_cols, cat_cols, grade_col='Grade'):
    print(f"Analyzing similarity for {len(df)} RFQs...")
    
    similarity_matrix = aggregated_similarity(df, dim_cols, cat_cols, grade_col)
    top3_results = find_top3_similar_rfqs(df, similarity_matrix)
    
    print(f"Analysis complete. Found top-3 similar RFQs for each of {len(df)} records.")
    
    return top3_results, similarity_matrix

In [30]:
top_3_full, sm = analyze_rfq_similarity(merged_data_cleaned, dim_cols=dimension_cols, cat_cols=categorical_cols)

Analyzing similarity for 1003 RFQs...
Analysis complete. Found top-3 similar RFQs for each of 1003 records.


In [31]:
top_3_full.head()

Unnamed: 0,original_index,original_id,original_grade,original_category,original_application,similar_1_index,similar_1_id,similar_1_score,similar_1_grade,similar_1_category,similar_2_index,similar_2_id,similar_2_score,similar_2_grade,similar_2_category,similar_3_index,similar_3_id,similar_3_score,similar_3_grade,similar_3_category
0,0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,Microalloyed Steel,"Cold forming, automotive, high strength applic...",101,b2bc544a-219d-4899-9fa4-191e3f475649,0.85,S700MC,Microalloyed Steel,525,59be311a-ec7d-4a7a-8881-0acec6e90f15,0.7816,S700MC,Microalloyed Steel,878,92727d04-a1fa-4f21-88a4-db85458f7f52,0.7816,S700MC,Microalloyed Steel
1,1,37e624be-b125-464f-85b6-1838530193ef,S250GD,Galvanized Steel,Structural galvanized steel,942,a3f38767-02ae-4990-be18-35ca819684a7,0.6,S250GD,Galvanized Steel,969,56c75e72-50ae-4e29-aea4-f6394e1d986b,0.6,S250GD,Galvanized Steel,536,25c46875-dbeb-4ef4-ad33-7aff8384fb2a,0.4,S250GD,Galvanized Steel
2,2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,Galvanized Steel,Galvanized steel for forming,82,63078462-b3a8-463c-8c44-301ec9d0d024,0.8694,DX51D,Galvanized Steel,766,9aabfd4b-06e9-4c02-9c6a-befd8c6f03e0,0.8694,DX51D,Galvanized Steel,889,a81228e6-1d54-44f6-a601-2a596c2793bc,0.8694,DX51D,Galvanized Steel
3,3,63140d1f-dda8-40fe-8931-bcaba65d5772,S235,Structural Steel,General structural steel,29,4d1d8f30-346e-47a5-aed6-06a86e35f6d8,0.82,S235JR,Structural Steel,716,bed7c304-ce6c-4485-8629-574a2dbe32b3,0.82,S235JR,Structural Steel,717,cea2ffdc-7e8e-4697-acc0-f477287b6fb0,0.82,S235JR,Structural Steel
4,4,11cffc57-44be-4d79-bfd5-97482be566d3,S235,Structural Steel,General structural steel,29,4d1d8f30-346e-47a5-aed6-06a86e35f6d8,0.82,S235JR,Structural Steel,716,bed7c304-ce6c-4485-8629-574a2dbe32b3,0.82,S235JR,Structural Steel,717,cea2ffdc-7e8e-4697-acc0-f477287b6fb0,0.82,S235JR,Structural Steel


In [32]:
# Recreating the top3 dataset

top_3_rows_to_add = []

for row in top_3_full.iterrows():
    for k in range(1, 4):
        top_3_rows_to_add.append({
            'rfq_id': row[1]['original_id'],
            'match_id': row[1][f'similar_{k}_id'],
            'similarity_score': row[1][f'similar_{k}_score']
        })

top_3 = pd.DataFrame(top_3_rows_to_add)

In [33]:
top_3.to_csv('top3.csv', index=False)