# Dataset Cleaning

In [1]:
# useful libraries
import json
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 1. Loading the dataset [[source]](https://www.phase-trans.msm.cam.ac.uk/map/data/materials/welddb-b.html)

In [2]:
# Path to the keys.json file (from /src/cleaning/ to /config).
# The path is relative, so it resolves against the kernel's working directory.
KEYS_PATH = os.path.join("..", "..", "config", "keys.json")

# Read the local secrets file when it exists. A missing file is not fatal:
# the token can also be supplied through the HF_TOKEN environment variable.
try:
    with open(KEYS_PATH, "r", encoding="utf-8") as f:
        keys = json.load(f)
except FileNotFoundError:
    keys = {}
    print(f"No keys file at {KEYS_PATH}; falling back to the HF_TOKEN environment variable")

# The file takes precedence; the environment variable is only a fallback.
# HF_TOKEN stays None when neither source provides a token.
HF_TOKEN = keys.get("hf_token") or os.environ.get("HF_TOKEN")

In [3]:
# Read the dataset's parquet file through the hf:// URI, passing HF_TOKEN as the access token
df = pd.read_parquet(
    path="hf://datasets/MoSBAIHI/weld-quality-dataset/data/train-00000-of-00001.parquet",
    storage_options=dict(token=HF_TOKEN),
    engine="fastparquet",
)

In [4]:
# Report the size of the frame that was just loaded
n_rows, n_cols = df.shape
print(f"Dataset loaded: {n_rows} rows, {n_cols} columns")

Dataset loaded: 1652 rows, 44 columns


## 2. Dataset Cleaning

### 2.1. General Overview

In [5]:
# Summarise column names, non-null counts and dtypes of the frame as loaded
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 44 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Carbon concentration / (weight%)                       1652 non-null   float64
 1   Silicon concentration / (weight%)                      1652 non-null   float64
 2   Manganese concentration / (weight%)                    1652 non-null   float64
 3   Sulphur concentration / (weight%)                      1652 non-null   object 
 4   Phosphorus concentration / (weight%)                   1652 non-null   object 
 5   Nickel concentration / (weight%)                       1652 non-null   object 
 6   Chromium concentration / (weight%)                     1652 non-null   object 
 7   Molybdenum concentration / (weight%)                   1652 non-null   object 
 8   Vanadium concentration / (weight%)              

In [7]:
# Raw dataset header -> snake_case column name (unit kept as a suffix).
# The composition headers follow two fixed patterns, so they are generated from
# the element names; every other header is spelled out verbatim, including the
# trailing ';' and the 'aggreagate' spelling, which are kept as they appear here.
column_name_mapping = {
    # Elements reported in weight percent: (name in the raw header, snake_case stem)
    **{
        f"{element} concentration / (weight%)": f"{stem}_wt_pct"
        for element, stem in (
            ("Carbon", "carbon"),
            ("Silicon", "silicon"),
            ("Manganese", "manganese"),
            ("Sulphur", "sulfur"),
            ("Phosphorus", "phosphorus"),
            ("Nickel", "nickel"),
            ("Chromium", "chromium"),
            ("Molybdenum", "molybdenum"),
            ("Vanadium", "vanadium"),
            ("Copper", "copper"),
            ("Cobalt", "cobalt"),
            ("Tungsten", "tungsten"),
        )
    },
    # Elements reported in parts per million by weight
    **{
        f"{element} concentration / parts per million by weight": f"{element.lower()}_ppm"
        for element in (
            "Oxygen",
            "Titanium",
            "Nitrogen",
            "Aluminium",
            "Boron",
            "Niobium",
            "Tin",
            "Arsenic",
            "Antimony",
        )
    },
    # Welding parameters
    "Current / A": "current_A",
    "Voltage / V": "voltage_V",
    "AC or DC": "current_type",
    "Electrode positive or negative": "electrode_polarity",
    "Heat input / kJmm-1": "heat_input_kJmm",
    "Interpass temperature / °C": "interpass_temp_C",
    "Type of weld;": "weld_type",
    "Post weld heat treatment temperature / °C": "pwht_temp_C",
    "Post weld heat treatment time / hours": "pwht_time_h",
    # Mechanical properties
    "Yield strength / MPa": "yield_strength_MPa",
    "Ultimate tensile strength / MPa": "uts_MPa",
    "Elongation / %": "elongation_pct",
    "Reduction of Area / %": "reduction_area_pct",
    "Charpy temperature / °C": "charpy_temp_C",
    "Charpy impact toughness / J": "charpy_toughness_J",
    "Hardness / kgmm-2": "hardness_kgmm2",
    "50 % FATT": "fatt50_C",
    # Microstructure
    "Primary ferrite in microstructure / %": "primary_ferrite_pct",
    "Ferrite with second phase / %": "ferrite_second_phase_pct",
    "Acicular ferrite / %": "acicular_ferrite_pct",
    "Martensite / %": "martensite_pct",
    "Ferrite with carbide aggreagate / %": "ferrite_carbide_pct",
    # Identifier
    "Weld ID": "weld_id",
}


In [8]:
# Apply the snake_case names. With its default settings `rename` skips mapping keys
# that are not present in the frame, so a header mismatch does not raise here.
df = df.rename(column_name_mapping, axis="columns")

In [9]:
# Thematic grouping of the renamed columns: group name -> list of column names.
# The lists use the names as they exist right after renaming; later cells
# one-hot encode or drop some of them (weld_type, current_type,
# electrode_polarity, weld_id).
categories = dict(
    composition=[
        # weight-percent elements
        "carbon_wt_pct", "silicon_wt_pct", "manganese_wt_pct",
        "sulfur_wt_pct", "phosphorus_wt_pct", "nickel_wt_pct",
        "chromium_wt_pct", "molybdenum_wt_pct", "vanadium_wt_pct",
        "copper_wt_pct", "cobalt_wt_pct", "tungsten_wt_pct",
        # ppm elements
        "oxygen_ppm", "titanium_ppm", "nitrogen_ppm",
        "aluminium_ppm", "boron_ppm", "niobium_ppm",
        "tin_ppm", "arsenic_ppm", "antimony_ppm",
    ],
    welding_parameters=[
        "current_A", "voltage_V", "current_type",
        "electrode_polarity", "heat_input_kJmm", "interpass_temp_C",
        "weld_type", "pwht_temp_C", "pwht_time_h",
    ],
    mechanical_properties=[
        "yield_strength_MPa", "uts_MPa", "elongation_pct",
        "reduction_area_pct", "charpy_temp_C", "charpy_toughness_J",
        "hardness_kgmm2", "fatt50_C",
    ],
    microstructure=[
        "primary_ferrite_pct", "ferrite_second_phase_pct",
        "acicular_ferrite_pct", "martensite_pct", "ferrite_carbide_pct",
    ],
    identifiers=["weld_id"],
)


### 2.2. Initial Cleaning

#### 2.2.1. NaN values

The raw data uses the string 'N' to specify missing values, so we replace every 'N' with a proper missing value.

In [10]:
# Replace the 'N' placeholder with np.nan rather than None. On pandas < 1.4 an
# explicit `value=None` is treated as "no value given", and `replace` falls back
# to forward-filling: each 'N' would silently take the entry from the row above.
# np.nan marks the cell as missing on every pandas version.
df = df.replace('N', np.nan)
print("Replaced 'N' with None for missing values")

Replaced 'N' with None for missing values


In [29]:
#df.describe(include='all').transpose()

The columns were split into categories above (the `categories` dictionary) to simplify later analysis.

#### 2.2.2. Converting numeric columns to float64

In [15]:
#df.info()

In [12]:
# Identifier and categorical columns that the numeric conversion must skip
non_numeric = ["weld_id", "weld_type", "current_type", "electrode_polarity"]

In [13]:
# Convert every column that is not listed in `non_numeric` to a numeric dtype.
# Values are normalised as text first (decimal comma -> dot, surrounding spaces
# removed), then parsed; anything that still cannot be parsed becomes NaN.
non_numeric_cols = set(non_numeric)   # built once instead of on every iteration
coerced_counts = {}                   # column -> number of real values lost to coercion

for col in df.columns:
    if col in non_numeric_cols:
        continue
    raw_values = df[col]
    as_text = (
        raw_values.astype(str)                       # ensure string
        .str.replace(',', '.', regex=False)          # replace commas with dots
        .str.strip()                                 # strip spaces
    )
    converted = pd.to_numeric(as_text, errors='coerce')
    # Count entries that held a value before parsing but are NaN afterwards,
    # so the coercion does not discard data without a trace.
    n_coerced = int((raw_values.notna() & converted.isna()).sum())
    if n_coerced:
        coerced_counts[col] = n_coerced
    df[col] = converted

if coerced_counts:
    print(f"Warning: non-numeric entries coerced to NaN (column: count): {coerced_counts}")


In [14]:
# Status message only. NOTE(review): pd.to_numeric may also return int64 for a column
# holding only whole numbers with no gaps — confirm with df.info() if float64 is required.
print("Converted numeric columns to float64")

Converted numeric columns to float64


#### 2.2.3. Mapping categorical columns

In [33]:
# Option 1: Label Encoding (uncomment if preferred)
# type_weld_map = {
#     'MMA': 0, 'ShMA': 1, 'FCA': 2, 'SA': 3, 
#     'TSA': 4, 'SAA': 5, 'GTAA': 6, 'GMAA': 7, 
#     'NGSAW': 8, 'NGGMA': 9
# }
# ac_dc_map = {'AC': 0, 'DC': 1}
# electrode_map = {'+': 1, '0': 0, '-': -1}
# 
# df['weld_type'] = df['weld_type'].map(type_weld_map)
# df['current_type'] = df['current_type'].map(ac_dc_map)
# df['electrode_polarity'] = df['electrode_polarity'].map(electrode_map)

In [16]:
# Option 2: One-Hot Encoding

def one_hot_encode(frame, column):
    """Replace ``column`` in ``frame`` with integer indicator columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame that may contain ``column``.
    column : str
        Name of the categorical column; it is also used as the prefix of the
        generated indicator columns.

    Returns
    -------
    pd.DataFrame
        A new frame without ``column`` and with the indicator columns appended
        on the right. If ``column`` is not present, ``frame`` is returned
        unchanged, which keeps the cell safe to re-run.
    """
    if column not in frame.columns:
        return frame
    # dummy_na=False: no indicator column is created for missing values
    dummies = pd.get_dummies(frame[column], prefix=column, dummy_na=False, dtype=int)
    print(f"Created {len(dummies.columns)} dummy variables for {column}")
    return pd.concat([frame.drop(columns=column), dummies], axis=1)


# Encode the three categorical columns in a fixed order so the resulting
# column layout is deterministic.
for categorical_column in ("weld_type", "current_type", "electrode_polarity"):
    df = one_hot_encode(df, categorical_column)

Created 10 dummy variables for weld_type
Created 2 dummy variables for current_type
Created 3 dummy variables for electrode_polarity


In [17]:
# Re-inspect column names, non-null counts and dtypes after conversion and encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   carbon_wt_pct             1652 non-null   float64
 1   silicon_wt_pct            1652 non-null   float64
 2   manganese_wt_pct          1652 non-null   float64
 3   sulfur_wt_pct             1641 non-null   float64
 4   phosphorus_wt_pct         1642 non-null   float64
 5   nickel_wt_pct             697 non-null    float64
 6   chromium_wt_pct           784 non-null    float64
 7   molybdenum_wt_pct         791 non-null    float64
 8   vanadium_wt_pct           620 non-null    float64
 9   copper_wt_pct             564 non-null    float64
 10  cobalt_wt_pct             108 non-null    float64
 11  tungsten_wt_pct           63 non-null     float64
 12  oxygen_ppm                1256 non-null   float64
 13  titanium_ppm              865 non-null    float64
 14  nitrogen

## 3. Saving clean dataset

In [18]:
# Drop the identifier column before export. Rebinding with errors="ignore"
# (instead of inplace=True) keeps the cell safe to re-run once the column is gone.
df = df.drop(columns="weld_id", errors="ignore")

In [19]:
# Save data
output_path = "../../data/clean_weld_quality_dataset.csv"
# Create the target folder if it is missing so the export does not fail at the
# very last step (for example on a checkout where the data folder is absent).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Saved encoded dataset to: {output_path}")

Saved encoded dataset to: ../../data/clean_weld_quality_dataset.csv
