# Dataset Cleaning

In [7]:
# useful libraries
import json
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 1. Loading the dataset

In [8]:
# Secrets are kept out of the notebook, in ../config/keys.json
# (path is relative to the directory the notebook runs from)
KEYS_PATH = os.path.join("..", "config", "keys.json")

# Parse the JSON document into `keys`
with open(KEYS_PATH, mode="r", encoding="utf-8") as keys_file:
    keys = json.loads(keys_file.read())

# Hugging Face access token; None when the file has no "hf_token" entry
HF_TOKEN = keys.get("hf_token")

In [9]:
# Read the train parquet shard of the weld-quality dataset through the hf:// filesystem
df = pd.read_parquet(
    path="hf://datasets/MoSBAIHI/weld-quality-dataset/data/train-00000-of-00001.parquet",
    engine="fastparquet",
    storage_options=dict(token=HF_TOKEN),  # token read from keys.json
)

In [10]:
# Preview the freshly loaded frame (the rendered output elides the middle rows and columns)
df

Unnamed: 0,Carbon concentration / (weight%),Silicon concentration / (weight%),Manganese concentration / (weight%),Sulphur concentration / (weight%),Phosphorus concentration / (weight%),Nickel concentration / (weight%),Chromium concentration / (weight%),Molybdenum concentration / (weight%),Vanadium concentration / (weight%),Copper concentration / (weight%),...,Charpy temperature / °C,Charpy impact toughness / J,Hardness / kgmm-2,50 % FATT,Primary ferrite in microstructure / %,Ferrite with second phase / %,Acicular ferrite / %,Martensite / %,Ferrite with carbide aggreagate / %,Weld ID
0,0.037,0.30,0.65,0.008,0.012,0,N,N,N,N,...,N,N,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aaw
1,0.037,0.30,0.65,0.008,0.012,0,N,N,N,N,...,-28,100,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aawch
2,0.037,0.30,0.65,0.008,0.012,0,N,N,N,N,...,-38,100,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aht
3,0.037,0.31,1.03,0.007,0.014,0,N,N,N,N,...,N,N,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Baw
4,0.037,0.31,1.03,0.007,0.014,0,N,N,N,N,...,-48,100,N,N,32,28,40,0,0,Evans-Ni/CMn-1990/1991-0Bawch
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,0.100,0.35,0.90,0.008,0.016,0.60,8.6,0.98,0.18,N,...,N,N,N,N,N,N,N,N,N,Birmingham-MAX35
1648,0.088,0.36,0.88,0.008,0.017,0.57,8.4,0.94,0.19,N,...,N,N,N,N,N,N,N,N,N,Birmingham-MAX36
1649,0.090,0.34,0.89,0.008,0.016,0.17,8.2,0.94,0.02,N,...,N,N,N,N,N,N,N,N,N,Birmingham-MAX37
1650,0.092,0.35,0.90,0.008,0.016,0.54,8.4,0.97,0.17,N,...,N,N,N,N,N,N,N,N,N,Birmingham-MAX38


## 2. Dataset Cleaning

### 2.1. General Overview

In [11]:
# Column names, non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 44 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Carbon concentration / (weight%)                       1652 non-null   float64
 1   Silicon concentration / (weight%)                      1652 non-null   float64
 2   Manganese concentration / (weight%)                    1652 non-null   float64
 3   Sulphur concentration / (weight%)                      1652 non-null   object 
 4   Phosphorus concentration / (weight%)                   1652 non-null   object 
 5   Nickel concentration / (weight%)                       1652 non-null   object 
 6   Chromium concentration / (weight%)                     1652 non-null   object 
 7   Molybdenum concentration / (weight%)                   1652 non-null   object 
 8   Vanadium concentration / (weight%)              

In [12]:
#df.describe(include='all').transpose()

### 2.2. Initial Cleaning

The dataset uses the string 'N' to mark missing values, so we replace every 'N' with a null value.

In [13]:
# The raw data marks missing values with the literal string 'N'.
# The replacement is an explicit np.nan rather than None: on pandas releases
# before 1.4 a None `value` is treated as "no value given", and the matches are
# forward-filled from the previous row instead of being blanked out.
df = df.replace('N', np.nan)

In [14]:
#df.describe(include='all').transpose()

We split columns into categories to simplify analysis later

In [15]:
# Thematic groups of column names, keyed by group name.
categories = {
    # Chemical composition: weight-% columns first, then parts-per-million columns
    "composition": [
        "Carbon concentration / (weight%)",
        "Silicon concentration / (weight%)",
        "Manganese concentration / (weight%)",
        "Sulphur concentration / (weight%)",
        "Phosphorus concentration / (weight%)",
        "Nickel concentration / (weight%)",
        "Chromium concentration / (weight%)",
        "Molybdenum concentration / (weight%)",
        "Vanadium concentration / (weight%)",
        "Copper concentration / (weight%)",
        "Cobalt concentration / (weight%)",
        "Tungsten concentration / (weight%)",
        "Oxygen concentration / parts per million by weight",
        "Titanium concentration / parts per million by weight",
        "Nitrogen concentration / parts per million by weight",
        "Aluminium concentration / parts per million by weight",
        "Boron concentration / parts per million by weight",
        "Niobium concentration / parts per million by weight",
        "Tin concentration / parts per million by weight",
        "Arsenic concentration / parts per million by weight",
        "Antimony concentration / parts per million by weight",
    ],
    # Process settings of the weld
    "welding_parameters": [
        "Current / A",
        "Voltage / V",
        "AC or DC",
        "Electrode positive or negative",
        "Heat input / kJmm-1",
        "Interpass temperature / °C",
        "Type of weld;",
        "Post weld heat treatment temperature / °C",
        "Post weld heat treatment time / hours",
    ],
    # Measured mechanical behaviour
    "mechanical_properties": [
        "Yield strength / MPa",
        "Ultimate tensile strength / MPa",
        "Elongation / %",
        "Reduction of Area / %",
        "Charpy temperature / °C",
        "Charpy impact toughness / J",
        "Hardness / kgmm-2",
        "50 % FATT",
    ],
    # Phase fractions (column names are kept exactly as spelled in the dataset)
    "microstructure": [
        "Primary ferrite in microstructure / %",
        "Ferrite with second phase / %",
        "Acicular ferrite / %",
        "Martensite / %",
        "Ferrite with carbide aggreagate / %",
    ],
    # Row identifier
    "identifiers": [
        "Weld ID",
    ],
}


#### 2.2.1. Converting numeric columns to float64

In [16]:
# Columns that hold labels rather than measurements; the numeric conversion skips them
non_numeric = [
    "Weld ID",
    "Type of weld;",
    "AC or DC",
    "Electrode positive or negative",
]

In [17]:
# Parse every measurement column as a number. Columns named in `non_numeric`
# keep their labels and are skipped.
label_cols = set(non_numeric)   # built once instead of once per column
coerced_counts = {}             # column name -> number of values lost to coercion

for col in df.columns:
    if col in label_cols:
        continue

    had_value = df[col].notna()
    as_text = (
        df[col].astype(str)                      # ensure string
        .str.replace(',', '.', regex=False)      # decimal comma -> decimal point
        .str.strip()                             # strip surrounding spaces
    )
    parsed = pd.to_numeric(as_text, errors='coerce')  # unparsable text becomes NaN

    # errors='coerce' hides parse failures, so count the cells that held a
    # value before the conversion and are NaN after it.
    lost = int((had_value & parsed.isna()).sum())
    if lost:
        coerced_counts[col] = lost

    df[col] = parsed

# Show the per-column loss so the coercion is visible to the reader
pd.Series(coerced_counts, name='values coerced to NaN', dtype='int64')


#### 2.2.2. Mapping categorical columns

In [18]:
# Count, number of distinct labels, most frequent label and its frequency for the label columns
df[non_numeric].describe()

Unnamed: 0,Weld ID,Type of weld;,AC or DC,Electrode positive or negative
count,1652,1652,1437,1496
unique,1490,10,2,3
top,Pat-1981-S6/BX400,MMA,DC,+
freq,5,1140,1395,1451


In [19]:
# Integer code for each weld-type label. The code is the label's position in
# the tuple below, which guarantees one distinct code per label
# (MMA=0, ShMA=1, FCA=2, SA=3, TSA=4, SAA=5, GTAA=6, GMAA=7, NGSAW=8, NGGMA=9).
type_weld_map = {
    label: code
    for code, label in enumerate(
        ('MMA', 'ShMA', 'FCA', 'SA', 'TSA', 'SAA', 'GTAA', 'GMAA', 'NGSAW', 'NGGMA')
    )
}

# Current type: alternating = 0, direct = 1
ac_dc_map = {
    "AC" : 0,
    "DC" : 1
}

# Electrode polarity encoded as a signed value
electrode_map = {
    "+" : 1,
    "0" : 0,
    "-" : -1
}

In [20]:
# Encode the three label columns with their lookup tables.
label_maps = {
    'Type of weld;': type_weld_map,
    'AC or DC': ac_dc_map,
    'Electrode positive or negative': electrode_map,
}

# Phase 1: encode into temporaries and validate, without touching df.
# Series.map turns every value that is not a key of the mapping into NaN.
# Unchecked, that would drop unknown labels silently and, if this cell were
# run a second time on the already-encoded columns, blank them out entirely.
encoded_cols = {}
for col, mapping in label_maps.items():
    encoded = df[col].map(mapping)
    unmapped = df.loc[df[col].notna() & encoded.isna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(f"{col!r}: no code defined for {list(unmapped)}")
    encoded_cols[col] = encoded

# Phase 2: every column validated, so commit the encoded values.
for col, encoded in encoded_cols.items():
    df[col] = encoded

## 3. Saving clean dataset

In [33]:
#df.info()

In [32]:
# Destination of the cleaned dataset, built with os.path.join like KEYS_PATH
CLEAN_DATA_PATH = os.path.join("..", "analytics", "data", "clean_weld_quality_dataset.csv")

# to_csv does not create missing folders, so make sure the target directory exists
os.makedirs(os.path.dirname(CLEAN_DATA_PATH), exist_ok=True)

# index=False: the row index is not written to the file
df.to_csv(CLEAN_DATA_PATH, index=False)