# Dataset Cleaning

In [114]:
from pathlib import Path
import os, json, re
import pandas as pd
import numpy as np

In [115]:
KEYS_PATH = Path("..") / ".." / "config" / "keys.json"
HF_TOKEN = None
if KEYS_PATH.exists():
    with open(KEYS_PATH, "r", encoding="utf-8") as f:
        HF_TOKEN = json.load(f).get("hf_token")

In [116]:
if HF_TOKEN is None:
    raise RuntimeError("Aucun HF_TOKEN trouvé (ni keys.json, ni variable d'env).")

In [117]:
# Lecture parquet depuis Hugging Face
HF_PARQUET = "hf://datasets/MoSBAIHI/weld-quality-dataset/data/train-00000-of-00001.parquet"
df = pd.read_parquet(HF_PARQUET, storage_options={"token": HF_TOKEN})

In [118]:
# Normalise légèrement les entêtes (sans tout casser)
df.columns = df.columns.str.strip()

print("[INFO] Shape initial:", df.shape)
print("[INFO] Colonnes:", df.columns.tolist()[:30], "...")

[INFO] Shape initial: (1652, 44)
[INFO] Colonnes: ['Carbon concentration / (weight%)', 'Silicon concentration / (weight%)', 'Manganese concentration / (weight%)', 'Sulphur concentration / (weight%)', 'Phosphorus concentration / (weight%)', 'Nickel concentration / (weight%)', 'Chromium concentration / (weight%)', 'Molybdenum concentration / (weight%)', 'Vanadium concentration / (weight%)', 'Copper concentration / (weight%)', 'Cobalt concentration / (weight%)', 'Tungsten concentration / (weight%)', 'Oxygen concentration / parts per million by weight', 'Titanium concentration / parts per million by weight', 'Nitrogen concentration / parts per million by weight', 'Aluminium concentration / parts per million by weight', 'Boron concentration / parts per million by weight', 'Niobium concentration / parts per million by weight', 'Tin concentration / parts per million by weight', 'Arsenic concentration / parts per million by weight', 'Antimony concentration / parts per million by weight', 'Curr

In [119]:
# 1) Repérage robuste des colonnes catégorielles (weld type, AC/DC, polarité)
#    Le dataset HF peut avoir d'autres libellés → on match par motifs.
#-------------------------------

def find_col(candidates, patterns):
    """
    candidates: list of column names
    patterns: list of regex (lowercase) à tester
    return: premier nom de colonne qui matche l'un des patterns, sinon None
    """
    low = {c.lower(): c for c in candidates}
    for lc, orig in low.items():
        for p in patterns:
            if re.search(p, lc):
                return orig
    return None

In [120]:
cols = df.columns.tolist()
print(cols)

['Carbon concentration / (weight%)', 'Silicon concentration / (weight%)', 'Manganese concentration / (weight%)', 'Sulphur concentration / (weight%)', 'Phosphorus concentration / (weight%)', 'Nickel concentration / (weight%)', 'Chromium concentration / (weight%)', 'Molybdenum concentration / (weight%)', 'Vanadium concentration / (weight%)', 'Copper concentration / (weight%)', 'Cobalt concentration / (weight%)', 'Tungsten concentration / (weight%)', 'Oxygen concentration / parts per million by weight', 'Titanium concentration / parts per million by weight', 'Nitrogen concentration / parts per million by weight', 'Aluminium concentration / parts per million by weight', 'Boron concentration / parts per million by weight', 'Niobium concentration / parts per million by weight', 'Tin concentration / parts per million by weight', 'Arsenic concentration / parts per million by weight', 'Antimony concentration / parts per million by weight', 'Current / A', 'Voltage / V', 'AC or DC', 'Electrode po

In [121]:
# --- Motifs de détection (regex)
type_patterns = [
    r"type.*weld", r"weld.*type", r"\bprocess\b", r"welding.*process", r"\bweld.*proc"
]
acdc_patterns = [
    r"\bac.*dc\b", r"\bcurrent.*type\b", r"\b(ac|dc)\b", r"polarity.*(ac|dc)"
]
polarity_patterns = [
    r"polarit", r"positive|negative", r"elec.*(pos|neg)", r"dc\+|dc\-|ac\+|ac\-"
]

In [122]:
# --- Détection
col_type  = find_col(cols, type_patterns)
col_acdc  = find_col(cols, acdc_patterns)
col_polar = find_col(cols, polarity_patterns)

# --- Renommage vers des noms standards si besoin
rename_map = {}
if col_type and col_type != 'Type of weld;':
    rename_map[col_type] = 'Type of weld;'
if col_acdc and col_acdc != 'AC or DC':
    rename_map[col_acdc] = 'AC or DC'
if col_polar and col_polar != 'Electrode positive or negative':
    rename_map[col_polar] = 'Electrode positive or negative'

if rename_map:
    df = df.rename(columns=rename_map)
    cols = df.columns.tolist()
    print("[INFO] Renommage appliqué:", rename_map)

In [123]:
print("[INFO] Détection colonnes :",
      f"type={col_type}, ac/dc={col_acdc}, polarité={col_polar}")

[INFO] Détection colonnes : type=Type of weld;, ac/dc=AC or DC, polarité=Electrode positive or negative


In [124]:
# --- Liste des catégorielles qu’on protège
categoricals = [c for c in ['Type of weld;', 'AC or DC', 'Electrode positive or negative'] if c in df.columns]

# --- Parsing numérique (virgule→point) hors catégorielles
object_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
object_numeric_candidates = [c for c in object_cols if c not in categoricals]

if object_numeric_candidates:
    df[object_numeric_candidates] = (df[object_numeric_candidates]
                                     .apply(lambda s: s.astype(str)
                                            .str.replace(',', '.', regex=False)
                                            .str.strip()))
    for c in object_numeric_candidates:
        conv = pd.to_numeric(df[c], errors='coerce')
        if conv.notna().mean() > 0.05:
            df[c] = conv

In [125]:
# --- Imputations S/P si présentes
for col in ['Sulphur concentration / (weight%)','Phosphorus concentration / (weight%)']:
    if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].mean())

In [126]:
# --- Zéro sur éléments d’alliage (si colonnes présentes)
elements_to_zero = [
    "Nickel concentration / (weight%)","Chromium concentration / (weight%)",
    "Molybdenum concentration / (weight%)","Vanadium concentration / (weight%)",
    "Copper concentration / (weight%)","Cobalt concentration / (weight%)",
    "Tungsten concentration / (weight%)",
    "Titanium concentration / parts per million by weight",
    "Aluminium concentration / parts per million by weight",
    "Boron concentration / parts per million by weight",
    "Niobium concentration / parts per million by weight",
    "Tin concentration / parts per million by weight",
    "Arsenic concentration / parts per million by weight",
    "Antimony concentration / parts per million by weight",
]
present_zero_cols = [c for c in elements_to_zero if c in df.columns]
if present_zero_cols:
    df[present_zero_cols] = df[present_zero_cols].fillna(0)

In [127]:
# --- Conversion ppm → wt% (1 wt% = 10000 ppm)
ppm_cols = [
    "Titanium concentration / parts per million by weight",
    "Nitrogen concentration / parts per million by weight",
    "Aluminium concentration / parts per million by weight",
    "Boron concentration / parts per million by weight",
    "Niobium concentration / parts per million by weight",
    "Tin concentration / parts per million by weight",
    "Arsenic concentration / parts per million by weight",
    "Antimony concentration / parts per million by weight",
]
for col in ppm_cols:
    if col in df.columns:
        new_col = col.replace("parts per million by weight", "(wt%)")
        df[new_col] = pd.to_numeric(df[col], errors='coerce') / 1e4

In [128]:
# Standardisation & sauvegarde des colonnes brutes (catégorielles)
for c in ['Type of weld;', 'AC or DC', 'Electrode positive or negative']:
    if c in df.columns:
        df[c + '_raw'] = df[c]  # traçabilité
        df[c] = df[c].astype('string').str.strip().str.upper()

In [129]:
# Mappings & encodages
type_weld_map = {
    # Cambridge-like
    'MMA':0,'SHMA':1,'FCA':2,'SA':3,'TSA':4,'SAA':5,'GTAA':6,'GMAA':7,'NGSAW':7,'NGGMA':8,
    # Variantes usuelles / HF
    'SMAW':0,          # Shielded Metal Arc (≈ MMA)
    'GMAW':7,'MIG':7,  # Metal Inert/Active Gas
    'GTAW':6,'TIG':6,  # Tungsten Inert Gas
    'SAW':3            # Submerged Arc
}
ac_dc_map = {'AC':0, 'DC':1}
electrode_direct_map = {
    '+':1, '-':-1, '0':0, 'POSITIVE':1, 'NEGATIVE':-1,
    'DC+':1, 'DC-':-1, 'AC+':0, 'AC-':0  # AC neutralisé à 0
}

In [130]:
# --- Encodage Type of weld
if 'Type of weld;' in df.columns:
    df['type_weld_enc'] = df['Type of weld;'].map(type_weld_map)
    mask_na = df['type_weld_enc'].isna()
    if mask_na.any():
        tw = df.loc[mask_na, 'Type of weld;']
        df.loc[mask_na & tw.str.contains('SMAW', na=False), 'type_weld_enc'] = 0
        df.loc[mask_na & tw.str.contains('GMAW|MIG|MAG', na=False), 'type_weld_enc'] = 7
        df.loc[mask_na & tw.str.contains('GTAW|TIG', na=False), 'type_weld_enc'] = 6
        df.loc[mask_na & tw.str.contains('SAW|SUBMERGED', na=False), 'type_weld_enc'] = 3

In [131]:
# --- Encodage AC/DC (binaire)
if 'AC or DC' in df.columns:
    tmp = (df['AC or DC'].str.replace(r'\s+', '', regex=True)
                         .str.replace('ALTERNATINGCURRENT','AC', regex=False)
                         .str.replace('DIRECTCURRENT','DC', regex=False))
    tmp = np.where(tmp.str.contains('DC', na=False), 'DC',
          np.where(tmp.str.contains('AC', na=False), 'AC', tmp))
    df['acdc_enc'] = pd.Series(tmp, index=df.index).map(ac_dc_map).astype('Int64')

In [132]:
# --- Encodage polarité (+ fallback + neutralisation si AC)
if 'Electrode positive or negative' in df.columns:
    pol = df['Electrode positive or negative'].astype('string').str.strip()
    pol = pol.str.replace(r'\s+', '', regex=True).str.upper()

    pol_enc = pol.map(electrode_direct_map)

    # Fallback via regex sur présence de + / -
    fallback = pd.Series(
        np.where(pol.str.contains(r'\+', na=False), 1,
        np.where(pol.str.contains(r'-', na=False), -1, np.nan)),
        index=pol.index
    )
    pol_enc = pol_enc.combine_first(fallback)

    # AC ⇒ polarité neutralisée à 0
    if 'acdc_enc' in df.columns:
        pol_enc = pol_enc.mask(df['acdc_enc'] == 0, 0)

    df['polarity_enc'] = pol_enc.astype('Int64')

In [133]:
# Contrôles de couverture & valeurs non mappées
def coverage(col_name):
    if col_name in df.columns:
        miss = df[col_name].isna().sum()
        print(f"[COVER] {col_name}: {len(df)-miss}/{len(df)} encodées ({miss} NaN)")

coverage('type_weld_enc')
coverage('acdc_enc')
coverage('polarity_enc')

def show_unmapped(raw_col, enc_col, n=20):
    if raw_col in df.columns and enc_col in df.columns:
        vals = df.loc[df[enc_col].isna(), raw_col].dropna().unique()
        print(f"\n[UNMAPPED] {raw_col} → {enc_col} (top {n}):", vals[:n])

show_unmapped('Type of weld;', 'type_weld_enc')
show_unmapped('AC or DC', 'acdc_enc')
show_unmapped('Electrode positive or negative', 'polarity_enc')

[COVER] type_weld_enc: 1652/1652 encodées (0 NaN)
[COVER] acdc_enc: 1437/1652 encodées (215 NaN)
[COVER] polarity_enc: 1652/1652 encodées (0 NaN)

[UNMAPPED] Type of weld; → type_weld_enc (top 20): <StringArray>
[]
Length: 0, dtype: string

[UNMAPPED] AC or DC → acdc_enc (top 20): <StringArray>
['N']
Length: 1, dtype: string

[UNMAPPED] Electrode positive or negative → polarity_enc (top 20): <StringArray>
[]
Length: 0, dtype: string


In [134]:
# Export + récapitulatif
OUT_DIR = Path.cwd() / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)
out_path = OUT_DIR / "weld_quality_clean_from_hf.csv"
df.to_csv(out_path, index=False)

print("\n=== RÉCAP ===")
print("Shape final:", df.shape)
print("Top NaN ratio:\n", df.isna().mean().sort_values(ascending=False).head(10))
print(f"\n[OK] Fichier exporté → {out_path}")


=== RÉCAP ===
Shape final: (1652, 58)
Top NaN ratio:
 Ferrite with carbide aggreagate / %      0.946126
Martensite / %                           0.946126
Acicular ferrite / %                     0.945521
Ferrite with second phase / %            0.945521
Primary ferrite in microstructure / %    0.941889
Elongation / %                           0.576271
Reduction of Area / %                    0.573245
Ultimate tensile strength / MPa          0.553269
Yield strength / MPa                     0.527845
Charpy impact toughness / J              0.467918
dtype: float64

[OK] Fichier exporté → C:\Users\Guillaume PORET\PycharmProjects\pythonProject\Central\weld-quality\src\cleaning\outputs\weld_quality_clean_from_hf.csv


In [135]:
df

Unnamed: 0,Carbon concentration / (weight%),Silicon concentration / (weight%),Manganese concentration / (weight%),Sulphur concentration / (weight%),Phosphorus concentration / (weight%),Nickel concentration / (weight%),Chromium concentration / (weight%),Molybdenum concentration / (weight%),Vanadium concentration / (weight%),Copper concentration / (weight%),...,Niobium concentration / (wt%),Tin concentration / (wt%),Arsenic concentration / (wt%),Antimony concentration / (wt%),Type of weld;_raw,AC or DC_raw,Electrode positive or negative_raw,type_weld_enc,acdc_enc,polarity_enc
0,0.037,0.30,0.65,0.008,0.012,0.00,0.0,0.00,0.00,0.0,...,0.00,0.0,0.0,0.0,MMA,DC,+,0,1,1
1,0.037,0.30,0.65,0.008,0.012,0.00,0.0,0.00,0.00,0.0,...,0.00,0.0,0.0,0.0,MMA,DC,+,0,1,1
2,0.037,0.30,0.65,0.008,0.012,0.00,0.0,0.00,0.00,0.0,...,0.00,0.0,0.0,0.0,MMA,DC,+,0,1,1
3,0.037,0.31,1.03,0.007,0.014,0.00,0.0,0.00,0.00,0.0,...,0.00,0.0,0.0,0.0,MMA,DC,+,0,1,1
4,0.037,0.31,1.03,0.007,0.014,0.00,0.0,0.00,0.00,0.0,...,0.00,0.0,0.0,0.0,MMA,DC,+,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,0.100,0.35,0.90,0.008,0.016,0.60,8.6,0.98,0.18,0.0,...,0.00,0.0,0.0,0.0,SA,N,+,3,,0
1648,0.088,0.36,0.88,0.008,0.017,0.57,8.4,0.94,0.19,0.0,...,0.10,0.0,0.0,0.0,SA,N,+,3,,0
1649,0.090,0.34,0.89,0.008,0.016,0.17,8.2,0.94,0.02,0.0,...,0.00,0.0,0.0,0.0,SA,N,+,3,,0
1650,0.092,0.35,0.90,0.008,0.016,0.54,8.4,0.97,0.17,0.0,...,0.05,0.0,0.0,0.0,SA,N,+,3,,0
