In [1]:
import sys
assert sys.version_info >= (3, 5)
import sklearn
assert sklearn.__version__ >= "0.20"
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline 
import matplotlib as mpl
import matplotlib.pyplot as plt
# Style options for plots.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998).
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

### Loading data

In [11]:
from sklearn.utils import shuffle

# Read the cleaned weld-quality table and randomise its row order in one step.
# The shuffle (fixed seed, so the order is reproducible) is meant to avoid
# bias when labels are deleted later on.
weld_df = shuffle(
    pd.read_csv("../../data/clean_weld_quality_dataset.csv"),
    random_state=1,
)

In [12]:
# Display the column labels of the loaded frame.
weld_df.columns

Index(['carbon_wt_pct', 'silicon_wt_pct', 'manganese_wt_pct', 'sulfur_wt_pct',
       'phosphorus_wt_pct', 'nickel_wt_pct', 'chromium_wt_pct',
       'molybdenum_wt_pct', 'vanadium_wt_pct', 'copper_wt_pct',
       'cobalt_wt_pct', 'tungsten_wt_pct', 'oxygen_ppm', 'titanium_ppm',
       'nitrogen_ppm', 'aluminium_ppm', 'boron_ppm', 'niobium_ppm', 'tin_ppm',
       'arsenic_ppm', 'antimony_ppm', 'current_A', 'voltage_V',
       'heat_input_kJmm', 'interpass_temp_C', 'pwht_temp_C', 'pwht_time_h',
       'yield_strength_MPa', 'uts_MPa', 'elongation_pct', 'reduction_area_pct',
       'charpy_temp_C', 'charpy_toughness_J', 'hardness_kgmm2', 'fatt50_C',
       'primary_ferrite_pct', 'ferrite_second_phase_pct',
       'acicular_ferrite_pct', 'martensite_pct', 'ferrite_carbide_pct',
       'weld_type_FCA', 'weld_type_GMAA', 'weld_type_GTAA', 'weld_type_MMA',
       'weld_type_NGGMA', 'weld_type_NGSAW', 'weld_type_SA', 'weld_type_SAA',
       'weld_type_ShMA', 'weld_type_TSA', 'current_type_AC',

### Defining target cols

In [13]:
# Columns treated as prediction targets (mechanical-property measurements).
# Everything else in the frame is used as a feature.
TARGET_COLS = [
    # --- Strength ---
    # Stress at which plastic deformation begins.
    "yield_strength_MPa",
    # Ultimate tensile strength: maximum stress withstood before fracture.
    "uts_MPa",
    # --- Ductility ---
    # Percent elongation (total strain before fracture).
    "elongation_pct",
    # Percent reduction in cross-sectional area after fracture.
    "reduction_area_pct",
    # --- Impact behaviour ---
    # Temperature at which the Charpy impact test was run.
    # NOTE(review): this is a test condition rather than a measured property;
    # confirm it is meant to be a target and not a feature.
    "charpy_temp_C",
    # Charpy impact energy absorbed; indicates toughness and resistance to
    # brittle fracture.
    "charpy_toughness_J",
    # --- Other ---
    # Surface hardness; correlates with strength and wear resistance.
    "hardness_kgmm2",
    # 50% Fracture Appearance Transition Temperature: temperature at which
    # 50% brittle fracture occurs.
    "fatt50_C",
]


In [14]:
# Targets are exactly the TARGET_COLS columns; every remaining column is a feature.
y = weld_df.loc[:, TARGET_COLS]
X = weld_df.drop(columns=TARGET_COLS)

In [16]:
# Print dtype and non-null count for each target column to see how sparse the labels are.
y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1652 entries, 161 to 1061
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   yield_strength_MPa  780 non-null    float64
 1   uts_MPa             738 non-null    float64
 2   elongation_pct      700 non-null    float64
 3   reduction_area_pct  705 non-null    float64
 4   charpy_temp_C       879 non-null    float64
 5   charpy_toughness_J  879 non-null    float64
 6   hardness_kgmm2      80 non-null     float64
 7   fatt50_C            31 non-null     float64
dtypes: float64(8)
memory usage: 116.2 KB


In [17]:
# Hardness / kgmm-2 and 50 % FATT are excluded because they are too sparse.
SPARSE_TARGET_COLS = ["hardness_kgmm2", "fatt50_C"]

# Rebuild y from weld_df instead of dropping from y itself: `y = y.drop(...)`
# raises KeyError when this cell is run a second time, because the columns
# are already gone. Deriving from the source frame keeps the cell idempotent.
y = weld_df[TARGET_COLS].drop(columns=SPARSE_TARGET_COLS)

### Splitting data (train/val/test)

In [18]:
# Identify labeled vs. unlabeled samples. A row counts as labeled only when
# every column of y is present; a row missing any target is "unlabeled".
labeled_mask = y.notna().all(axis=1)
unlabeled_mask = ~labeled_mask

X_labeled, y_labeled = X.loc[labeled_mask], y.loc[labeled_mask]
X_unlabeled = X.loc[unlabeled_mask]

print(f"Labeled samples: {len(X_labeled)}")
print(f"Unlabeled samples: {len(X_unlabeled)}")

Labeled samples: 134
Unlabeled samples: 1518


In [None]:

from sklearn.model_selection import train_test_split

# Split only the labeled data: 30 % is held out first, then that held-out
# part is halved into validation and test. Fixed seeds keep it reproducible.
X_train_labeled, X_temp, y_train_labeled, y_temp = train_test_split(
    X_labeled, y_labeled, test_size=0.3, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Enforce the split invariants instead of only printing them: the three parts
# must add up to the labeled set, and features/targets must stay row-aligned.
n_split_total = len(X_train_labeled) + len(X_val) + len(X_test)
assert n_split_total == len(X_labeled), "train/val/test sizes do not add up to the labeled set"
assert X_train_labeled.index.equals(y_train_labeled.index), "train X/y rows are misaligned"
assert X_val.index.equals(y_val.index), "validation X/y rows are misaligned"
assert X_test.index.equals(y_test.index), "test X/y rows are misaligned"

# The header line matches the recorded cell output.
print("=== Dataset Split Summary ===")
print(f"Training set (labeled)  : X={X_train_labeled.shape}, y={y_train_labeled.shape}")
print(f"Validation set           : X={X_val.shape}, y={y_val.shape}")
print(f"Test set                 : X={X_test.shape}, y={y_test.shape}")
print(f"Total samples check      : {n_split_total}")


=== Dataset Split Summary ===
Training set (labeled)  : X=(93, 47), y=(93, 6)
Validation set           : X=(20, 47), y=(20, 6)
Test set                 : X=(21, 47), y=(21, 6)
Total samples check      : 134


In [24]:
# Training pool: labeled training rows first, then every unlabeled row.
X_train = pd.concat([X_train_labeled, X_unlabeled])

# All-NaN target frame for the unlabeled rows, with the same columns as the
# labeled targets, so features and targets stay row-aligned.
# NOTE(review): X_unlabeled holds rows missing *any* target, so rows with
# partial labels lose their known values here. Confirm this is intended.
nan_block = np.full((len(X_unlabeled), y_train_labeled.shape[1]), np.nan)
y_unlabeled = pd.DataFrame(
    nan_block,
    index=X_unlabeled.index,
    columns=y_train_labeled.columns,
)

# Stack targets in the same order as X_train.
y_train = pd.concat([y_train_labeled, y_unlabeled])

print(f"Train (labeled): {len(X_train_labeled)}")
print(f"Train (unlabeled): {len(X_unlabeled)}")
print(f"Val: {len(X_val)}")
print(f"Test: {len(X_test)}")

Train (labeled): 93
Train (unlabeled): 1518
Val: 20
Test: 21


### Saving data splits

In [25]:
import os

save_dir = "../../data/data_splits"
os.makedirs(save_dir, exist_ok=True)

# NaN placeholder targets for the unlabeled training rows.
y_un = pd.DataFrame(
    np.nan, index=X_unlabeled.index, columns=y_train_labeled.columns
)

# File name -> frame, in the order the files are written.
splits_to_save = {
    # labeled training data
    "X_train_labeled.csv": X_train_labeled,
    "y_train_labeled.csv": y_train_labeled,
    # unlabeled training data
    "X_train_unlabeled.csv": X_unlabeled,
    "y_train_unlabeled.csv": y_un,
    # validation and test sets
    "X_val.csv": X_val,
    "y_val.csv": y_val,
    "X_test.csv": X_test,
    "y_test.csv": y_test,
}

# index=False leaves the row index out of the files, so each X/y pair
# matches up by row position only.
for file_name, split_df in splits_to_save.items():
    split_df.to_csv(os.path.join(save_dir, file_name), index=False)

print(f"All dataset splits saved to '{save_dir}' directory.")


All dataset splits saved to '../../data/data_splits' directory.
