In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [6]:
# Load the weld database from a path relative to the notebook's working directory.
df = pd.read_csv('welddatabase/welddb_new.csv')
# Preview the first 10 rows; as the last expression, this is rendered as the cell output.
df.head(10)

Unnamed: 0,Carbon_%,Silicon_%,Manganese_%,Sulphur_%,Phosphorus_%,Nickel_%,Chromium_%,Molybdenum_%,Vanadium_%,Copper_%,...,Weld_Type_GMAA,Weld_Type_GTAA,Weld_Type_MMA,Weld_Type_NGGMA,Weld_Type_NGSAW,Weld_Type_SA,Weld_Type_SAA,Weld_Type_ShMA,Weld_Type_TSA,Heat_Input_J_mm
0,0.037,0.3,0.65,0.008,0.012,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1000.0
1,0.037,0.3,0.65,0.008,0.012,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1000.0
2,0.037,0.3,0.65,0.008,0.012,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1000.0
3,0.037,0.31,1.03,0.007,0.014,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1000.0
4,0.037,0.31,1.03,0.007,0.014,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1000.0
5,0.037,0.31,1.03,0.007,0.014,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1000.0
6,0.044,0.35,1.43,0.007,0.014,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1000.0
7,0.044,0.35,1.43,0.007,0.014,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1000.0
8,0.044,0.35,1.43,0.007,0.014,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1000.0
9,0.045,0.33,1.85,0.007,0.016,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1000.0


In [7]:
# Number of null entries in each column of the loaded dataset.
missing_values = df.isna().sum()

# Display only the columns that contain at least one null.
missing_values.loc[missing_values.gt(0)]

PWHT_Temp_C              13
PWHT_Time_hours          13
Yield_Strength_MPa      872
UTS_MPa                 914
Elongation_%            952
Reduction_Area_%        947
Charpy_Temp_C           773
Charpy_Energy_J         773
Hardness_kg_mm2        1514
FATT_50%               1621
Primary_Ferrite_%      1554
Ferrite_2nd_Phase_%    1562
Acicular_Ferrite_%     1562
Martensite_%           1563
Ferrite_Carbide_%      1563
Power_W                 248
dtype: int64

In [8]:
def missing_percentage(dataframe):
    """Summarise the columns of ``dataframe`` that contain missing values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, indexed by column
        name, with two columns: ``Missing_Count`` (number of nulls) and
        ``Missing_Percent`` (nulls as a percentage of the row count, rounded
        to two decimals). Rows are ordered by ``Missing_Percent``, highest
        first.
    """
    n_rows = len(dataframe)
    null_counts = dataframe.isnull().sum()
    summary = pd.DataFrame(
        {
            'Missing_Count': null_counts,
            'Missing_Percent': (null_counts / n_rows * 100).round(2),
        }
    )
    # Keep only the columns that actually have gaps, then rank them.
    has_missing = summary['Missing_Count'] > 0
    return summary[has_missing].sort_values('Missing_Percent', ascending=False)

# Build the per-column missing-value summary for the full dataset and print it as plain text.
missing_summary = missing_percentage(df)
print(missing_summary)

                     Missing_Count  Missing_Percent
FATT_50%                      1621            98.12
Martensite_%                  1563            94.61
Ferrite_Carbide_%             1563            94.61
Ferrite_2nd_Phase_%           1562            94.55
Acicular_Ferrite_%            1562            94.55
Primary_Ferrite_%             1554            94.07
Hardness_kg_mm2               1514            91.65
Elongation_%                   952            57.63
Reduction_Area_%               947            57.32
UTS_MPa                        914            55.33
Yield_Strength_MPa             872            52.78
Charpy_Temp_C                  773            46.79
Charpy_Energy_J                773            46.79
Power_W                        248            15.01
PWHT_Temp_C                     13             0.79
PWHT_Time_hours                 13             0.79


## Data Quality Assessment and Modeling Strategy

### Analysis of Missing Values

The target properties in this dataset differ widely in completeness (see the missing-value summary above). Based on the share of rows in which each target is actually measured, we apply two distinct modeling approaches:

**Group 1: Supervised Learning with PCA (40-60% data available)**
- Yield Strength, UTS, Elongation, Reduction of Area, Charpy properties
- Sufficient labeled data (700-900 samples) for traditional supervised learning
- Strategy: Remove rows with missing targets, apply PCA for dimensionality reduction, train multiple models with GridSearchCV

**Group 2: Semi-Supervised Learning (< 10% data available)**
- Hardness, FATT, Microstructure phases
- Insufficient labeled data (31-138 samples) for standard supervised learning
- Strategy: Use self-training algorithms to leverage unlabeled data, gradually expand training set with high-confidence predictions


**Why separate approaches?**

1. **Statistical Power**: Group 1 properties have 700+ samples, sufficient for robust model training and validation (80/20 split gives 560+ training samples). Group 2 properties have < 140 samples, which would result in severe overfitting with traditional methods.

2. **Data Loss Impact**: Dropping the rows that lack a given target retains 40-60% of the data for each Group 1 property. For Group 2 properties, the same filtering would discard 90-98% of the data, leaving too few samples for conventional supervised modeling.

3. **PCA Necessity**: With 45+ correlated features (chemical composition, welding parameters), dimensionality reduction prevents overfitting and improves generalization. This is critical for Group 1 where we have enough samples to reliably estimate principal components.

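4. **Semi-Supervised Rationale**: For Group 2, the unlabeled samples contain valuable information about feature distributions. Self-training exploits this by iteratively adding confidently predicted samples to the training set, effectively increasing the labeled dataset size. Because these targets are continuous, "confidence" has to come from a predictive-uncertainty estimate (for example, the spread of predictions across an ensemble) rather than from class probabilities.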

In [9]:
# Destination of the scaled feature table. Defined once so the write and the
# log message below cannot drift apart.
# NOTE(review): despite the "_pca" file name, no PCA is applied in this cell;
# the file holds standardised (not projected) features.
SCALED_DATA_PATH = 'welddatabase/welddb_pca.csv'

# One-hot indicator columns are recognised by their name prefix and are left
# unscaled so they stay 0/1.
categorical_cols = [col for col in df.columns
                    if col.startswith(('Electrode_Polarity_', 'Weld_Type_'))]
all_numeric = df.select_dtypes(include=['float64', 'int64']).columns
continuous_features = [col for col in all_numeric if col not in categorical_cols]

df_continuous = df[continuous_features].copy()
df_categorical = df[categorical_cols].copy()

# Standardise every continuous column to zero mean / unit variance.
# StandardScaler ignores NaNs when fitting and keeps them in the output, so
# the sparsely populated columns pass through with their gaps intact.
# NOTE(review): the scaler is fitted on all rows and on every continuous
# column, presumably including the property columns later used as targets
# (e.g. Yield_Strength_MPa). If a train/test split follows, confirm whether
# it should be fitted on the training rows only.
scaler = StandardScaler()
scaled_continuous = scaler.fit_transform(df_continuous)
# fit_transform returns a bare ndarray; reattach the original row labels so
# the axis=1 concat below aligns row-for-row even if df's index is not the
# default 0..n-1 range (otherwise concat would pad with NaNs / add rows).
df_scaled_continuous = pd.DataFrame(
    scaled_continuous,
    columns=continuous_features,
    index=df_continuous.index,
)

df_pca = pd.concat([df_scaled_continuous, df_categorical], axis=1)
assert len(df_pca) == len(df), "row count changed while recombining scaled and categorical columns"

# Save to CSV
df_pca.to_csv(SCALED_DATA_PATH, index=False)

print(f"Continuous features scaled: {len(continuous_features)}")
print(f"Categorical features (unchanged): {len(categorical_cols)}")
print(f"Total features: {df_pca.shape[1]}")
print(f"Saved to: {SCALED_DATA_PATH}")

Continuous features scaled: 39
Categorical features (unchanged): 13
Total features: 52
Saved to: welddatabase/welddb_pca.csv
