# Data Preparation
This notebook defines the target columns and the columns to drop. For each target column, it then creates two splits (labeled and unlabeled) and uploads all of the splits to Hugging Face Datasets, because our goal is to test each target separately.

In [1]:
import sys
assert sys.version_info >= (3, 5)
import sklearn
assert sklearn.__version__ >= "0.20"
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline 
import matplotlib as mpl
import matplotlib.pyplot as plt
# Style options for plots.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998).
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

### Loading data
We load the cleaned data locally. If you don't have the CSV file, please run the notebook `01.dataset_cleaning.ipynb` first.

In [2]:
from sklearn.utils import shuffle

# Read the cleaned CSV and shuffle its rows in a single expression.
# Shuffling (with a fixed seed so the order is reproducible) avoids bias
# when labels are deleted later on.
weld_df = shuffle(
    pd.read_csv("../../data/clean_weld_quality_dataset.csv"),
    random_state=1,
)

Let's take a look at the columns

In [3]:
# Display the column index of the loaded (and shuffled) frame.
weld_df.columns

Index(['carbon_wt_pct', 'silicon_wt_pct', 'manganese_wt_pct', 'sulfur_wt_pct',
       'phosphorus_wt_pct', 'nickel_wt_pct', 'chromium_wt_pct',
       'molybdenum_wt_pct', 'vanadium_wt_pct', 'copper_wt_pct',
       'cobalt_wt_pct', 'tungsten_wt_pct', 'oxygen_ppm', 'titanium_ppm',
       'nitrogen_ppm', 'aluminium_ppm', 'boron_ppm', 'niobium_ppm', 'tin_ppm',
       'arsenic_ppm', 'antimony_ppm', 'current_A', 'voltage_V',
       'heat_input_kJmm', 'interpass_temp_C', 'pwht_temp_C', 'pwht_time_h',
       'yield_strength_MPa', 'uts_MPa', 'elongation_pct', 'reduction_area_pct',
       'charpy_temp_C', 'charpy_toughness_J', 'hardness_kgmm2', 'fatt50_C',
       'primary_ferrite_pct', 'ferrite_second_phase_pct',
       'acicular_ferrite_pct', 'martensite_pct', 'ferrite_carbide_pct',
       'weld_type_FCA', 'weld_type_GMAA', 'weld_type_GTAA', 'weld_type_MMA',
       'weld_type_NGGMA', 'weld_type_NGSAW', 'weld_type_SA', 'weld_type_SAA',
       'weld_type_ShMA', 'weld_type_TSA', 'current_type_AC',

### Defining target cols

In [66]:
# Columns treated as targets; each one later gets its own labeled/unlabeled split.
TARGET_COLS = [
    # Stress at which plastic deformation begins; measures material strength.
    "yield_strength_MPa",
    # Ultimate tensile strength; maximum stress material can withstand before fracture.
    "uts_MPa",
    # Percent elongation; measure of ductility (total strain before fracture).
    "elongation_pct",
    # Percent reduction in cross-sectional area after fracture; another ductility measure.
    "reduction_area_pct",
    # Test temperature for Charpy impact test; defines testing condition.
    "charpy_temp_C",
    # Charpy impact energy absorbed; indicates toughness and resistance to brittle fracture.
    "charpy_toughness_J",
]

# Columns dropped from the dataset because they have too many missing values.
TODROP = [
    # Surface hardness; correlates with strength and wear resistance.
    "hardness_kgmm2",
    # 50% Fracture Appearance Transition Temperature; temperature where 50% brittle fracture occurs.
    "fatt50_C",
]


In [None]:

# Remove the columns listed in TODROP. `drop` returns a new frame, so weld_df
# itself is left untouched and this cell can be re-run safely.
new_df = weld_df.drop(columns=TODROP)

### Push dataset to HF

In [6]:
from datasets import Dataset
from huggingface_hub import notebook_login

In [62]:
notebook_login() # use your HF token here, or run `hf auth login` in your terminal

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
# Convert the trimmed frame to a Hugging Face Dataset and upload it.
# The repo id uses the same casing ("MoSBAIHI") as the reads from the Hub in
# this notebook and as the repo_id reported back in the CommitInfo.
hf_dataset = Dataset.from_pandas(new_df)
hf_dataset.push_to_hub("MoSBAIHI/weld-quality-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/MoSBAIHI/weld-quality-dataset/commit/6525bba618ce8699fb1205bf93c1eb15a45dac6e', commit_message='Upload dataset', commit_description='', oid='6525bba618ce8699fb1205bf93c1eb15a45dac6e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/MoSBAIHI/weld-quality-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='MoSBAIHI/weld-quality-dataset'), pr_revision=None, pr_num=None)

### Load Dataset from Hugging Face
If you encounter any errors in the previous cells, that's fine: we reload the dataset from Hugging Face below.

In [64]:
from huggingface_hub import hf_hub_download
import pandas as pd

# Download the parquet file of the dataset repo, then read it into a DataFrame
# from the path that hf_hub_download returns.
path = hf_hub_download(
    repo_type="dataset",
    repo_id="MoSBAIHI/weld-quality-dataset",
    filename="data/train-00000-of-00001.parquet",
)

df = pd.read_parquet(path)

### Target Splits
For each target, let's create two sets (labeled and unlabeled) that we can later use to build our train, validation, and test splits when exploring each target separately.

In [69]:
def name_column(col_name, labeled=True):
    """Build the split name for a target column.

    Args:
        col_name: Name of the target column.
        labeled: Truthy for the labeled split, falsy for the unlabeled one.

    Returns:
        ``"<col_name>_labeled"`` or ``"<col_name>_unlabeled"``.
    """
    suffix = "labeled" if labeled else "unlabeled"
    return f"{col_name}_{suffix}"

# Build, for every target column, one split with the rows where the target is
# present ("labeled") and one with the rows where it is missing ("unlabeled").
dataset_splits = {}
print("Number of total examples:", len(df), "\n")
for target_col in TARGET_COLS:
    # Keep the column name and its values under separate names instead of
    # rebinding the loop variable from a str to a Series.
    target_values = df[target_col]
    print(target_col)
    labeled_mask = target_values.notna()
    num_labeled = labeled_mask.sum()
    num_unlabeled = len(target_values) - num_labeled
    # reset_index(drop=True) gives each split a fresh 0..n-1 index before conversion.
    dataset_splits[name_column(target_col, labeled=True)] = Dataset.from_pandas(
        df[labeled_mask].reset_index(drop=True)
    )
    dataset_splits[name_column(target_col, labeled=False)] = Dataset.from_pandas(
        df[~labeled_mask].reset_index(drop=True)
    )
    print(f"Number of labeled examples for {target_col}: {num_labeled}")
    print(f"Number of unlabeled examples for {target_col}: {num_unlabeled}\n")


Number of total examples: 1652 

yield_strength_MPa
Number of labeled examples for yield_strength_MPa: 780
Number of unlabeled examples for yield_strength_MPa: 872

uts_MPa
Number of labeled examples for uts_MPa: 738
Number of unlabeled examples for uts_MPa: 914

elongation_pct
Number of labeled examples for elongation_pct: 700
Number of unlabeled examples for elongation_pct: 952

reduction_area_pct
Number of labeled examples for reduction_area_pct: 705
Number of unlabeled examples for reduction_area_pct: 947

charpy_temp_C
Number of labeled examples for charpy_temp_C: 879
Number of unlabeled examples for charpy_temp_C: 773

charpy_toughness_J
Number of labeled examples for charpy_toughness_J: 879
Number of unlabeled examples for charpy_toughness_J: 773



### Push Splits to HF

In [44]:
from datasets import DatasetDict

# Bundle all per-target splits into one DatasetDict and upload them together.
# The repo id uses the same casing ("MoSBAIHI") as the reads from the Hub in
# this notebook and as the repo_id reported back in the CommitInfo.
dataset_dict = DatasetDict(dataset_splits)
dataset_dict.push_to_hub("MoSBAIHI/weld-quality-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/MoSBAIHI/weld-quality-dataset/commit/9fa7cf7a4551cdec9efe373e1af092295432e713', commit_message='Upload dataset', commit_description='', oid='9fa7cf7a4551cdec9efe373e1af092295432e713', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/MoSBAIHI/weld-quality-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='MoSBAIHI/weld-quality-dataset'), pr_revision=None, pr_num=None)

In [58]:
# Example: how to load a specific split.
from datasets import load_dataset

# All available split names: the labeled splits first, then the unlabeled ones.
SPLITS = [name_column(col, labeled=True) for col in TARGET_COLS] + [name_column(col, labeled=False) for col in TARGET_COLS]
print(SPLITS)

# Derive the split names from the target column rather than hard-coded
# positions in SPLITS (index 6 is only correct while TARGET_COLS has six entries).
yield_strength_labeled = load_dataset(
    "MoSBAIHI/weld-quality-dataset",
    split=name_column("yield_strength_MPa", labeled=True),
).to_pandas()
yield_strength_unlabeled = load_dataset(
    "MoSBAIHI/weld-quality-dataset",
    split=name_column("yield_strength_MPa", labeled=False),
).to_pandas()

['yield_strength_MPa_labeled', 'uts_MPa_labeled', 'elongation_pct_labeled', 'reduction_area_pct_labeled', 'charpy_temp_C_labeled', 'charpy_toughness_J_labeled', 'yield_strength_MPa_unlabeled', 'uts_MPa_unlabeled', 'elongation_pct_unlabeled', 'reduction_area_pct_unlabeled', 'charpy_temp_C_unlabeled', 'charpy_toughness_J_unlabeled']
