In [2]:
import csv
import re  
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

In [4]:
# Column names for the raw weld table, in file order (one name per
# whitespace-separated field); the last field is the weld identifier.
headers = [
    # Chemical composition reported in weight %
    "Carbon concentration (weight%)",
    "Silicon concentration (weight%)",
    "Manganese concentration (weight%)",
    "Sulphur concentration (weight%)",
    "Phosphorus concentration (weight%)",
    "Nickel concentration (weight%)",
    "Chromium concentration (weight%)",
    "Molybdenum concentration (weight%)",
    "Vanadium concentration (weight%)",
    "Copper concentration (weight%)",
    "Cobalt concentration (weight%)",
    "Tungsten concentration (weight%)",
    # Chemical composition reported in ppm
    "Oxygen concentration (ppm)",
    "Titanium concentration (ppm)",
    "Nitrogen concentration (ppm)",
    "Aluminium concentration (ppm)",
    "Boron concentration (ppm)",
    "Niobium concentration (ppm)",
    "Tin concentration (ppm)",
    "Arsenic concentration (ppm)",
    "Antimony concentration (ppm)",
    # Welding parameters
    "Current (A)",
    "Voltage (V)",
    "AC or DC",
    "Electrode positive or negative",
    "Heat input (kJ/mm)",
    "Interpass temperature (deg C)",
    "Type of weld",
    "Post weld heat treatment temperature (deg C)",
    "Post weld heat treatment time (hours)",
    # Mechanical properties
    "Yield strength (MPa)",
    "Ultimate tensile strength (MPa)",
    "Elongation (%)",
    "Reduction of Area (%)",
    "Charpy temperature (deg C)",
    "Charpy impact toughness (J)",
    "Hardness (kg/mm2)",
    "50 % FATT",
    # Microstructure fractions
    "Primary ferrite in microstructure (%)",
    "Ferrite with second phase (%)",
    "Acicular ferrite (%)",
    "Martensite (%)",
    "Ferrite with carbide aggregate (%)",
    # Identifier
    "Weld ID",
]
def extract_data(data_file):
    """Load a whitespace-delimited text file as a list of rows.

    Each line is split on whitespace. A token made only of digits with at
    most one '.' is converted to float; every other token (including signed
    numbers such as '-28' and the 'N' placeholder) is kept as a string.

    Args:
        data_file: Path of the text file to read.

    Returns:
        A list with one list per line of the file; blank lines yield [].
    """
    def _coerce(token):
        # Dropping a single '.' and testing isdigit() accepts plain unsigned
        # decimals only; anything else stays textual.
        if token.replace('.', '', 1).isdigit():
            return float(token)
        return token

    with open(data_file, 'r') as handle:
        # split() with no argument already ignores leading/trailing whitespace.
        return [[_coerce(token) for token in raw_line.split()] for raw_line in handle]
# Parse the raw whitespace-delimited weld table.
data_file = '../1 - InformationsPubliques-Données/welddb.data'
data = extract_data(data_file)

# Save the rows under their column names so the raw table can be loaded with pandas.
with open('../4 - Dataset/Before_Weld_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    # Header line first, then every parsed row, in a single call.
    writer.writerows([headers] + data)

In [5]:
# Load the raw CSV export (Before_Weld_data.csv) into a DataFrame for inspection.
df = pd.read_csv('../4 - Dataset/Before_Weld_data.csv')

In [6]:
# Preview the first rows; at this stage missing measurements still show as the literal 'N'.
df.head()

Unnamed: 0,Carbon concentration (weight%),Silicon concentration (weight%),Manganese concentration (weight%),Sulphur concentration (weight%),Phosphorus concentration (weight%),Nickel concentration (weight%),Chromium concentration (weight%),Molybdenum concentration (weight%),Vanadium concentration (weight%),Copper concentration (weight%),...,Charpy temperature (deg C),Charpy impact toughness (J),Hardness (kg/mm2),50 % FATT,Primary ferrite in microstructure (%),Ferrite with second phase (%),Acicular ferrite (%),Martensite (%),Ferrite with carbide aggregate (%),Weld ID
0,0.037,0.3,0.65,0.008,0.012,0.0,N,N,N,N,...,N,N,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aaw
1,0.037,0.3,0.65,0.008,0.012,0.0,N,N,N,N,...,-28,100.0,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aawch
2,0.037,0.3,0.65,0.008,0.012,0.0,N,N,N,N,...,-38,100.0,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aht
3,0.037,0.31,1.03,0.007,0.014,0.0,N,N,N,N,...,N,N,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Baw
4,0.037,0.31,1.03,0.007,0.014,0.0,N,N,N,N,...,-48,100.0,N,N,32.0,28.0,40.0,0.0,0.0,Evans-Ni/CMn-1990/1991-0Bawch


In [7]:
# (rows, columns) of the raw table.
df.shape

(1652, 44)

### Convert to Numerical

In [9]:
# Integer code for each weld-type abbreviation: the code is the position of
# the abbreviation in this list, so the order below must not change.
weld_type_mapping = {
    abbreviation: code
    for code, abbreviation in enumerate(
        ["MMA", "SA", "FCA", "TSA", "ShMA", "NGSAW", "NGGMA", "GMAA", "GTAA", "SAA"]
    )
}

# Names of the 43 measurement columns, in file order. The trailing weld
# identifier field is deliberately not listed here.
headers = [
    # Chemical composition reported in weight %
    "Carbon concentration (weight%)",
    "Silicon concentration (weight%)",
    "Manganese concentration (weight%)",
    "Sulphur concentration (weight%)",
    "Phosphorus concentration (weight%)",
    "Nickel concentration (weight%)",
    "Chromium concentration (weight%)",
    "Molybdenum concentration (weight%)",
    "Vanadium concentration (weight%)",
    "Copper concentration (weight%)",
    "Cobalt concentration (weight%)",
    "Tungsten concentration (weight%)",
    # Chemical composition reported in ppm
    "Oxygen concentration (ppm)",
    "Titanium concentration (ppm)",
    "Nitrogen concentration (ppm)",
    "Aluminium concentration (ppm)",
    "Boron concentration (ppm)",
    "Niobium concentration (ppm)",
    "Tin concentration (ppm)",
    "Arsenic concentration (ppm)",
    "Antimony concentration (ppm)",
    # Welding parameters
    "Current (A)",
    "Voltage (V)",
    "AC or DC",
    "Electrode positive or negative",
    "Heat input (kJ/mm)",
    "Interpass temperature (deg C)",
    "Type of weld",
    "Post weld heat treatment temperature (deg C)",
    "Post weld heat treatment time (hours)",
    # Mechanical properties
    "Yield strength (MPa)",
    "Ultimate tensile strength (MPa)",
    "Elongation (%)",
    "Reduction of Area (%)",
    "Charpy temperature (deg C)",
    "Charpy impact toughness (J)",
    "Hardness (kg/mm2)",
    "50 % FATT",
    # Microstructure fractions
    "Primary ferrite in microstructure (%)",
    "Ferrite with second phase (%)",
    "Acicular ferrite (%)",
    "Martensite (%)",
    "Ferrite with carbide aggregate (%)",
]

def extract_alloy_and_year(weld_id):
    """Derive an alloy label and a year string from a weld identifier.

    Alloy label, in order of preference:
      1. an explicit addition written as '+<amount><element>' (e.g. '+1Mo'
         gives '1Mo'), for the element symbols listed in the pattern;
      2. otherwise the token sitting between two '-' separators directly
         before a 4-digit group (an optional 'B/' prefix is dropped), unless
         that token is purely numeric or is one of the rejected labels;
      3. otherwise NaN.

    Year: the first 'YYYY/YYYY' or 'YYYY' digit group found anywhere in the
    identifier, returned as the matched string. It is replaced by NaN when
    its first four digits exceed 2023.

    Args:
        weld_id: Weld identifier string.

    Returns:
        Tuple (alloy, year); each element is a string or np.nan.
    """
    # Tokens that can appear in the alloy position but are not alloy labels.
    rejected_labels = ('FelowH', 'Di', 'intpss', 'TR', 'StressRelief', 'MAX39', 'E7016', '0.045A')

    addition = re.search(r'\+([0-9]*\.?[0-9]+(?:Cu|Ni|Mo|Al|Fe|Cr|Zn|Sn|Ti|Mg))', weld_id)
    if addition:
        # An explicit addition always wins over the family token.
        label = addition.group(1)
    else:
        label = np.nan
        family = re.search(r'-(B/)?([A-Za-z0-9/]+)-\d{4}', weld_id)
        if family:
            candidate = family.group(2)
            if candidate not in rejected_labels and not candidate.isdigit():
                label = candidate

    stamp = re.search(r'(\d{4}/\d{4}|\d{4})', weld_id)
    if stamp and int(stamp.group(0)[:4]) <= 2023:
        year = stamp.group(0)
    else:
        # No digit group, or one too large to be a plausible year.
        year = np.nan

    return label, year

def _leading_number(value, missing=-1):
    """Return the leading run of digits of a string as a float.

    Non-string values (already converted numbers, or the -1 missing marker)
    are returned unchanged; strings that do not start with a digit become
    `missing`.
    """
    if not isinstance(value, str):
        return value
    match = re.match(r'(\d+)', value)
    return float(match.group(1)) if match else missing


def extract_data(data_file):
    """Parse the raw weld table into numerically encoded rows.

    Each line is split on whitespace and must contain exactly
    ``len(headers) + 1`` fields (the measurement columns plus the trailing
    weld identifier); any other line, including a blank one, is skipped.
    Rows whose weld type (field 27) is not a key of ``weld_type_mapping``
    are skipped as well.

    Conversions applied to a kept row:
      * unsigned decimal tokens -> float; 'N' -> -1; other tokens unchanged
        (signed numbers such as '-28' therefore stay strings);
      * field 26: a 'low-high' range string -> midpoint of the two bounds;
      * selected fields: '<x' strings -> float(x);
      * fields 14 and 36: strings -> their leading digits as float, or -1;
      * field 27: weld type -> integer code from ``weld_type_mapping``;
      * field 24: '+' -> 1, '-' -> 2, anything else -> 0;
      * field 23: 'AC' -> 1, 'DC' -> 0, anything else unchanged;
      * two values from ``extract_alloy_and_year`` are appended.

    Args:
        data_file: Path of the whitespace-delimited data file.

    Returns:
        List of rows, each of length ``len(headers) + 3``.
    """
    expected_len = len(headers) + 1  # measurement columns + weld identifier
    data = []
    with open(data_file, 'r') as file:
        for line in file:
            row = line.strip().split()

            # Check the field count before any positional access so a blank
            # or truncated line is skipped instead of raising IndexError.
            if len(row) != expected_len:
                continue

            row = [
                float(item) if item.replace('.', '', 1).isdigit() else -1 if item == 'N' else item
                for item in row
            ]

            interpass_temp = row[26]
            if isinstance(interpass_temp, str) and '-' in interpass_temp:
                lower_bound, upper_bound = map(float, interpass_temp.split('-'))
                row[26] = (lower_bound + upper_bound) / 2  # Replace with midpoint

            # Fields that may hold an upper bound written as '<x': keep x.
            for index in [3, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 38]:
                if isinstance(row[index], str) and row[index].startswith('<'):
                    row[index] = float(row[index][1:])

            # Field 14 can carry trailing text after the number; keep the leading digits.
            row[14] = _leading_number(row[14])

            weld_type = row[27]
            if weld_type not in weld_type_mapping:
                continue
            row[27] = weld_type_mapping[weld_type]

            # Field 36 can carry trailing text after the number; keep the leading digits.
            row[36] = _leading_number(row[36])

            # NOTE(review): every value other than '+' / '-' (including the
            # -1 missing marker) collapses to 0 here - confirm this is intended.
            if row[24] == '+':
                row[24] = 1
            elif row[24] == '-':
                row[24] = 2
            else:
                row[24] = 0

            if row[23] == 'AC':
                row[23] = 1  # Replace "AC" with 1
            elif row[23] == 'DC':
                row[23] = 0  # Replace "DC" with 0

            alloy, year = extract_alloy_and_year(str(row[-1]))
            row.append(alloy)  # Alloy Type
            row.append(year)   # Year

            data.append(row)

    return data

# Re-parse the raw table, this time with the numeric encoding applied.
data_file = '../1 - InformationsPubliques-Données/welddb.data'
data = extract_data(data_file)

with open('../4 - Dataset/weld_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    # Header line = measurement columns + identifier + the two derived columns,
    # followed by every encoded row, written in a single call.
    writer.writerows([headers + ["Weld_ID","Alloy Type", "Year"]] + data)

# Reload the encoded CSV so pandas infers the column dtypes.
df = pd.read_csv('../4 - Dataset/weld_data.csv')
print(df.head())



   Carbon concentration (weight%)  Silicon concentration (weight%)  \
0                           0.037                             0.30   
1                           0.037                             0.30   
2                           0.037                             0.30   
3                           0.037                             0.31   
4                           0.037                             0.31   

   Manganese concentration (weight%)  Sulphur concentration (weight%)  \
0                               0.65                            0.008   
1                               0.65                            0.008   
2                               0.65                            0.008   
3                               1.03                            0.007   
4                               1.03                            0.007   

   Phosphorus concentration (weight%)  Nickel concentration (weight%)  \
0                               0.012                             0

### Shuffle

In [43]:
# Shuffle all rows (frac=1 keeps every row). The fixed random_state makes the
# order identical on every run, so anything computed from this frame later is
# reproducible after Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.shape

(1652, 46)

In [44]:
# Preview the shuffled, encoded table; missing measurements now appear as -1.
df.head()

Unnamed: 0,Carbon concentration (weight%),Silicon concentration (weight%),Manganese concentration (weight%),Sulphur concentration (weight%),Phosphorus concentration (weight%),Nickel concentration (weight%),Chromium concentration (weight%),Molybdenum concentration (weight%),Vanadium concentration (weight%),Copper concentration (weight%),...,Hardness (kg/mm2),50 % FATT,Primary ferrite in microstructure (%),Ferrite with second phase (%),Acicular ferrite (%),Martensite (%),Ferrite with carbide aggregate (%),Weld_ID,Alloy Type,Year
0,0.1,0.35,1.54,0.009,0.016,0.06,0.049,0.18,0.057,0.11,...,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,Gar&K-1975-12mm-21aw,,1975.0
1,0.045,0.3,0.99,0.008,0.01,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,Evans-intpss-1978-B300,,1978.0
2,0.073,0.35,0.64,0.005,0.004,0.0,0.0,0.0,0.0005,0.0,...,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,EvansLetter-A+ch1,,
3,0.1,0.42,0.91,0.004,0.008,0.17,8.9,0.98,0.21,0.03,...,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,PantK-1990-bw4.3,,1990.0
4,0.076,0.24,1.41,0.008,0.006,0.0,0.0,1.11,0.0005,0.0,...,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,EvansLetterCo+1Moch2,1Mo,
