In [1]:
pip install pandas numpy matplotlib scikit-learn





In [2]:
import pandas as pd

# Path of the raw CSV, relative to the notebook's working directory;
# edit this if the file lives somewhere else.
raw_csv_path = "../data/ai4i2020.csv"
df = pd.read_csv(filepath_or_buffer=raw_csv_path)

# Preview the first five rows.
df.head(5)


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [3]:
# Structural audit, printed in order: (rows, columns), per-column dtypes,
# then per-column count of missing values.
for report in (df.shape, df.dtypes, df.isna().sum()):
    print(report)

# Count of rows that exactly repeat an earlier row.
duplicate_rows = df.duplicated().sum()
print("Duplicates:", duplicate_rows)


(10000, 14)
UDI                          int64
Product ID                  object
Type                        object
Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
Machine failure              int64
TWF                          int64
HDF                          int64
PWF                          int64
OSF                          int64
RNF                          int64
dtype: object
UDI                        0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64
Duplicates: 0


In [4]:
# Share of each target class, shown as a percentage of all rows.
failure_share = df["Machine failure"].value_counts(normalize=True)
failure_share.mul(100)


Machine failure
0    96.61
1     3.39
Name: proportion, dtype: float64

In [5]:
# Columns removed before modelling: the two identifier columns plus the five
# individual flag columns grouped as `leak_cols` (presumably because they
# would leak the target -- verify against the dataset description).
# NOTE: this cell reassigns `df`, so running it a second time raises KeyError
# because the columns are already gone.
id_cols = ["UDI", "Product ID"]
leak_cols = ["TWF", "HDF", "PWF", "OSF", "RNF"]
columns_to_drop = [*id_cols, *leak_cols]
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure
0,M,298.1,308.6,1551,42.8,0,0
1,L,298.2,308.7,1408,46.3,3,0
2,L,298.1,308.5,1498,49.4,5,0
3,L,298.2,308.6,1433,39.5,7,0
4,L,298.2,308.7,1408,40.0,9,0


In [6]:
# Derived feature: process temperature minus air temperature (both in kelvin).
process_temp = df["Process temperature [K]"]
air_temp = df["Air temperature [K]"]
df["Temp diff [K]"] = process_temp.sub(air_temp)

In [7]:
import numpy as np

# Every numeric column except the target is a candidate for clipping.
num_cols = df.select_dtypes(include=[np.number]).columns.drop("Machine failure")

# Cap each feature at its own 1st and 99th percentiles, computed over all
# rows currently in `df`; values outside that band are pulled to the bound.
for col_name in num_cols:
    lower_bound, upper_bound = np.percentile(df[col_name], [1, 99])
    df[col_name] = df[col_name].clip(lower=lower_bound, upper=upper_bound)


In [8]:
from sklearn.impute import SimpleImputer

# Two imputers built from one template: the first fills gaps with the column
# median, the second with the column's most frequent value.
num_imputer, cat_imputer = (
    SimpleImputer(strategy=strategy) for strategy in ("median", "most_frequent")
)

In [9]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature groups: the single categorical column, and every numeric column
# except the target.
cat_cols = ["Type"]
num_cols = df.select_dtypes(include=[np.number]).columns.drop("Machine failure")

# Wire the imputers defined in the previous cell into the preprocessing, using
# the Pipeline class imported above. Previously the imputers and Pipeline were
# created/imported but never used, so missing values in new data would reach
# the encoder and scaler without being filled.
cat_pipeline = Pipeline([
    ("impute", cat_imputer),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
num_pipeline = Pipeline([
    ("impute", num_imputer),
    ("scale", StandardScaler()),
])

# Same transformer names ("cat", "num") and column assignments as before, so
# the output layout is unchanged; each branch now imputes first.
preprocess = ColumnTransformer([
    ("cat", cat_pipeline, cat_cols),
    ("num", num_pipeline, num_cols),
])

In [10]:
# Write the cleaned frame to the current working directory, omitting the
# row index from the file.
cleaned_csv_path = "ai4i2020_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)


In [14]:
import sklearn

# Print the scikit-learn release the kernel is running.
sklearn_version = sklearn.__version__
print(sklearn_version)


1.6.1
