In [21]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, roc_curve, auc
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt

# read excel

In [22]:
# 讀sheet 1欄位名稱
df_columns = pd.read_excel("./MDS_Assignment3_Steelplates.xlsx", header=None)
columns = df_columns[0].tolist()

In [23]:
# 讀sheet 2，並加上欄位名稱
df = pd.read_excel("./MDS_Assignment3_Steelplates.xlsx", sheet_name="Faults", header=0)
df.columns = columns
df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,645,651,2538079,2538108,108,10,30,11397,84,123,...,0.7931,-0.1756,0.2984,1,0,0,0,0,0,0
1,829,835,1553913,1553931,71,8,19,7972,99,125,...,0.6667,-0.1228,0.2150,1,0,0,0,0,0,0
2,853,860,369370,369415,176,13,45,18996,99,126,...,0.8444,-0.1568,0.5212,1,0,0,0,0,0,0
3,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0.9338,-0.1992,1.0000,1,0,0,0,0,0,0
4,430,441,100250,100337,630,20,87,62357,64,127,...,0.8736,-0.2267,0.9874,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1935,249,277,325780,325796,273,54,22,35033,119,141,...,-0.4286,0.0026,0.7254,0,0,0,0,0,0,1
1936,144,175,340581,340598,287,44,24,34599,112,133,...,-0.4516,-0.0582,0.8173,0,0,0,0,0,0,1
1937,145,174,386779,386794,292,40,22,37572,120,140,...,-0.4828,0.0052,0.7079,0,0,0,0,0,0,1
1938,137,170,422497,422528,419,97,47,52715,117,140,...,-0.0606,-0.0171,0.9919,0,0,0,0,0,0,1


# (a) show data summary 

In [24]:
df_summary = df.describe().T
df_summary.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
X_Minimum,1940.0,571.41,520.69,0.0,51.75,435.5,1053.0,1705.0
X_Maximum,1940.0,618.26,497.59,4.0,192.0,468.5,1072.25,1713.0
Y_Minimum,1940.0,1651396.1,1774759.22,6712.0,471779.5,1205078.0,2183491.25,12987661.0
Y_Maximum,1940.0,1651449.94,1774770.9,6724.0,471824.0,1205088.0,2183501.0,12987692.0
Pixels_Areas,1940.0,1894.72,5169.66,2.0,84.0,173.5,822.5,152655.0
X_Perimeter,1940.0,111.9,301.28,2.0,15.0,26.0,84.0,10449.0
Y_Perimeter,1940.0,82.99,426.59,1.0,13.0,25.0,83.0,18152.0
Sum_of_Luminosity,1940.0,206406.01,512408.98,250.0,9519.5,19195.5,83165.0,11591414.0
Minimum_of_Luminosity,1940.0,84.55,32.14,0.0,63.0,90.0,106.0,203.0
Maximum_of_Luminosity,1940.0,130.21,18.69,37.0,124.0,127.0,140.0,253.0


# (b) data problems

In [25]:
# find identical/redundant columns
corr_matrix = df.corr().round(5) 
highly_correlated_cols = []

for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        if abs(corr_matrix.iloc[i, j]) > 0.9:
            highly_correlated_cols.append((corr_matrix.columns[i], corr_matrix.columns[j]))
if len(highly_correlated_cols) > 0:
    print("Highly Correlated Column Pairs:")
    for col1, col2 in highly_correlated_cols:
        print(f"[{col1}, {col2}], corr = {corr_matrix.loc[col1, col2]}")
else:
    print("No highly correlated columns found.")

Highly Correlated Column Pairs:
[X_Minimum, X_Maximum], corr = 0.98831
[Y_Minimum, Y_Maximum], corr = 1.0
[Pixels_Areas, X_Perimeter], corr = 0.96664
[Pixels_Areas, Sum_of_Luminosity], corr = 0.97895
[X_Perimeter, Y_Perimeter], corr = 0.91245
[X_Perimeter, Sum_of_Luminosity], corr = 0.91295
[TypeOfSteel_A300, TypeOfSteel_A400], corr = -1.0


In [26]:
# missing values
missing_values = df.isnull().values.any() 
print(f"missing_values:{missing_values}\n")

missing_values:False



In [27]:
# drop redundant columns
df = df.drop(columns=['Pixels_Areas', 'X_Perimeter', 'Y_Maximum', 'X_Maximum', 'TypeOfSteel_A400'])

# (c) CART - imbalanced data

In [28]:
# combine seven dummy columns into one categorial column, then encode into int labels
df_class = df.iloc[:, -7:]
df['Fault'] = df.iloc[:, -7:].idxmax(axis=1)
df = df.drop(columns = columns[-7:], axis=0)

label_encoder = LabelEncoder()
df['Target'] = label_encoder.fit_transform(df['Fault'])
df

Unnamed: 0,X_Minimum,Y_Minimum,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,Steel_Plate_Thickness,Edges_Index,...,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Fault,Target
0,645,2538079,30,11397,84,123,1687,1,80,0.7647,...,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,Pastry,4
1,829,1553913,19,7972,99,125,1623,1,100,0.9710,...,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.2150,Pastry,4
2,853,369370,45,18996,99,126,1353,0,290,0.7287,...,1.0000,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,Pastry,4
3,1289,498078,260,246930,37,126,1353,0,185,0.0695,...,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0000,Pastry,4
4,430,100250,87,62357,64,127,1387,0,40,0.6200,...,1.0000,1.0,2.7993,1.0414,1.9395,0.8736,-0.2267,0.9874,Pastry,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1935,249,325780,22,35033,119,141,1360,0,40,0.3662,...,0.7273,0.0,2.4362,1.4472,1.2041,-0.4286,0.0026,0.7254,Other_Faults,3
1936,144,340581,24,34599,112,133,1360,0,40,0.2118,...,0.7083,0.0,2.4579,1.4914,1.2305,-0.4516,-0.0582,0.8173,Other_Faults,3
1937,145,386779,22,37572,120,140,1360,0,40,0.2132,...,0.6818,0.0,2.4654,1.4624,1.1761,-0.4828,0.0052,0.7079,Other_Faults,3
1938,137,422497,47,52715,117,140,1360,0,40,0.2015,...,0.6596,0.0,2.6222,1.5185,1.4914,-0.0606,-0.0171,0.9919,Other_Faults,3


In [29]:
# data split 
X = df.iloc[:, :-2]
y = df.iloc[:,-1:]

In [30]:
def cart_model(X,y):
    # model train
    clf = DecisionTreeClassifier(random_state=42)
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # Set hyperparameters
    hyperparameters = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}

    for max_depth in hyperparameters['max_depth']:
        for min_samples_split in hyperparameters['min_samples_split']:  
            
            clf.set_params(max_depth=max_depth, min_samples_split=min_samples_split)

            # Initialize lists to store metric results
            accuracy_scores = []
            f1_scores = []

            # Cross-validation loop
            for train_index, test_index in kf.split(X):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                
                # Calculate and store accuracy, AUC, and F1-score
                accuracy_scores.append(accuracy_score(y_test, y_pred).round(5))
                f1_scores.append(f1_score(y_test, y_pred, average='weighted').round(5))
            
            # # Calculate and print the mean of metrics across folds
            # print(f"Max Depth:{max_depth}, Min samples split:{min_samples_split}")

            # mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
            # print(f"Mean Accuracy: {mean_accuracy:.5f}")

            # mean_f1 = sum(f1_scores) / len(f1_scores)
            # print(f"Mean F1 Score: {mean_f1:.5f}")

            # print("========================================================")
    return accuracy_scores,f1_scores

In [31]:
acc, f1 = cart_model(X,y)
print(f"accuracy scores:{acc}")
print(f"f1 scores:{f1}")

accuracy scores:[0.73196, 0.71134, 0.7732, 0.7268, 0.73711, 0.68041, 0.7268, 0.71134, 0.71134, 0.71649]
f1 scores:[0.73172, 0.71535, 0.77624, 0.73098, 0.7383, 0.67342, 0.72671, 0.71333, 0.72159, 0.71782]


# (d) data undersampling


In [33]:
df_class.sum()

Pastry          157
Z_Scratch       190
K_Scatch        391
Stains           72
Dirtiness        55
Bumps           402
Other_Faults    673
dtype: int64

In [54]:
# Apply undersampling to the dataset
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# (e) CART - balanced data

In [55]:
acc_resampled, f1_resampled = cart_model(X_resampled, y_resampled)
print(f"accuracy scores:{acc_resampled}")
print(f"f1 scores:{f1_resampled}")

accuracy scores:[0.83459, 0.81955, 0.80303, 0.81818, 0.75758, 0.78788, 0.82576, 0.82576, 0.81818, 0.81061]
f1 scores:[0.83186, 0.81398, 0.80922, 0.81862, 0.77101, 0.79415, 0.82746, 0.81312, 0.82512, 0.80215]


# (g) Random Forest

In [56]:
def random_forest_model(X,y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the classifier on the training data
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = rf_classifier.predict(X_test)

    # Evaluate the performance of the classifier
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    return accuracy, conf_matrix, classification_rep

In [57]:
# imbalanced data
accuracy_RF, conf_matrix_RF, classification_rep_RF = random_forest_model(X,y)
print(f"accuracy:{accuracy_RF.round(3)}")
print(classification_rep_RF)

  return fit_method(estimator, *args, **kwargs)


accuracy:0.76
              precision    recall  f1-score   support

           0       0.63      0.67      0.65        69
           1       0.70      0.88      0.78         8
           2       0.97      0.93      0.95        83
           3       0.68      0.73      0.71       145
           4       0.62      0.45      0.52        29
           5       0.85      0.85      0.85        13
           6       0.95      0.85      0.90        41

    accuracy                           0.76       388
   macro avg       0.77      0.76      0.76       388
weighted avg       0.77      0.76      0.76       388



In [58]:
# balanced data
accuracy_RF_resampled, conf_matrix_RF_resampled, classification_rep_RF_resampled = random_forest_model(X_resampled,y_resampled)
print(f"accuracy:{accuracy_RF_resampled.round(3)}")
print(classification_rep_RF_resampled)

  return fit_method(estimator, *args, **kwargs)


accuracy:0.891
              precision    recall  f1-score   support

           0       0.82      0.95      0.88        84
           1       1.00      0.75      0.86        12
           2       0.97      0.99      0.98        88
           3       1.00      0.08      0.14        13
           4       0.76      0.80      0.78        20
           5       0.94      0.94      0.94        16
           6       0.90      0.88      0.89        32

    accuracy                           0.89       265
   macro avg       0.91      0.77      0.78       265
weighted avg       0.90      0.89      0.87       265



# (h) Gradient Boosting Decision Tree

In [59]:
def GBDT_model(X,y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the Gradient Boosting Decision Tree classifier
    gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

    # Train the classifier on the training data
    gb_classifier.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = gb_classifier.predict(X_test)

    # Evaluate the performance of the classifier
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    return accuracy, conf_matrix, classification_rep
    

In [52]:
accuracy_GBDT, conf_matrix_GBDT, classification_rep_GBDT = GBDT_model(X,y)
print(f"accuracy:{accuracy_GBDT.round(3)}")
print(classification_rep_GBDT)

  y = column_or_1d(y, warn=True)


accuracy:0.786
              precision    recall  f1-score   support

           0       0.66      0.74      0.70        69
           1       0.67      0.75      0.71         8
           2       0.95      0.96      0.96        83
           3       0.75      0.74      0.74       145
           4       0.57      0.45      0.50        29
           5       1.00      0.85      0.92        13
           6       0.90      0.90      0.90        41

    accuracy                           0.79       388
   macro avg       0.79      0.77      0.77       388
weighted avg       0.79      0.79      0.78       388



In [53]:
accuracy_GBDT_resampled, conf_matrix_GBDT_resampled, classification_rep_GBDT_resampled = GBDT_model(X_resampled,y_resampled)
print(f"accuracy:{accuracy_GBDT_resampled.round(3)}")
print(classification_rep_GBDT_resampled)

  y = column_or_1d(y, warn=True)


accuracy:0.766
              precision    recall  f1-score   support

           0       0.83      0.67      0.74        15
           1       1.00      0.80      0.89        15
           2       0.75      0.92      0.83        13
           3       0.40      0.57      0.47         7
           4       0.86      0.55      0.67        11
           5       0.71      0.83      0.77         6
           6       0.77      1.00      0.87        10

    accuracy                           0.77        77
   macro avg       0.76      0.76      0.75        77
weighted avg       0.80      0.77      0.77        77

