In [566]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [461]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

features = df_concat.drop(columns=["target"])
categorical_columns = [
    'Equipment_Dam', 
    #'Model.Suffix_Dam',
    #'Workorder_Dam',
    'Equipment_Fill1', 
    'Equipment_Fill2'
]
encoded_features = pd.get_dummies(features, columns=categorical_columns)
# 수치형 열에 대해 정규화 또는 표준화 적용
numeric_columns = [col for col in encoded_features.columns if col not in pd.get_dummies(df_concat[categorical_columns]).columns]
scaler = StandardScaler()  # 또는 StandardScaler() 사용 가능
scaled_numeric_features = scaler.fit_transform(encoded_features[numeric_columns])

# 스케일링된 수치형 데이터를 다시 DataFrame으로 변환
scaled_numeric_df = pd.DataFrame(scaled_numeric_features, columns=numeric_columns)

# 원핫 인코딩된 범주형 데이터와 스케일링된 수치형 데이터를 합침
final_encoded_df = pd.concat([scaled_numeric_df, encoded_features.drop(columns=numeric_columns)], axis=1)

# target 열을 다시 합쳐서 최종 데이터셋 생성
final_df = pd.concat([final_encoded_df, df_concat['target']], axis=1)

In [462]:
encoded_features.head()

Unnamed: 0,CURE SPEED Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage3) Collect Result_Dam,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam,HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam,HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam,HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam,...,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2,HEAD Standby Position X Collect Result_Fill2,HEAD Standby Position Y Collect Result_Fill2,Machine Tact time Collect Result_Fill2,Equipment_Dam_Dam dispenser #1,Equipment_Dam_Dam dispenser #2,Equipment_Fill1_Fill1 dispenser #1,Equipment_Fill1_Fill1 dispenser #2,Equipment_Fill2_Fill2 dispenser #1,Equipment_Fill2_Fill2 dispenser #2
0,70,16,13.2,8.3,13.2,0.92,1271.8,1383.9,1271.8,281.43,...,1324.2,243.5,270,85.0,False,True,False,True,False,True
1,70,10,9.7,4.9,9.7,0.67,377.0,377.5,284.8,284.8,...,427.9,270.0,50,19.7,True,False,True,False,True,False
2,70,10,17.0,4.9,17.1,1.19,377.3,377.3,282.15,282.15,...,427.9,270.0,50,19.4,True,False,True,False,True,False
3,70,10,9.6,3.9,9.6,0.67,377.0,377.5,284.8,284.8,...,427.9,270.0,50,18.6,True,False,True,False,True,False
4,70,16,13.2,8.3,13.2,0.92,1271.8,1383.9,1271.8,281.424,...,1324.2,243.5,270,85.0,False,True,False,True,False,True


In [463]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
df_concat["weight"] = pca.fit_transform(df_concat_weight[weight_columns])

In [561]:
correlation = df_concat_weight.corr()
correlation.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Equipment_Dam_weight,Model.Suffix_Dam_weight,Workorder_Dam_weight,CURE SPEED Collect Result_Dam_weight,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam_weight,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam_weight,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam_weight,HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam_weight,THICKNESS 2 Collect Result_Dam_weight,1st Pressure 1st Pressure Unit Time_AutoClave_weight,2nd Pressure Collect Result_AutoClave_weight,2nd Pressure Unit Time_AutoClave_weight,3rd Pressure Collect Result_AutoClave_weight,3rd Pressure Unit Time_AutoClave_weight,Chamber Temp. Unit Time_AutoClave_weight,DISCHARGED SPEED OF RESIN Collect Result_Fill1_weight,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1_weight,HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1_weight,HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1_weight,HEAD Standby Position X Collect Result_Fill1_weight,PalletID Collect Result_Fill1_weight,CURE SPEED Collect Result_Fill2_weight,CURE STANDBY POSITION Z Collect Result_Fill2_weight,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2_weight,Machine Tact time Collect Result_Fill2_weight
Equipment_Dam_weight,1.0,-0.030421,0.149282,0.05424,0.354328,0.32306,0.236073,0.371363,0.070186,0.379773,0.218811,0.417916,0.116888,0.394487,0.386125,0.322943,0.309445,0.256445,0.137632,0.230888,0.199989,0.09984,0.314142,0.345904,0.331044
Model.Suffix_Dam_weight,-0.030421,1.0,0.117836,0.297678,0.10458,0.003305,-0.012369,0.090489,0.02304,-0.059867,0.027559,-0.025363,0.014496,-0.06049,-0.037677,-0.054014,-0.084449,0.009903,0.145102,0.129622,0.064568,-0.016581,-0.021328,-0.001271,0.032342
Workorder_Dam_weight,0.149282,0.117836,1.0,0.15451,0.349206,0.31758,0.334918,0.366273,0.099003,0.288154,0.249693,0.339902,0.191549,0.308101,0.354706,0.205244,0.242803,0.33201,0.27706,0.303051,0.250572,0.082678,0.159668,0.274902,0.299849
CURE SPEED Collect Result_Dam_weight,0.05424,0.297678,0.15451,1.0,0.359665,0.322975,0.200648,0.382668,0.164293,0.116791,0.198515,0.233502,0.116812,0.120147,0.180238,0.095653,-0.022293,0.291628,0.496853,0.458556,0.224106,0.029363,0.110923,0.232299,0.251827
DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam_weight,0.354328,0.10458,0.349206,0.359665,1.0,0.722652,0.729356,0.720353,0.237922,0.520181,0.498151,0.716956,0.226925,0.567976,0.647675,0.463679,0.613274,0.72784,0.563403,0.698268,0.463397,0.240495,0.502136,0.679442,0.665383
DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam_weight,0.32306,0.003305,0.31758,0.322975,0.722652,1.0,0.685792,0.710082,0.263171,0.482763,0.457773,0.68185,0.187397,0.476438,0.636321,0.465971,0.635696,0.716306,0.68491,0.747678,0.511015,0.158218,0.483785,0.650932,0.631973
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam_weight,0.236073,-0.012369,0.334918,0.200648,0.729356,0.685792,1.0,0.749913,0.188439,0.609116,0.494873,0.791318,0.220549,0.629994,0.729706,0.511364,0.690948,0.830717,0.517909,0.661436,0.451549,0.158206,0.509644,0.661546,0.689379
HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam_weight,0.371363,0.090489,0.366273,0.382668,0.720353,0.710082,0.749913,1.0,0.209985,0.582632,0.453031,0.758747,0.232912,0.595911,0.703068,0.506657,0.595974,0.78113,0.691589,0.785636,0.480932,0.15652,0.47561,0.723019,0.668697
THICKNESS 2 Collect Result_Dam_weight,0.070186,0.02304,0.099003,0.164293,0.237922,0.263171,0.188439,0.209985,1.0,0.117111,0.155489,0.19478,0.195978,0.121639,0.169617,0.09064,0.181434,0.189333,0.350527,0.323509,0.134915,0.027824,0.105109,0.209927,0.21633
1st Pressure 1st Pressure Unit Time_AutoClave_weight,0.379773,-0.059867,0.288154,0.116791,0.520181,0.482763,0.609116,0.582632,0.117111,1.0,0.339731,0.779149,0.204033,0.879126,0.75889,0.642961,0.465545,0.645988,0.256485,0.378638,0.262662,0.213968,0.5276,0.516843,0.538285


In [1027]:
df_concat_weight = df_concat_weight.drop(["Equipment_Fill2_weight", "Equipment_Fill1_weight", "THICKNESS 1 Collect Result_Dam_weight", "HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2_weight", "HEAD Standby Position X Collect Result_Fill2_weight", "HEAD Standby Position Y Collect Result_Fill2_weight"], axis=1)

In [466]:
df_concat_weight = df_concat_weight.drop(["Dispense Volume(Stage3) Collect Result_Dam_weight", "HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam_weight", "Head Purge Position Z Collect Result_Fill1_weight"], axis=1)

In [468]:
df_concat_weight = df_concat_weight.drop(["Head Zero Position Y Collect Result_Dam_weight", "Stage1 Line1 Distance Speed Collect Result_Dam_weight"], axis=1)

In [470]:
df_concat_weight = df_concat_weight.drop(["DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1_weight", "Head Clean Position Z Collect Result_Dam_weight"], axis=1)

In [472]:
df_concat_weight = df_concat_weight.drop(["Stage2 Circle1 Distance Speed Collect Result_Dam_weight", "Stage2 Circle2 Distance Speed Collect Result_Dam_weight"], axis=1)

In [539]:
df_concat_weight = df_concat_weight.drop(["Stage2 Line4 Distance Speed Collect Result_Dam_weight", "Stage3 Circle1 Distance Speed Collect Result_Dam_weight"], axis=1)

In [541]:
df_concat_weight = df_concat_weight.drop(["DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam_weight", "Stage1 Circle2 Distance Speed Collect Result_Dam_weight"], axis=1)

In [558]:
df_concat_weight = df_concat_weight.drop(["HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam_weight", "DISCHARGED SPEED OF RESIN Collect Result_Dam_weight"], axis=1)

In [560]:
df_concat_weight = df_concat_weight.drop(["Stage2 Line3 Distance Speed Collect Result_Dam_weight"], axis=1)

In [1028]:
from sklearn.model_selection import train_test_split
y = final_df["target"]
X = df_concat_weight
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [548]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=100)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.7054794520547946


In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
param_grid = {
    'n_estimators' : range(100, 1000, 100),
    'max_depth' : range(3, 15, 2),
    'min_samples_leaf' : range(1, 15, 2),
    'min_samples_split' : range(2, 15, 2)
}

In [None]:
clf = RandomForestClassifier(random_state=100) 


########################## 문제 3-1 ##########################
########################## 코드 작성 ##########################
#그리드 서치 함수 실행
gs = GridSearchCV(estimator=clf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=5,
                  n_jobs=-1
                  )
# 모델 학습
gs = gs.fit(X_train, y_train)

best_parameters = gs.best_params_ # 높은 성능 파라미터
best_score = gs.best_score_ # 베스트 스코어

############################################################

print(f"Best parameters: {best_parameters}")
print(f"Best score: {best_score}")

Best parameters: {'max_depth': 11, 'min_samples_leaf': 13, 'min_samples_split': 2, 'n_estimators': 200}
Best score: 0.6729647833976358


In [549]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', max_depth = 5, random_state=200, min_samples_leaf=1, min_samples_split=2)
ada = AdaBoostClassifier(estimator=tree, n_estimators=100, learning_rate=0.0001, random_state=100)
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('결정 트리의 훈련 정확도/테스트 정확도 %.3f/%.3f' % (tree_train, tree_test))

결정 트리의 훈련 정확도/테스트 정확도 0.688/0.700


In [550]:
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('에이다 부스트의 훈련 정확도/테스트 정확도 %.3f/%.3f' % (ada_train, ada_test))



에이다 부스트의 훈련 정확도/테스트 정확도 0.699/0.700


In [551]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=100, min_samples_leaf=1, min_samples_split=2)
bag = BaggingClassifier(estimator=tree, n_estimators=100, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=1, random_state=100)
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('배깅의 훈련 정확도/테스트 정확도 %.3f/%.3f' % (bag_train, bag_test))

배깅의 훈련 정확도/테스트 정확도 0.705/0.716


In [552]:
weight_columns = [col for col in df_concat_weight.columns]
final_data = test_data[weight_columns]
y_final_pred = bag.predict(final_data)

In [498]:
submission = pd.read_csv("submission.csv")
submission["target"] = y_final_pred

In [499]:
# target 값이 1이면 AbNormal, 0이면 Normal
submission["target"] = submission["target"].map({1: "AbNormal", 0: "Normal"})

In [500]:
submission.to_csv("sub_submission.csv", index=False)

In [1034]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_test)
X_train = X_train.astype('float32')
y_train = y_train_encoded.astype('float32')
X_valid = X_test.astype('float32')
y_valid = y_valid_encoded.astype('float32')

model = Sequential()
model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))  # 첫 번째 Dense 레이어
model.add(BatchNormalization())  # 배치 정규화
model.add(Dropout(0.5))  # 드롭아웃
model.add(Dense(32, activation='relu'))  # 두 번째 Dense 레이어
model.add(BatchNormalization())  # 배치 정규화
model.add(Dropout(0.5))  # 드롭아웃
model.add(Dense(1, activation='sigmoid'))  # 이진 분류를 위한 출력 레이어
# 모델 컴파일
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 모델 학습
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_valid, y_valid))

# 평가
loss, accuracy = model.evaluate(X_valid, y_valid)
print(f'Accuracy: {accuracy:.4f}')

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.5378 - loss: 0.8032 - val_accuracy: 0.6673 - val_loss: 0.6745
Epoch 2/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6045 - loss: 0.7197 - val_accuracy: 0.6986 - val_loss: 0.6643
Epoch 3/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6293 - loss: 0.6840 - val_accuracy: 0.7006 - val_loss: 0.6517
Epoch 4/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6463 - loss: 0.6594 - val_accuracy: 0.7094 - val_loss: 0.6261
Epoch 5/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6154 - loss: 0.6600 - val_accuracy: 0.7094 - val_loss: 0.5884
Epoch 6/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6392 - loss: 0.6229 - val_accuracy: 0.6986 - val_loss: 0.5771
Epoch 7/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━

In [487]:
y_final_pred = model.predict(final_data)

[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 676us/step


In [488]:
submission = pd.read_csv("submission.csv")
submission["target"] = y_final_pred

In [489]:
# target 값이 0.5보다 크면 AbNormal, 0.5보다 작으면 Normal
submission["target"] = submission["target"].apply(lambda x: "AbNormal" if x > 0.5 else "Normal")

In [490]:
submission.to_csv("sub_submission.csv", index=False)