In [1]:
from preamble import *

In [5]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re

In [None]:
# 1. 데이터 로딩 및 전처리
def load_and_preprocess(data_path):
  df = pd.read_csv(data_path)  # 데이터 경로에 맞게 수정

  # DATA_ARRAY 문자열을 숫자 배열로 변환 (수정된 부분)
  def parse_data_array(data_str):
    data_str = re.sub(r'\s+', ' ', data_str).strip()
    
    try:
      return np.fromstring(data_str[1:-1], sep=' ')
    except ValueError:
      print(f"Warning: Could not parse data array: {data_str}")
      return np.array([])

  df['data_array'] = df['data_array'].apply(parse_data_array)

  # PROC_STAT 값 조정 (0: 고장, 1, 2: 정상 -> 0: 고장, 1: 정상)
  df['proc_stat'] = df['proc_stat'].apply(lambda x: 0 if x == 0 else 1)
  return df

# 1.2 특징 추출 함수
def extract_features(data_array):
  x = np.array(data_array)
  # 1초 데이터만 사용
  x = x[:len(x)//2] # 앞 1초 데이터만 사용

  features = []
  # 7가지 통계적 특징 계산
  features.append(np.max(x) - np.min(x))  # Peak to Peak
  features.append(np.mean(x))           # Mean
  # Standard Deviation 계산 전 길이 확인
  if len(x) > 1:
    features.append(np.std(x, ddof=1))      # Standard Deviation
  else:
    features.append(0)  # 또는 np.nan 등 적절한 값

  features.append(np.sqrt(np.mean(x**2)))  # RMS

  # Crest Factor 계산 전 길이 및 RMS 값 확인
  if len(x) > 1 and features[3] != 0 :
    features.append(np.max(np.abs(x)) / features[3])  # Crest Factor
  else:
    features.append(0)

  features.append(np.mean(((x - features[1]) / features[2])**3))  # Skewness
  features.append(np.mean(((x - features[1]) / features[2])**4))  # Kurtosis
  return features

In [7]:
# 1.3 데이터프레임에 특징 추가
def create_feature_df(df):
  feature_list = []
  for _, row in df.iterrows():
    features = extract_features(row['data_array'])
    feature_list.append(features)

  feature_df = pd.DataFrame(feature_list, columns=[f'feature_{i}' for i in range(7)])

  # 채널별 접미사 추가
  channel_features = []
  for channel in df['channel_id'].unique():
    channel_df = feature_df[df['channel_id'] == channel].reset_index(drop=True)
    channel_df.columns = [f"{col}_{channel}" for col in channel_df.columns]
    channel_features.append(channel_df)
  feature_df = pd.concat(channel_features, axis=1)

  final_df = pd.concat([df.reset_index(drop=True), feature_df], axis=1)
  return final_df

In [8]:
data_path = './data/pms_data_decompressed.csv'  # 실제 데이터 경로
df = load_and_preprocess(data_path)
final_df = create_feature_df(df)

  return np.fromstring(data_str[1:-1], sep=' ')
  features.append(np.mean(((x - features[1]) / features[2])**3))  # Skewness
  features.append(np.mean(((x - features[1]) / features[2])**4))  # Kurtosis


In [19]:
# 2.  데이터 분할 (시간순으로)
final_df = final_df.sort_values(by='acq_date')
X = final_df.drop(['motor_id', 'equipment_id', 'center_id', 'channel_id', 'acq_date', 'data_array', 'proc_stat'], axis=1)
y = final_df['proc_stat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [20]:
param_grid = {
  'n_estimators': [100, 200, 300],
  'max_depth': [3, 4, 5],
  'learning_rate': [0.01, 0.1, 0.2],
  'subsample': [0.8, 1.0],
  'colsample_bytree': [0.8, 1.0],
  'gamma': [0, 0.1, 0.2],
  'reg_alpha': [0, 0.01, 0.1],
  'reg_lambda': [0, 0.01, 0.1]
}

In [21]:
# 3. XGBoost 모델 학습 및 튜닝
xgb_model = xgb.XGBClassifier(eval_metric='logloss')  # 경고 메시지 방지
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', verbose=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 2916 candidates, totalling 14580 fits


In [22]:
# 최적 모델 및 파라미터
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.8}


In [23]:
# 4. 모델 평가
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.999202006383949
[[    0    14]
 [    0 17530]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       1.00      1.00      1.00     17530

    accuracy                           1.00     17544
   macro avg       0.50      0.50      0.50     17544
weighted avg       1.00      1.00      1.00     17544



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# 5. 변수 중요도
feature_importance = best_model.feature_importances_
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)

             Feature  Importance
114   feature_2_CHO0        0.07
171   feature_3_CHO8        0.07
164   feature_3_CHO7        0.07
190  feature_1_CHO11        0.07
127   feature_1_CHO2        0.06
..               ...         ...
86   feature_2_CHN12        0.00
88   feature_4_CHN12        0.00
89   feature_5_CHN12        0.00
90   feature_6_CHN12        0.00
237  feature_6_CHO17        0.00

[238 rows x 2 columns]
