In [2]:
import os
import numpy as np
import pandas as pd
from charset_normalizer import detect
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import joblib

In [3]:
# 파일 인코딩 감지 함수
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
    return detect(raw_data)['encoding']

In [4]:
# 피처 계산 함수
def calculate_features(data):
    features = {}

    # 절댓값 평균
    features['Mean'] = np.mean(np.abs(data))

    # 절댓값 최대값
    features['Max'] = np.max(np.abs(data))

    # RMS (Root Mean Square)
    features['RMS'] = np.sqrt(np.mean(data ** 2))

    # Skewness
    features['Skewness'] = pd.Series(data).skew()

    # Kurtosis
    features['Kurtosis'] = pd.Series(data).kurt()

    # Crest Factor
    rms = features['RMS']
    features['Crest Factor'] = features['Max'] / rms if rms != 0 else 0

    # Impulse Factor
    mean = features['Mean']
    features['Impulse Factor'] = features['Max'] / mean if mean != 0 else 0

    # Shape Factor
    features['Shape Factor'] = rms / mean if mean != 0 else 0

    return features

In [5]:
# 개별 파일 처리 함수 (파일 전체에 대한 피처 계산)
def process_file_with_overall_features(file_path):
    try:
        encoding = detect_encoding(file_path)
        with open(file_path, 'r', encoding=encoding) as f:
            lines = f.readlines()

        if len(lines) < 10:
            print(f"Skipping file {file_path}: Insufficient header rows")
            return None

        label_info_line = lines[3].strip().split(',')
        if len(label_info_line) < 2:
            print(f"Skipping file {file_path}: Malformed label line")
            return None
        label_info = label_info_line[1]

        label_mapping = {
            "00": 0,  # 정상
            "01": 1,  # 베어링 불량
            "02": 2,  # 회전체 불평형
            "03": 3,  # 축 정렬 불량
            "04": 4   # 벨트 느슨함
        }

        label = label_mapping.get(label_info)
        if label is None:
            print(f"Skipping file {file_path}: Unknown label {label_info}")
            return None

        # CSV 데이터 읽기
        data = pd.read_csv(file_path, skiprows=9, header=None, usecols=[0, 1, 2, 3], on_bad_lines="skip", encoding=encoding)
        data.columns = ['Time', 'Sensor1', 'Sensor2', 'Sensor3']

        # 각 센서별 피처 계산
        overall_features = {'Label': label}
        for sensor in ['Sensor1', 'Sensor2', 'Sensor3']:
            features = calculate_features(data[sensor].values)
            for feature_name, value in features.items():
                overall_features[f'{sensor}_{feature_name}'] = value

        return overall_features

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

In [6]:
# 모든 파일 처리 함수 (XGBoost 및 RandomForest 모델 학습 및 저장)
def train_models(base_folder, xgb_model_output, rf_model_output):
    all_features = []

    for root, dirs, files in os.walk(base_folder):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                print(f"Processing file: {file_path}")
                features = process_file_with_overall_features(file_path)
                if features is not None:
                    all_features.append(features)

    if not all_features:
        print("No valid data found.")
        return

    # 모든 피처를 데이터프레임으로 저장
    combined_df = pd.DataFrame(all_features)

    # 분리: X (피처)와 y (레이블)
    X = combined_df.drop(columns=['Label'])
    y = combined_df['Label']

    # 학습/테스트 데이터 분리
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # XGBoost 모델 학습
    print("Training XGBoost Model...")
    xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)

    # XGBoost 성능 계산
    f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')
    precision_xgb = precision_score(y_test, y_pred_xgb, average='weighted')
    recall_xgb = recall_score(y_test, y_pred_xgb, average='weighted')
    accuracy_xgb = xgb_model.score(X_test, y_test)

    print(f"XGBoost Validation Accuracy: {accuracy_xgb:.2f}")
    print("\nXGBoost Classification Report:\n")
    print(classification_report(y_test, y_pred_xgb))

    print(f"\nXGBoost Precision: {precision_xgb:.2f}")
    print(f"XGBoost Recall: {recall_xgb:.2f}")
    print(f"XGBoost F1-Score: {f1_xgb:.2f}")

    # 모델 저장
    joblib.dump(xgb_model, xgb_model_output)
    print(f"XGBoost Model saved to: {xgb_model_output}")

    # RandomForest 모델 학습
    print("\nTraining RandomForest Model...")
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)

    # RandomForest 성능 계산
    f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
    precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
    recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
    accuracy_rf = rf_model.score(X_test, y_test)

    print(f"RandomForest Validation Accuracy: {accuracy_rf:.2f}")
    print("\nRandomForest Classification Report:\n")
    print(classification_report(y_test, y_pred_rf))

    print(f"\nRandomForest Precision: {precision_rf:.2f}")
    print(f"RandomForest Recall: {recall_rf:.2f}")
    print(f"RandomForest F1-Score: {f1_rf:.2f}")

    # 모델 저장
    joblib.dump(rf_model, rf_model_output)
    print(f"RandomForest Model saved to: {rf_model_output}")


In [12]:
# 실행
base_folder = r"C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__"
xgb_model_output = r"C:\Users\KIM\Desktop\bigdataproject\bigdataproject\models\xgb_model.joblib"
rf_model_output = r"C:\Users\KIM\Desktop\bigdataproject\bigdataproject\models\xgb_model.joblib"
train_models(base_folder, xgb_model_output, rf_model_output)

Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_224906_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_232806_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_232807_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_233005_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_233006_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_233007_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210120_050305_002.csv
Processing file: C:\Users\KIM\Desktop\big

Parameters: { "use_label_encoder" } are not used.



XGBoost Validation Accuracy: 0.99

XGBoost Classification Report:

              precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.95      0.98        22
           2       1.00      0.96      0.98        24
           3       1.00      1.00      1.00        23
           4       1.00      1.00      1.00        20

    accuracy                           0.99       160
   macro avg       0.99      0.98      0.99       160
weighted avg       0.99      0.99      0.99       160


XGBoost Precision: 0.99
XGBoost Recall: 0.99
XGBoost F1-Score: 0.99
XGBoost Model saved to: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\models\xgb_model.joblib

Training RandomForest Model...
RandomForest Validation Accuracy: 0.98

RandomForest Classification Report:

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        71
           1       1.00      0.86      0.93        22
    

In [11]:
import xgboost as xgb
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer  # 예시로 사용할 데이터셋

# 예시 데이터셋 로드
data = load_breast_cancer()
X = data.data
y = data.target

# 학습 데이터와 테스트 데이터로 분리
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost 모델 학습
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train, y_train)

# 모델 저장 경로
xgb_model_output = r"C:\Users\KIM\Desktop\bigdataproject\bigdataproject\models\xgb_model.joblib"
os.makedirs(os.path.dirname(xgb_model_output), exist_ok=True)

# 모델 저장
joblib.dump(xgb_model, xgb_model_output)
print(f"XGBoost Model saved to: {xgb_model_output}")


XGBoost Model saved to: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\models\xgb_model.joblib


json 형식으로 저장.

In [None]:
import os
import numpy as np
import pandas as pd
from charset_normalizer import detect
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import json

# 파일 인코딩 감지 함수
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
    return detect(raw_data)['encoding']

# 피처 계산 함수
def calculate_features(data):
    features = {}

    # 절댓값 평균
    features['Mean'] = np.mean(np.abs(data))

    # 절댓값 최대값
    features['Max'] = np.max(np.abs(data))

    # RMS (Root Mean Square)
    features['RMS'] = np.sqrt(np.mean(data ** 2))

    # Skewness
    features['Skewness'] = pd.Series(data).skew()

    # Kurtosis
    features['Kurtosis'] = pd.Series(data).kurt()

    # Crest Factor
    rms = features['RMS']
    features['Crest Factor'] = features['Max'] / rms if rms != 0 else 0

    # Impulse Factor
    mean = features['Mean']
    features['Impulse Factor'] = features['Max'] / mean if mean != 0 else 0

    # Shape Factor
    features['Shape Factor'] = rms / mean if mean != 0 else 0

    return features

# 개별 파일 처리 함수 (파일 전체에 대한 피처 계산)
def process_file_with_overall_features(file_path):
    try:
        encoding = detect_encoding(file_path)
        with open(file_path, 'r', encoding=encoding) as f:
            lines = f.readlines()

        if len(lines) < 10:
            print(f"Skipping file {file_path}: Insufficient header rows")
            return None

        label_info_line = lines[3].strip().split(',')
        if len(label_info_line) < 2:
            print(f"Skipping file {file_path}: Malformed label line")
            return None
        label_info = label_info_line[1]

        label_mapping = {
            "00": 0,  # 정상
            "01": 1,  # 베어링 불량
            "02": 2,  # 회전체 불평형
            "03": 3,  # 축 정렬 불량
            "04": 4   # 벨트 느슨함
        }

        label = label_mapping.get(label_info)
        if label is None:
            print(f"Skipping file {file_path}: Unknown label {label_info}")
            return None

        # CSV 데이터 읽기
        data = pd.read_csv(file_path, skiprows=9, header=None, usecols=[0, 1, 2, 3], on_bad_lines="skip", encoding=encoding)
        data.columns = ['Time', 'Sensor1', 'Sensor2', 'Sensor3']

        # 각 센서별 피처 계산
        overall_features = {'Label': label}
        for sensor in ['Sensor1', 'Sensor2', 'Sensor3']:
            features = calculate_features(data[sensor].values)
            for feature_name, value in features.items():
                overall_features[f'{sensor}_{feature_name}'] = value

        return overall_features

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

# 모든 파일 처리 함수 (XGBoost 및 RandomForest 모델 학습 및 저장)
def train_models(base_folder, xgb_model_output, rf_model_output):
    all_features = []

    for root, dirs, files in os.walk(base_folder):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                print(f"Processing file: {file_path}")
                features = process_file_with_overall_features(file_path)
                if features is not None:
                    all_features.append(features)

    if not all_features:
        print("No valid data found.")
        return

    # 모든 피처를 데이터프레임으로 저장
    combined_df = pd.DataFrame(all_features)

    # 분리: X (피처)와 y (레이블)
    X = combined_df.drop(columns=['Label'])
    y = combined_df['Label']

    # 학습/테스트 데이터 분리
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # XGBoost 모델 학습
    print("Training XGBoost Model...")
    xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)

    # XGBoost 성능 계산
    f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')
    precision_xgb = precision_score(y_test, y_pred_xgb, average='weighted')
    recall_xgb = recall_score(y_test, y_pred_xgb, average='weighted')
    accuracy_xgb = xgb_model.score(X_test, y_test)

    print(f"XGBoost Validation Accuracy: {accuracy_xgb:.2f}")
    print("\nXGBoost Classification Report:\n")
    print(classification_report(y_test, y_pred_xgb))

    print(f"\nXGBoost Precision: {precision_xgb:.2f}")
    print(f"XGBoost Recall: {recall_xgb:.2f}")
    print(f"XGBoost F1-Score: {f1_xgb:.2f}")

    # XGBoost 모델을 JSON 형식으로 저장
    xgb_model_json = {
        'params': xgb_model.get_params(),
        'booster': xgb_model.get_booster().get_dump()
    }
    with open(xgb_model_output, 'w') as f:
        json.dump(xgb_model_json, f, indent=4)
    print(f"XGBoost Model saved to: {xgb_model_output}")

    # RandomForest 모델 학습
    print("\nTraining RandomForest Model...")
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)

    # RandomForest 성능 계산
    f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
    precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
    recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
    accuracy_rf = rf_model.score(X_test, y_test)

    print(f"RandomForest Validation Accuracy: {accuracy_rf:.2f}")
    print("\nRandomForest Classification Report:\n")
    print(classification_report(y_test, y_pred_rf))

    print(f"\nRandomForest Precision: {precision_rf:.2f}")
    print(f"RandomForest Recall: {recall_rf:.2f}")
    print(f"RandomForest F1-Score: {f1_rf:.2f}")

    # RandomForest 모델을 JSON 형식으로 저장
    rf_model_json = {
        'params': rf_model.get_params(),
        'estimators': []  # 트리들에 대한 정보
    }
    for tree in rf_model.estimators_:
        # 트리의 각 속성 추출
        tree_state = {
            'children_left': tree.tree_.children_left.tolist(),
            'children_right': tree.tree_.children_right.tolist(),
            'feature': tree.tree_.feature.tolist(),
            'threshold': tree.tree_.threshold.tolist(),
            'value': tree.tree_.value.tolist(),
            'impurity': tree.tree_.impurity.tolist()
        }

        rf_model_json['estimators'].append(tree_state)

    with open(rf_model_output, 'w') as f:
        json.dump(rf_model_json, f, indent=4)
    print(f"RandomForest Model saved to: {rf_model_output}")

# 실행
base_folder = r"C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__"
xgb_model_output = r"C:\Users\KIM\Desktop\bigdataproject\bigdataproject\models\xgb_model.json"
rf_model_output = r"C:\Users\KIM\Desktop\bigdataproject\bigdataproject\models\rf_model.json"
train_models(base_folder, xgb_model_output, rf_model_output)


Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_224906_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_232806_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_232807_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_233005_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_233006_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210119_233007_002.csv
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\pycache\__pycache__\STFCB-20200928-0105-0106_20210120_050305_002.csv
Processing file: C:\Users\KIM\Desktop\big

Parameters: { "use_label_encoder" } are not used.



XGBoost Validation Accuracy: 0.99

XGBoost Classification Report:

              precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.95      0.98        22
           2       1.00      0.96      0.98        24
           3       1.00      1.00      1.00        23
           4       1.00      1.00      1.00        20

    accuracy                           0.99       160
   macro avg       0.99      0.98      0.99       160
weighted avg       0.99      0.99      0.99       160


XGBoost Precision: 0.99
XGBoost Recall: 0.99
XGBoost F1-Score: 0.99
XGBoost Model saved to: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\models\xgb_model.json

Training RandomForest Model...
RandomForest Validation Accuracy: 0.98

RandomForest Classification Report:

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        71
           1       1.00      0.86      0.93        22
      