In [1]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

# Directory that holds the trained models and the preprocessing objects.
# NOTE: joblib.load unpickles its input, so these files must come from a
# trusted source.
_MODEL_DIR = r"C:\Users\KIM\Desktop\bigdataproject\bigdataproject\final\FinalTrainedModels"

# Load, in this order: the XGBoost model, the RandomForest model, and then the
# scaler and PCA objects that were saved during training.
xgb_model, rf_model, scaler, pca = (
    joblib.load(_MODEL_DIR + "\\" + artifact_name)
    for artifact_name in (
        "xgboost_model_with_pca.joblib",
        "randomforest_model_with_pca.joblib",
        "scaler_model.joblib",
        "pca_model.joblib",
    )
)

# Metadata extraction
def extract_metadata(file_path):
    """Collect the metadata header fields of a measurement CSV file.

    Every line of the file is checked; a line contributes a field when it
    starts with one of the known keys. If a key occurs on several lines, the
    last occurrence wins.

    Args:
        file_path: Path of the CSV file to scan.

    Returns:
        A dict holding whichever of these keys were found:
        'Date', 'Filename', 'Data Label', 'Label No' (each the second
        comma-separated field of its line, as a string) and 'Motor Spec'
        (a list of every field after the first).

    Raises:
        IndexError: If a 'Date', 'Filename', 'Data Label' or 'Label No' line
            contains no comma.
    """
    single_value_keys = ('Date', 'Filename', 'Data Label', 'Label No')
    metadata = {}

    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order
    # mark; with plain 'utf-8' the BOM stays attached to the first line, so
    # the first key would never match startswith() and would be lost silently.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        # Iterate lazily: the file also contains the sensor rows, which do
        # not need to be held in memory just to find the header fields.
        for line in file:
            for key in single_value_keys:
                if line.startswith(key):
                    metadata[key] = line.strip().split(',')[1]
                    break
            else:
                if line.startswith('Motor Spec'):
                    metadata['Motor Spec'] = line.strip().split(',')[1:]

    return metadata

# Sensor data extraction
def extract_sensor_data(file_path):
    """Read the sensor rows of a measurement CSV file into a DataFrame.

    The first 10 lines of the file are skipped and no header row is parsed.
    The parsed table must have exactly five columns, which are renamed to
    'time', 'Sensor1', 'Sensor2', 'Sensor3' and '' (the empty name is
    presumably for a column produced by a trailing comma -- confirm against
    the data files).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame with the five column names described above.
    """
    column_labels = ['time', 'Sensor1', 'Sensor2', 'Sensor3', '']
    frame = pd.read_csv(file_path, header=None, skiprows=10)
    frame.columns = column_labels
    return frame

# Feature computation
def calculate_features(data):
    """Compute eight summary statistics of one signal.

    Args:
        data: Numeric array of samples (the caller in this file passes a
            DataFrame column's ``.values``).

    Returns:
        A dict with these keys, in this order:
        'Mean' (mean of absolute values), 'Max' (maximum absolute value),
        'RMS' (root mean square), 'Skewness' and 'Kurtosis' (pandas
        ``Series.skew`` / ``Series.kurt``), 'Crest Factor' (Max / RMS),
        'Impulse Factor' (Max / Mean) and 'Shape Factor' (RMS / Mean).
        A factor whose divisor equals 0 is reported as 0.
    """
    magnitude = np.abs(data)
    mean_abs = np.mean(magnitude)
    peak = np.max(magnitude)
    rms = np.sqrt(np.mean(data ** 2))

    series = pd.Series(data)

    def ratio(numerator, denominator):
        # Guard against division by zero (e.g. an all-zero signal).
        if denominator != 0:
            return numerator / denominator
        return 0

    return {
        'Mean': mean_abs,
        'Max': peak,
        'RMS': rms,
        'Skewness': series.skew(),
        'Kurtosis': series.kurt(),
        'Crest Factor': ratio(peak, rms),
        'Impulse Factor': ratio(peak, mean_abs),
        'Shape Factor': ratio(rms, mean_abs),
    }

# Per-file data processing
def load_and_process_new_data(file_path, scaler, pca, feature_names=None):
    """Turn one measurement file into a PCA-transformed feature row.

    The file's metadata and sensor table are read, the statistics from
    ``calculate_features`` are computed for 'Sensor1', 'Sensor2' and
    'Sensor3', and the resulting single-row feature table is passed through
    ``scaler.transform`` and then ``pca.transform``.

    Args:
        file_path: Path of the CSV file to process.
        scaler: Fitted object exposing ``transform``.
        pca: Fitted object exposing ``transform``.
        feature_names: Optional sequence (list, tuple, array, Index, ...) of
            column names; when given and non-empty, the feature columns are
            selected and ordered accordingly before scaling. ``None`` or an
            empty sequence keeps the computed order.

    Returns:
        A tuple ``(X_pca, y)``: the output of ``pca.transform`` and a
        one-element Series holding the file's 'Label No' metadata value
        (kept as the string read from the file).

    Raises:
        KeyError: If the file has no 'Label No' metadata line, or if
            ``feature_names`` names a column that was not computed.
    """
    metadata = extract_metadata(file_path)
    sensor_data = extract_sensor_data(file_path)

    # One flat record: the label plus '<sensor>_<statistic>' entries.
    overall_features = {'Label': metadata['Label No']}
    for sensor in ['Sensor1', 'Sensor2', 'Sensor3']:
        features = calculate_features(sensor_data[sensor].values)
        for feature_name, value in features.items():
            overall_features[f'{sensor}_{feature_name}'] = value

    # Placeholder: the period is not yet derived from the motor spec and is
    # fixed to 1; a real computation is still required.
    overall_features['Motor Spec_Period'] = 1

    combined_df = pd.DataFrame([overall_features])

    # Split into features (X) and label (y).
    X = combined_df.drop(columns=['Label'])
    y = combined_df['Label']

    # Explicit None/length check instead of truthiness: `if feature_names:`
    # raises for NumPy arrays and pandas Index objects. list() makes a tuple
    # select several columns rather than being looked up as a single key.
    if feature_names is not None and len(feature_names) > 0:
        X = X[list(feature_names)]

    X_scaled = scaler.transform(X)
    X_pca = pca.transform(X_scaled)

    return X_pca, y

# Process every file in a directory tree
def process_all_files_in_directory(directory_path, scaler, pca, xgb_model, rf_model, feature_names=None):
    """Run both models on every ``.csv`` file below a directory.

    The directory tree is walked with ``os.walk``; each file whose name ends
    with ".csv" is processed with ``load_and_process_new_data`` and then fed
    to ``xgb_model.predict`` and ``rf_model.predict``. Progress and
    predictions are printed as they are produced.

    Args:
        directory_path: Root directory to search.
        scaler: Fitted scaler forwarded to ``load_and_process_new_data``.
        pca: Fitted PCA forwarded to ``load_and_process_new_data``.
        xgb_model: Model exposing ``predict``.
        rf_model: Model exposing ``predict``.
        feature_names: Optional column order forwarded to
            ``load_and_process_new_data``.

    Returns:
        A DataFrame with one row per processed file and the columns
        'Filename', 'XGBoost Prediction', 'RandomForest Prediction',
        'True Label', 'XGBoost Prediction Label' and
        'RandomForest Prediction Label'. The two label columns are 'T' when
        the prediction equals 3 and 'F' for every other value. When no
        ``.csv`` file is found, the DataFrame has these columns and no rows.
    """
    result_columns = [
        'Filename',
        'XGBoost Prediction',
        'RandomForest Prediction',
        'True Label',
    ]
    result = []

    for root, _dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                print(f"Processing file: {file_path}")

                X_new_pca, y_new = load_and_process_new_data(file_path, scaler, pca, feature_names)

                xgb_pred = xgb_model.predict(X_new_pca)
                print(f"XGBoost Prediction for {file}: {xgb_pred}")

                rf_pred = rf_model.predict(X_new_pca)
                print(f"RandomForest Prediction for {file}: {rf_pred}")

                # Each file yields a single row, hence element [0].
                result.append({
                    'Filename': file,
                    'XGBoost Prediction': xgb_pred[0],
                    'RandomForest Prediction': rf_pred[0],
                    'True Label': y_new.values[0]
                })

    # Explicit columns keep the frame well-formed when no file matched;
    # without them the column lookups below raise KeyError on an empty frame.
    result_df = pd.DataFrame(result, columns=result_columns)

    # 'T' only for a prediction of 3; any other value becomes 'F'.
    result_df['XGBoost Prediction Label'] = result_df['XGBoost Prediction'].apply(lambda x: 'T' if x == 3 else 'F')
    result_df['RandomForest Prediction Label'] = result_df['RandomForest Prediction'].apply(lambda x: 'T' if x == 3 else 'F')

    return result_df

# Share of 'T' labels
def calculate_t_ratio(result_df):
    """Print, for each model, the percentage of rows labelled 'T'.

    Args:
        result_df: DataFrame containing the columns
            'XGBoost Prediction Label' and 'RandomForest Prediction Label'.

    Returns:
        None. Two lines are printed, one per model, with the ratio formatted
        to two decimals; a frame without rows reports 0.00%.
    """
    def t_share(label_column):
        # Fraction of rows whose label is 'T'; 0 for a frame without rows.
        hits = int((result_df[label_column] == 'T').sum())
        total = len(result_df)
        if total > 0:
            return hits / total
        return 0

    xgb_t_ratio = t_share('XGBoost Prediction Label')
    rf_t_ratio = t_share('RandomForest Prediction Label')

    print(f"\nXGBoost Model 'T' Ratio: {xgb_t_ratio * 100:.2f}%")
    print(f"RandomForest Model 'T' Ratio: {rf_t_ratio * 100:.2f}%")

# Feature columns, in the order the feature table is rearranged into before
# scaling (presumably the column order used at training time -- confirm).
# Every sensor contributes the same eight statistics; 'Motor Spec_Period'
# comes last.
_SENSOR_NAMES = ('Sensor1', 'Sensor2', 'Sensor3')
_STATISTIC_NAMES = (
    'Mean', 'Max', 'RMS', 'Skewness', 'Kurtosis',
    'Crest Factor', 'Impulse Factor', 'Shape Factor',
)
feature_names = [
    f'{sensor_name}_{statistic_name}'
    for sensor_name in _SENSOR_NAMES
    for statistic_name in _STATISTIC_NAMES
]
feature_names.append('Motor Spec_Period')

# Directory containing the new data files to evaluate.
directory_path = r"C:\Users\KIM\Desktop\bigdataproject\bigdataproject\final\평가데이터_2.2kW\L-DSF-01\축정렬불량"

# Process every file below the directory and show the collected predictions.
result_df = process_all_files_in_directory(
    directory_path,
    scaler,
    pca,
    xgb_model,
    rf_model,
    feature_names=feature_names,
)
print("\nPrediction Results:")
print(result_df)

# Report the share of 'T' labels per model.
calculate_t_ratio(result_df)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\final\평가데이터_2.2kW\L-DSF-01\축정렬불량\STFCB-20201012-0105-0137_20201103_121605_002.csv
XGBoost Prediction for STFCB-20201012-0105-0137_20201103_121605_002.csv: [3]
RandomForest Prediction for STFCB-20201012-0105-0137_20201103_121605_002.csv: [3]
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\final\평가데이터_2.2kW\L-DSF-01\축정렬불량\STFCB-20201012-0105-0137_20201103_121606_002.csv
XGBoost Prediction for STFCB-20201012-0105-0137_20201103_121606_002.csv: [3]
RandomForest Prediction for STFCB-20201012-0105-0137_20201103_121606_002.csv: [3]
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\final\평가데이터_2.2kW\L-DSF-01\축정렬불량\STFCB-20201012-0105-0137_20201103_121607_002.csv
XGBoost Prediction for STFCB-20201012-0105-0137_20201103_121607_002.csv: [3]
RandomForest Prediction for STFCB-20201012-0105-0137_20201103_121607_002.csv: [3]
Processing file: C:\Users\KIM\Desktop\bigdataproject\bigdataproject\final