In [3]:
pip install pandas numpy Scipy

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.5.2 scikit-learn-1.7.2 threadpoolctl-3.6.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install imblearn 

Collecting imblearn
  Using cached imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Using cached imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Using cached imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Using cached imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.14.0 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import numpy as np

df = pd.read_csv("./raw_total_data.csv")
df.describe().to_csv("raw_data_describe.csv")

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# ==================== 1️⃣ 물리 파라미터 ====================
class GeometryParams:
    def __init__(self):
        self.L_total = 47e-3
        self.W_base = 20e-3
        self.W_tongue = 12e-3
        self.t = 1.6e-3
        self.K_t_slot = 2.1
        self.K_t_width = 1.2

class MaterialProps:
    def __init__(self):
        self.E = 90e9
        self.nu = 0.25
        self.alpha_T = 8e-6
        self.Ms = 250

# ==================== 데이터 로드 ====================
def load_data(raw_path="./raw_total_data.csv", label_path="./label.xlsx"):
    df_raw = pd.read_csv(raw_path, encoding='utf-8')
    df_label = pd.read_excel(label_path)
    return df_raw, df_label

# ==================== 컬럼명 표준화 ====================
def standardize_columns(df):
    mapping = {
        '배정번호': 'batch_id',
        '소입로 온도 4 Zone': 'T_soaking_4',
        '솔트 컨베이어 온도 1 Zone': 'T_conveyor_1',
        '솔트 컨베이어 온도 2 Zone': 'T_conveyor_2',
        '솔트조 온도 1 Zone': 'T_salt_1',
        '솔트조 온도 2 Zone': 'T_salt_2'
    }
    return df.rename(columns=mapping)

# ==================== 배정번호별 피처 계산 ====================
def aggregate_features(df, geo, mat):
    feats = []
    for bid, g in df.groupby('batch_id'):
        if len(g) < 2:  # 너무 짧은 데이터는 제외
            continue

        feat = {'batch_id': bid}
        T_soak = g['T_soaking_4'].mean()
        T_salt = g[['T_salt_1', 'T_salt_2']].mean(axis=1).mean()
        T_conv = g[['T_conveyor_1', 'T_conveyor_2']].mean(axis=1).mean()

        # 온도 구배 계산
        feat['gradient_to_salt_mean'] = T_soak - T_salt
        feat['gradient_2nd_mean'] = T_salt - T_conv
        feat['gradient_to_conv_mean'] = T_soak - T_conv
        feat['T_salt_std'] = g[['T_salt_1','T_salt_2']].std(axis=1).mean()
        feat['T_conv_std'] = g[['T_conveyor_1','T_conveyor_2']].std(axis=1).mean()
        
        # 열응력 추정
        delta_T = feat['gradient_to_salt_mean']
        sigma_base = mat.E * mat.alpha_T * delta_T / (1 - mat.nu)
        feat['sigma_slot'] = sigma_base * geo.K_t_slot
        feats.append(feat)

    return pd.DataFrame(feats)

# ==================== 라벨 처리 ====================
def process_labels(df_label):
    df = df_label.rename(columns={'배정번호':'batch_id','불량수량':'defect_count','양품수량':'normal_count'})
    df['defect_count'] = df['defect_count'].fillna(0)
    df['normal_count'] = df['normal_count'].fillna(0)
    df['defect_rate'] = df['defect_count'] / (df['defect_count'] + df['normal_count'] + 1e-6)
    return df[['batch_id','defect_rate']]

# ==================== 이상치 제거 ====================
def remove_outliers_iqr(df, cols, k=1.5):
    df_clean = df.copy()
    for col in cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - k * iqr
        upper = q3 + k * iqr
        before = len(df_clean)
        df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]
        after = len(df_clean)
        print(f"  {col}: {before - after}개 이상치 제거 (남은 데이터 {after})")
    return df_clean

# ==================== 스케일링 및 로그 변환 ====================
def scale_and_log(df, feature_cols, target_col):
    df_scaled = df.copy()
    df_scaled[target_col] = np.log1p(df_scaled[target_col])  # 로그변환
    scaler = StandardScaler()
    df_scaled[feature_cols] = scaler.fit_transform(df_scaled[feature_cols])
    return df_scaled, scaler

# ==================== 전체 파이프라인 ====================
def preprocess_for_regression(raw_path="./raw_total_data.csv", label_path="./label.xlsx"):
    geo, mat = GeometryParams(), MaterialProps()
    df_raw, df_label = load_data(raw_path, label_path)
    df_raw = standardize_columns(df_raw)
    df_feat = aggregate_features(df_raw, geo, mat)
    df_label = process_labels(df_label)

    df = pd.merge(df_feat, df_label, on="batch_id", how="inner")

    print(f"\n초기 데이터 개수: {len(df)}")
    df = remove_outliers_iqr(df, ["gradient_to_salt_mean", "gradient_2nd_mean", 'gradient_to_conv_mean', "T_conv_std","T_salt_std"], k=1.5)
    print(f"이상치 제거 후: {len(df)}")

    features = ["gradient_to_salt_mean","gradient_2nd_mean",'gradient_to_conv_mean',"sigma_slot","T_salt_std","T_conv_std"]
    df_scaled, scaler = scale_and_log(df, features, "defect_rate")

    df_scaled.to_csv("processed_regression_data.csv", index=False, encoding="utf-8-sig")

    return df_scaled, scaler


In [11]:
if __name__ == "__main__":
    df_ready, scaler = preprocess_for_regression()



초기 데이터 개수: 45
  gradient_to_salt_mean: 11개 이상치 제거 (남은 데이터 34)
  gradient_2nd_mean: 1개 이상치 제거 (남은 데이터 33)
  gradient_to_conv_mean: 0개 이상치 제거 (남은 데이터 33)
  T_conv_std: 4개 이상치 제거 (남은 데이터 29)
  T_salt_std: 0개 이상치 제거 (남은 데이터 29)
이상치 제거 후: 29


In [2]:
pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
  [31m   [0m   it would be great if you take some time to track which package uses
  [31m   [0m   'sklearn' instead of 'scikit-le