In [None]:
# -*- coding: utf-8 -*-
import os, warnings, json, joblib
import numpy as np, pandas as pd
from tqdm import tqdm
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [None]:
# --- Input locations -------------------------------------------------------
# Directory whose spreadsheet files hold the target curves (read by build_labels).
CURVES_DIR = './curves'
# Workbook merged with the curve labels on its 'name' column (see build_labels).
IMAGES_INDEX_XLSX = 'parsed_image_info.xlsx'
# The file names present in these two directories decide which labelled rows
# go to the train split and which to the test split (see get_split).
TRAIN_IMG_DIR = './train_images'
TEST_IMG_DIR  = './test_images'

# --- Cached artefacts ------------------------------------------------------
# StandardScaler for the curve targets; loaded if the file exists, otherwise
# fitted and written here by the main script.
SS_PATH = 'ss.pkl'
# Best hyper-parameters found by the randomized search, stored as JSON.
BEST_PARAM_JSON = 'best_params_GeomSVR.json'
# Final fitted multi-output SVR model.
BEST_MODEL_PATH = 'best_GeomSVR.pkl'
# StandardScaler fitted on the training input features.
XSCALER_PATH = 'scaler_X_SVR.pkl'

# --- Reproducibility and logging -------------------------------------------
# Seeds NumPy's global RNG and is also passed as random_state to the search.
SEED = 20
np.random.seed(SEED)
# NOTE(review): TensorFlow is not imported in the visible code; this variable
# presumably exists to quiet TF logging in sibling notebooks - confirm or drop.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Silences *every* Python warning for the rest of the session, including
# deprecation and convergence warnings.
warnings.filterwarnings("ignore")

# --- Hyper-parameter search controls ---------------------------------------
# When True the search is re-run even if BEST_PARAM_JSON already exists.
FORCE_TUNE = False
# Number of parameter settings sampled by RandomizedSearchCV (n_iter).
N_ITER     = 20
# Number of cross-validation folds used by the search (cv).
CV_FOLDS   = 3

In [None]:
def list_names_from_dir(d):
    """Return the set of regular-file names located directly inside ``d``.

    Entries that are not regular files (e.g. sub-directories) are left out.
    The returned strings are bare entry names, not joined paths.
    """
    file_names = set()
    for entry in os.listdir(d):
        if os.path.isfile(os.path.join(d, entry)):
            file_names.add(entry)
    return file_names


In [None]:
def build_labels():
    """Load every curve table in CURVES_DIR and join it to the image index.

    Each ``.xlsx`` / ``.xls`` / ``.csv`` file in CURVES_DIR is read with its
    first column as the index, the first remaining row is dropped, and the
    table is transposed so that every original column becomes one sample row:
    the column header becomes ``name`` and the 200 values become ``y0``..``y199``.

    Returns:
        DataFrame: inner merge (on ``name``) of the workbook at
        IMAGES_INDEX_XLSX with the stacked curve rows.

    Raises:
        ValueError: raised by pandas when a file does not yield exactly 200
            curve values per sample (column count mismatch on rename).
    """
    label_cols = ['name'] + [f'y{i}' for i in range(200)]
    frames = []
    # os.listdir order is filesystem-dependent; sort for a reproducible row order.
    for file in tqdm(sorted(os.listdir(CURVES_DIR)), desc='Read curves'):
        ext = os.path.splitext(file)[1].lower()
        if ext not in ('.xlsx', '.xls', '.csv'):
            continue
        path = os.path.join(CURVES_DIR, file)
        # CSV files are accepted by the extension filter, but read_excel cannot
        # parse them, so dispatch on the extension.
        if ext == '.csv':
            raw = pd.read_csv(path, index_col=0)
        else:
            raw = pd.read_excel(path, index_col=0)
        tmp = raw.iloc[1:, ].T.reset_index()
        tmp.columns = label_cols
        frames.append(tmp)
    # Concatenate once instead of growing a frame inside the loop (quadratic,
    # and concatenating onto an empty seed frame is deprecated in recent pandas).
    if frames:
        labels = pd.concat(frames, axis=0, ignore_index=True)
    else:
        labels = pd.DataFrame(columns=label_cols)
    return pd.merge(pd.read_excel(IMAGES_INDEX_XLSX), labels, on='name', how='inner')

def get_split(labels):
    """Partition ``labels`` into train and test frames by image-file presence.

    A row goes to the train frame when its ``name`` equals a file name found in
    TRAIN_IMG_DIR, and to the test frame when it equals one found in
    TEST_IMG_DIR. Both frames get a fresh 0..n-1 index, and the two sizes are
    printed.

    Returns:
        tuple: ``(train_csv, test_csv)``.
    """
    train_names = list_names_from_dir(TRAIN_IMG_DIR)
    test_names = list_names_from_dir(TEST_IMG_DIR)

    in_train = labels['name'].isin(train_names)
    in_test = labels['name'].isin(test_names)

    train_csv = labels.loc[in_train].reset_index(drop=True)
    test_csv = labels.loc[in_test].reset_index(drop=True)

    print(f"Train samples: {len(train_csv)}, Test samples: {len(test_csv)}")
    return train_csv, test_csv

def get_geom_X(df, use_auto=True, geom_param_cols=None):
    """Extract the geometry feature matrix from ``df``.

    Args:
        df: DataFrame holding the feature columns.
        use_auto: when True, take the columns at positions 1 through 10 of
            ``df`` (ten columns, selected purely by position). When False,
            take ``'porosity'`` followed by ``geom_param_cols``.
        geom_param_cols: iterable of column names; required when
            ``use_auto`` is False, ignored otherwise. Any iterable (list,
            tuple, Index) is accepted.

    Returns:
        tuple: ``(X, cols)`` where ``X`` is a float32 ndarray of shape
        ``(len(df), len(cols))`` and ``cols`` is the list of column names used.

    Raises:
        ValueError: if ``use_auto`` is False and ``geom_param_cols`` is None.
    """
    if use_auto:
        cols = [df.columns[1]] + df.columns[2:11].tolist()
    else:
        if geom_param_cols is None:
            # Previously surfaced as an opaque "list + NoneType" TypeError.
            raise ValueError("geom_param_cols must be provided when use_auto=False")
        # list(...) so tuples / pandas Index objects work as well as lists.
        cols = ['porosity'] + list(geom_param_cols)
    return df[cols].to_numpy(np.float32), cols

def eval_curves(y_true_is, y_pred_is, names, tag):
    """Score predicted curves against reference curves, one row per sample.

    Args:
        y_true_is: 2-D array of reference curves, one curve per row.
        y_pred_is: 2-D array of predicted curves, same shape as ``y_true_is``.
        names: sample identifiers, one per row (Series, list or array).
        tag: label written to the ``Model`` column of every row.

    Returns:
        DataFrame with columns ``name``, ``Model``, ``R2``, ``AreaRatio`` and
        ``RelRMSE``:
          * ``R2`` - ``sklearn.metrics.r2_score`` of the row.
          * ``AreaRatio`` - trapezoidal integral (unit spacing) of the
            prediction divided by that of the reference; NaN when the
            reference integral is exactly zero.
          * ``RelRMSE`` - root mean square of the point-wise relative error,
            with ``|y_true|`` floored at 1e-8 in the denominator.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; prefer
    # the new name when available and fall back on older NumPy versions.
    integrate = getattr(np, 'trapezoid', None)
    if integrate is None:
        integrate = np.trapz

    r2, area = [], []
    for i in range(y_true_is.shape[0]):
        r2.append(metrics.r2_score(y_true_is[i, :], y_pred_is[i, :]))
        denom = integrate(y_true_is[i, :], dx=1.0)
        nume = integrate(y_pred_is[i, :], dx=1.0)
        area.append((nume / denom) if denom != 0 else np.nan)

    # Floor the denominator so points where the reference is (near) zero do
    # not divide by zero.
    eps = 1e-8
    denomM = np.maximum(np.abs(y_true_is), eps)
    rel = np.sqrt(np.mean(((y_pred_is - y_true_is) / denomM) ** 2, axis=1))

    # pd.Series(...) lets callers pass a plain list/array as well as a Series;
    # the index is reset so names align positionally with the metric lists.
    name_col = pd.Series(names).reset_index(drop=True)
    return pd.DataFrame({'name': name_col, 'Model': tag, 'R2': r2, 'AreaRatio': area, 'RelRMSE': rel})

if __name__ == "__main__":
    # End-to-end run: build labels, split, scale, (optionally) tune, fit the
    # multi-output RBF SVR, then score and export predictions for the test set.
    labels = build_labels()
    train_csv, test_csv = get_split(labels)
    y_cols = [f'y{i}' for i in range(200)]

    # Target scaler: reuse the cached one if present, otherwise fit and cache it.
    # NOTE(review): the fallback fits on ALL labelled rows (train and test
    # together), so test-set statistics enter the target scaling. Left as-is
    # because SS_PATH may be shared with other notebooks - confirm intent.
    if os.path.exists(SS_PATH):
        ss = joblib.load(SS_PATH)
    else:
        ss = StandardScaler().fit(labels[y_cols].to_numpy(np.float32))
        joblib.dump(ss, SS_PATH)
    y_train_std = ss.transform(train_csv[y_cols].to_numpy(np.float32))
    y_test_std  = ss.transform(test_csv[y_cols].to_numpy(np.float32))
    # Reference curves in original scale, obtained via a transform round-trip.
    y_test_true_is = ss.inverse_transform(y_test_std)

    USE_AUTO_GEOM_COLS = True
    geom_param_cols = ['p1','p2','p3','p4','p5','p6','p7','p8','p9']
    X_train, used_cols = get_geom_X(train_csv, USE_AUTO_GEOM_COLS, geom_param_cols)
    X_test,  _         = get_geom_X(test_csv,  USE_AUTO_GEOM_COLS, geom_param_cols)
    print("Geom features:", used_cols)

    # Standardize X (SVR is sensitive to the scale of its inputs); the scaler
    # is fitted on the training features only and persisted for later use.
    xscaler = StandardScaler().fit(X_train)
    X_train_s = xscaler.transform(X_train)
    X_test_s  = xscaler.transform(X_test)
    joblib.dump(xscaler, XSCALER_PATH)

    # Run the randomized search only when forced or when no cached result exists.
    do_tune = FORCE_TUNE or (not os.path.exists(BEST_PARAM_JSON))
    if do_tune:
        base = MultiOutputRegressor(SVR(kernel='rbf'))
        # 'estimator__' routes each parameter to the SVR wrapped by
        # MultiOutputRegressor.
        param_dist = {
            'estimator__C': [1, 3, 10, 30, 100],
            'estimator__epsilon': [0.001, 0.01, 0.05, 0.1],
            'estimator__gamma': ['scale', 'auto', 0.01, 0.1, 1.0]
        }
        search = RandomizedSearchCV(
            estimator=base, param_distributions=param_dist,
            n_iter=N_ITER, cv=CV_FOLDS, n_jobs=-1, verbose=1, random_state=SEED
        )
        search.fit(X_train_s, y_train_std)
        best_params = search.best_params_
        # Context manager guarantees the JSON is flushed and the handle closed.
        with open(BEST_PARAM_JSON, 'w') as fh:
            json.dump(best_params, fh, indent=2)
        print(">>> Saved best params to", BEST_PARAM_JSON)
    else:
        with open(BEST_PARAM_JSON, 'r') as fh:
            best_params = json.load(fh)
        print(">>> Loaded cached params from", BEST_PARAM_JSON)

    # Refit on the full training split with the selected parameters and persist.
    svr_best = MultiOutputRegressor(SVR(kernel='rbf'))
    svr_best.set_params(**best_params)
    svr_best.fit(X_train_s, y_train_std)
    joblib.dump(svr_best, BEST_MODEL_PATH)
    print(">>> Saved best model to", BEST_MODEL_PATH)

    # Predict in standardized space, map back to the original scale, then
    # export per-sample metrics and the raw predicted curves.
    y_pred_std = svr_best.predict(X_test_s)
    y_pred_is  = ss.inverse_transform(y_pred_std)
    df = eval_curves(y_test_true_is, y_pred_is, test_csv['name'], 'Geom-SVR-RBF-Tuned')
    df.to_excel('metrics_GeomSVR_Tuned.xlsx', index=False)
    pd.DataFrame(y_pred_is).assign(name=test_csv['name'].values).to_excel('pred_test_GeomSVR_Tuned.xlsx', index=False)
    print(df[['R2','AreaRatio','RelRMSE']].mean())