In [1]:
# SMOGN 数据增强：基于清洗后的数据，扩充高脂尾部样本
import random
from pathlib import Path

import numpy as np
import pandas as pd
import smogn

# Seed every RNG the augmentation may touch so a Restart & Run All reproduces
# the same synthetic rows.
# NOTE(review): smogn's over-sampling is believed to draw from the stdlib
# `random` module as well as NumPy's global generator -- confirm against the
# installed smogn version. Seeding only NumPy would then be insufficient.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

input_path = Path('../data/processed/data_cleaned.xlsx')
output_path = Path('../data/processed/row_data_augment.xlsx')
target_col = 'lipid(%)'

df_clean = pd.read_excel(input_path)

# Fail early, with the list of available columns, if the target is missing.
if target_col not in df_clean.columns:
    raise KeyError(
        f"target column {target_col!r} not found in {input_path}; "
        f"available columns: {list(df_clean.columns)}"
    )

# Anchor values for the manual relevance (phi) function.
target_min = float(df_clean[target_col].min())
target_p75 = float(df_clean[target_col].quantile(0.75))
target_max = float(df_clean[target_col].max())

# The control points must be strictly increasing in the target value;
# coincident points (constant / heavily tied target, or an all-NaN column)
# leave the relevance interpolation between them undefined.
if not (target_min < target_p75 < target_max):
    raise ValueError(
        f"cannot build relevance control points for {target_col!r}: "
        f"min={target_min}, P75={target_p75}, max={target_max} "
        "are not strictly increasing"
    )

# Phi control points, one row per anchor: [target value, relevance, slope]
# (row layout as used in smogn's manual-relevance examples -- TODO confirm).
# Relevance is 0 up to P75 and rises to 1 only towards the high-lipid tail.
ctrl_pts = [
    [target_min, 0, 0],
    [target_p75, 0, 0],
    [target_max, 1, 0],
]

# NOTE(review): with under_samp=False this call is relied upon to return ONLY
# newly synthesised rows (the recorded run gives 36 original + 14 returned =
# 50 merged). If a smogn version also returns original rows, the concat below
# would duplicate them -- re-check the row counts after upgrading smogn.
smogn_synth = smogn.smoter(
    data=df_clean,
    y=target_col,
    k=5,                      # neighbours used when interpolating
    pert=0.02,                # perturbation applied to generated values
    samp_method='balance',
    under_samp=False,         # keep every original row; over-sample only
    rel_thres=0.8,            # relevance above this marks a row as rare
    rel_method='manual',      # use ctrl_pts instead of automatic relevance
    rel_xtrm_type='high',
    rel_coef=1.5,
    rel_ctrl_pts_rg=ctrl_pts,
)

# Append the synthetic rows to the untouched cleaned data and persist.
df_aug = pd.concat([df_clean, smogn_synth], ignore_index=True)
output_path.parent.mkdir(parents=True, exist_ok=True)
df_aug.to_excel(output_path, index=False)

# Summary: row counts plus target distribution before and after augmentation.
print(f"SMOGN done: 原始 {len(df_clean)} 条, 合成 {len(smogn_synth)} 条, 合并后 {len(df_aug)} 条 -> {output_path}")
print('原始 lipid(%) 分布:')
print(df_clean[target_col].describe())
print('增强后 lipid(%) 分布:')
print(df_aug[target_col].describe())


dist_matrix: 100%|##########| 4/4 [00:00<00:00, 2846.97it/s]
synth_matrix: 100%|##########| 4/4 [00:00<00:00, 348.26it/s]
r_index: 100%|##########| 2/2 [00:00<00:00, 740.98it/s]

SMOGN done: 原始 36 条, 合成 14 条, 合并后 50 条 -> ../data/processed/row_data_augment.xlsx
原始 lipid(%) 分布:
count    36.000000
mean      5.073559
std       4.895851
min       0.165096
25%       1.207250
50%       2.998121
75%       8.880500
max      14.888481
Name: lipid(%), dtype: float64
增强后 lipid(%) 分布:
count    50.000000
mean      7.562991
std       5.786305
min       0.165096
25%       1.483669
50%       6.830000
75%      13.566743
max      14.909689
Name: lipid(%), dtype: float64



