# 00 Prepare Data (Classification Datasets)

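`config/dataset_config.json` で指定された分類データセット（Titanic など。`sklearn.datasets.fetch_openml` または scikit-learn 同梱のデータセットから取得）を読み込み、前処理と任意のノイズ注入を行ったうえで、以降のNotebookで共通利用する Train/Test を作成して `data/processed` に保存します。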

In [72]:
from pathlib import Path
import json
import re
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split

# Locations of the dataset config and of the artifacts written by this notebook.
CONFIG_PATH = Path('config/dataset_config.json')
OUTPUT_DIR = Path('data/processed')
TRAIN_PATH = OUTPUT_DIR / 'classification_train.csv'
TEST_PATH = OUTPUT_DIR / 'classification_test.csv'
META_PATH = OUTPUT_DIR / 'classification_meta.json'

# dataset_id values accepted by this notebook.
# 'steel_openml_1597' is accepted as well; the loading cell resolves it to the
# same OpenML data_id (1597) as 'fraud_openml_1597'.
SUPPORTED_CLASSIFICATION_DATASETS = {
    'titanic_openml_v1',
    'adult_openml_v2',
    'bank_openml_1461',
    'steel_openml_1504',
    'fraud_openml_1597',
    'steel_openml_1597',
    'wine_sklearn',
    'breast_cancer_sklearn'
}


def _parse_bool_flag(value, key):
    """Interpret a config value as a boolean.

    Non-string values go through ``bool()`` as before. Strings are matched
    case-insensitively against common true/false spellings, because
    ``bool("false")`` is ``True`` and would silently switch the flag on.

    Raises:
        ValueError: if ``value`` is a string that is not a recognised spelling.
    """
    if isinstance(value, str):
        text = value.strip().lower()
        if text in {'true', '1', 'yes', 'on'}:
            return True
        if text in {'false', '0', 'no', 'off', ''}:
            return False
        raise ValueError(f"{key} must be a boolean. Received: {value!r}")
    return bool(value)


# Read the JSON config; every setting falls back to a default when the key is absent.
config = json.loads(CONFIG_PATH.read_text(encoding='utf-8'))
task_type = str(config.get('task_type', 'classification')).lower()
dataset_id = str(config.get('dataset_id', 'titanic_openml_v1'))
SEED = int(config.get('random_seed', 42))
TEST_SIZE = float(config.get('test_size', 0.2))
ENABLE_DISTRIBUTION_NOISE = _parse_bool_flag(
    config.get('enable_distribution_noise', False),
    'enable_distribution_noise',
)
NOISE_MODE = str(config.get('noise_mode', 'gaussian_shift'))
NOISE_FEATURE_COUNT = int(config.get('noise_feature_count', 1))
NOISE_STRENGTH = float(config.get('noise_strength', 0.5))
# The noise generator gets its own seed; by default it is derived from SEED.
NOISE_RANDOM_SEED = int(config.get('noise_random_seed', SEED + 7))
NOISE_SEGMENT_FRACTION = float(config.get('noise_segment_fraction', 0.15))
NOISE_LABEL_FLIP_RATE = float(config.get('noise_label_flip_rate', 0.30))

# Noise injection strategies accepted for noise_mode.
SUPPORTED_NOISE_MODES = {
    'gaussian_shift',
    'segment_shift',
    'segment_label_flip',
    'segment_shift_and_label_flip'
}

# Each entry pairs an "is this setting invalid?" flag with the error to raise.
# The entries are evaluated in order, so the first invalid setting is reported.
validation_checks = [
    (
        task_type != 'classification',
        f"Only classification is supported. Received task_type={task_type}",
    ),
    (
        dataset_id not in SUPPORTED_CLASSIFICATION_DATASETS,
        f"Unsupported dataset_id: {dataset_id}. Supported: {sorted(SUPPORTED_CLASSIFICATION_DATASETS)}",
    ),
    (
        not (0.0 < TEST_SIZE < 1.0),
        f"test_size must be in (0, 1). Received: {TEST_SIZE}",
    ),
    (
        NOISE_FEATURE_COUNT not in [1, 2],
        f"noise_feature_count must be 1 or 2. Received: {NOISE_FEATURE_COUNT}",
    ),
    (
        NOISE_STRENGTH <= 0,
        f"noise_strength must be > 0. Received: {NOISE_STRENGTH}",
    ),
    (
        NOISE_MODE not in SUPPORTED_NOISE_MODES,
        f"noise_mode must be one of {sorted(SUPPORTED_NOISE_MODES)}. Received: {NOISE_MODE}",
    ),
    (
        not (0.0 < NOISE_SEGMENT_FRACTION < 1.0),
        f"noise_segment_fraction must be in (0,1). Received: {NOISE_SEGMENT_FRACTION}",
    ),
    (
        not (0.0 <= NOISE_LABEL_FLIP_RATE <= 1.0),
        f"noise_label_flip_rate must be in [0,1]. Received: {NOISE_LABEL_FLIP_RATE}",
    ),
]
for is_invalid, message in validation_checks:
    if is_invalid:
        raise ValueError(message)

# Echo the effective configuration and the output locations.
print('Loaded config:', config, sep='\n')
print(f'Output train path: {TRAIN_PATH}')
print(f'Output test path : {TEST_PATH}')

Loaded config:
{'task_type': 'classification', 'dataset_id': 'adult_openml_v2', 'test_size': 0.6, 'random_seed': 42, 'enable_distribution_noise': True, 'noise_mode': 'segment_shift_and_label_flip', 'noise_feature_count': 1, 'noise_strength': 0.5, 'noise_random_seed': 49, 'noise_segment_fraction': 0.15, 'noise_label_flip_rate': 0.3}
Output train path: data/processed/classification_train.csv
Output test path : data/processed/classification_test.csv


In [73]:
# Load the classification dataset selected by the config file.
# OpenML-backed datasets: dataset_id -> (fetch_openml keyword arguments, dataset_name).
openml_sources = {
    'titanic_openml_v1': ({'name': 'titanic', 'version': 1}, 'openml_titanic_v1'),
    'adult_openml_v2': ({'name': 'adult', 'version': 2}, 'openml_adult_v2'),
    # OpenML ID 1461: Bank Marketing
    'bank_openml_1461': ({'data_id': 1461}, 'openml_bank_marketing_1461'),
    # OpenML ID 1504: Steel Plates Faults
    'steel_openml_1504': ({'data_id': 1504}, 'openml_steel_1504'),
    # OpenML ID 1597: Credit Card Fraud Detection (both ids load the same data)
    'fraud_openml_1597': ({'data_id': 1597}, 'openml_fraud_1597'),
    'steel_openml_1597': ({'data_id': 1597}, 'openml_fraud_1597'),
}
# Datasets bundled with scikit-learn: dataset_id -> (loader function, dataset_name).
sklearn_sources = {
    'wine_sklearn': (load_wine, 'sklearn_load_wine'),
    'breast_cancer_sklearn': (load_breast_cancer, 'sklearn_load_breast_cancer'),
}

if dataset_id in openml_sources:
    fetch_kwargs, dataset_name = openml_sources[dataset_id]
    ds = fetch_openml(as_frame=True, **fetch_kwargs)
elif dataset_id in sklearn_sources:
    bundled_loader, dataset_name = sklearn_sources[dataset_id]
    ds = bundled_loader(as_frame=True)
else:
    raise ValueError(f'Unsupported dataset_id: {dataset_id}')

# Work on copies so the loaded dataset object itself is never mutated.
X_raw = ds.data.copy()
y_raw = ds.target.copy()

# Integer-encode the target as classification labels.
y_cat = pd.Series(y_raw).astype('category')
target_names = [str(v) for v in y_cat.cat.categories]
y = y_cat.cat.codes.astype(int)
# Categorical codes use -1 for missing values. Such rows would otherwise be
# treated as an extra class that has no entry in target_names, so reject them.
n_missing_target = int((y < 0).sum())
if n_missing_target > 0:
    raise ValueError(
        f'Target contains {n_missing_target} missing value(s); '
        'remove or impute them before encoding.'
    )
if y.nunique() < 2:
    raise ValueError('Classification requires at least 2 classes in target.')

# Numeric columns: impute missing values with the column median.
num_cols = X_raw.select_dtypes(include=['number']).columns.tolist()
X_num = X_raw[num_cols].apply(pd.to_numeric, errors='coerce')
X_num = X_num.fillna(X_num.median())

# Non-numeric columns: fill missing values with the literal 'missing', then
# replace each value by its integer category code (no one-hot expansion).
cat_cols = [c for c in X_raw.columns if c not in num_cols]
X_cat = X_raw[cat_cols].astype('string').fillna('missing')
X_cat_encoded = pd.DataFrame(index=X_raw.index)
for col in cat_cols:
    X_cat_encoded[col] = X_cat[col].astype('category').cat.codes.astype('int32')

# Final feature matrix: numeric columns first, then the encoded categorical ones.
X = pd.concat([X_num, X_cat_encoded], axis=1)

# Optional: inject noise into the feature distributions (and optionally the
# labels) to make learning harder. This operates on the full X / y, i.e. before
# the train/test split performed later in the notebook.
noise_applied_features = []
label_flipped_count = 0
if ENABLE_DISTRIBUTION_NOISE:
    # Dedicated generator so the noise is reproducible for a given NOISE_RANDOM_SEED.
    # NOTE: the results depend on the order of the rng calls below.
    rng = np.random.default_rng(NOISE_RANDOM_SEED)
    # Never request more features than X actually has.
    noise_feature_count = min(NOISE_FEATURE_COUNT, X.shape[1])
    # Pick the features to perturb, without repetition, from all columns of X.
    noise_applied_features = rng.choice(X.columns.to_numpy(), size=noise_feature_count, replace=False).tolist()
    # Row segment used by the segment_* modes: each row is included with
    # probability NOISE_SEGMENT_FRACTION. The same mask is reused for every
    # perturbed feature and for the label flips.
    segment_mask = rng.random(len(X)) < NOISE_SEGMENT_FRACTION
    classes = np.sort(y.unique())

    for feature in noise_applied_features:
        # Work in float so that adding real-valued noise is possible.
        X[feature] = pd.to_numeric(X[feature], errors='coerce').astype(float)
        values = X[feature]
        # Scale of the noise: the feature's standard deviation (NaNs ignored),
        # falling back to 1.0 when it is zero or not finite.
        finite_std = float(np.nanstd(values))
        if not np.isfinite(finite_std) or finite_std == 0.0:
            finite_std = 1.0

        if NOISE_MODE == 'gaussian_shift':
            # Each row is affected with probability 0.35 (mask drawn per feature):
            # add zero-mean Gaussian noise plus a constant offset of 1.5 * NOISE_STRENGTH * std.
            affected_mask = rng.random(len(X)) < 0.35
            noise = rng.normal(loc=0.0, scale=NOISE_STRENGTH * finite_std, size=int(affected_mask.sum()))
            shifted = values.loc[affected_mask] + noise + (NOISE_STRENGTH * finite_std * 1.5)
            X.loc[affected_mask, feature] = shifted.values
        elif NOISE_MODE in ['segment_shift', 'segment_shift_and_label_flip']:
            # Only rows in the segment are affected: add zero-mean Gaussian noise
            # plus a constant offset of 3.0 * NOISE_STRENGTH * std.
            noise = rng.normal(loc=0.0, scale=NOISE_STRENGTH * finite_std, size=int(segment_mask.sum()))
            shifted = values.loc[segment_mask] + noise + (NOISE_STRENGTH * finite_std * 3.0)
            X.loc[segment_mask, feature] = shifted.values
        # For 'segment_label_flip' neither branch matches: the feature values are
        # left unshifted (the column has still been cast to float above).

    if NOISE_MODE in ['segment_label_flip', 'segment_shift_and_label_flip']:
        # Within the segment, each row's label is flipped with probability NOISE_LABEL_FLIP_RATE.
        flip_mask = segment_mask & (rng.random(len(X)) < NOISE_LABEL_FLIP_RATE)
        label_flipped_count = int(flip_mask.sum())
        if label_flipped_count > 0:
            if len(classes) == 2:
                # Two classes: swap the labels (assumes the two labels are 0 and 1).
                y.loc[flip_mask] = 1 - y.loc[flip_mask].astype(int)
            else:
                # Otherwise: replace each label by a randomly chosen different class.
                current = y.loc[flip_mask].to_numpy()
                replaced = []
                for cur in current:
                    candidates = classes[classes != cur]
                    replaced.append(int(rng.choice(candidates)))
                y.loc[flip_mask] = replaced

    print('Distribution noise is enabled.')
    print('Noise mode:', NOISE_MODE)
    print('Noise applied features:', noise_applied_features)
    print('Noise segment fraction:', NOISE_SEGMENT_FRACTION)
    print('Label flipped count:', label_flipped_count)
else:
    print('Distribution noise is disabled.')

# Normalize column names so that LightGBM can handle them safely.
def sanitize_column_name(col):
    """Return an identifier-like version of a column name.

    Every run of characters other than ASCII letters, digits and underscores
    becomes a single underscore, repeated underscores are collapsed, and
    leading/trailing underscores are removed. An empty result becomes 'col',
    and a result that starts with a digit is prefixed with 'f_'.
    """
    replaced = re.sub(r'[^0-9a-zA-Z_]+', '_', str(col))
    cleaned = re.sub(r'_+', '_', replaced).strip('_') or 'col'
    return f'f_{cleaned}' if cleaned[0].isdigit() else cleaned

# Sanitize every column name, then make the results unique.
# A repeated name gets a numeric suffix (_1, _2, ...). The suffix is advanced
# until the candidate is unused, so a generated name such as 'a_1' can never
# collide with another column that already sanitizes to 'a_1'.
original_cols = list(X.columns)
sanitized_cols = [sanitize_column_name(c) for c in original_cols]
used_names = set()
last_suffix = {}
unique_cols = []
for base in sanitized_cols:
    candidate = base
    suffix = last_suffix.get(base, 0)
    while candidate in used_names:
        suffix += 1
        candidate = f'{base}_{suffix}'
    last_suffix[base] = suffix
    used_names.add(candidate)
    unique_cols.append(candidate)

# The noise step recorded the pre-sanitization names; translate them so they
# match the final column names that are saved to disk and listed in the meta.
rename_map = dict(zip(original_cols, unique_cols))
noise_applied_features = [rename_map.get(name, name) for name in noise_applied_features]
X.columns = unique_cols

# Combine the features and the encoded target into a single frame.
df = X.assign(target=y)

# Summary of the full (pre-split) dataset.
class_ratio = y.value_counts(normalize=True).sort_index().to_dict()
print(f'Dataset: {dataset_name}')
print(f'Full dataset shape: {df.shape}')
print('Class ratio:', class_ratio)
print(f'Feature count after preprocessing: {X.shape[1]}')

Distribution noise is enabled.
Noise mode: segment_shift_and_label_flip
Noise applied features: ['age']
Noise segment fraction: 0.15
Label flipped count: 2223
Dataset: openml_adult_v2
Full dataset shape: (48842, 15)
Class ratio: {0: 0.736620122026125, 1: 0.26337987797387497}
Feature count after preprocessing: 14


In [74]:
# Stratified split so both parts keep the class proportions of the full data.
train_df, test_df = train_test_split(
    df, test_size=TEST_SIZE, random_state=SEED, stratify=df['target']
)

# Drop the shuffled original row labels in favour of a fresh 0..n-1 index.
train_df, test_df = (part.reset_index(drop=True) for part in (train_df, test_df))

print(f'Train shape: {train_df.shape}')
print(f'Test shape : {test_df.shape}')
for ratio_label, part in (('Train class ratio:', train_df), ('Test  class ratio:', test_df)):
    print(ratio_label, part['target'].value_counts(normalize=True).sort_index().to_dict())

Train shape: (19536, 15)
Test shape : (29306, 15)
Train class ratio: {0: 0.7366400491400491, 1: 0.26335995085995084}
Test  class ratio: {0: 0.7366068381901317, 1: 0.26339316180986827}


In [75]:
# Write both splits as CSV (without the index column) into the output directory.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
for frame, csv_path in ((train_df, TRAIN_PATH), (test_df, TEST_PATH)):
    frame.to_csv(csv_path, index=False)

# Metadata describing how the splits were produced; assembled from three groups
# whose key order is preserved in the resulting JSON.
run_settings = {
    'dataset': dataset_name,
    'dataset_id': dataset_id,
    'task_type': task_type,
    'seed': SEED,
    'test_size': TEST_SIZE,
}
noise_settings = {
    'enable_distribution_noise': ENABLE_DISTRIBUTION_NOISE,
    'noise_mode': NOISE_MODE,
    'noise_feature_count': NOISE_FEATURE_COUNT,
    'noise_strength': NOISE_STRENGTH,
    'noise_random_seed': NOISE_RANDOM_SEED,
    'noise_segment_fraction': NOISE_SEGMENT_FRACTION,
    'noise_label_flip_rate': NOISE_LABEL_FLIP_RATE,
    'noise_applied_features': noise_applied_features,
    'label_flipped_count': label_flipped_count,
}
data_summary = {
    'n_samples_total': len(df),
    'n_samples_train': len(train_df),
    'n_samples_test': len(test_df),
    'n_features': int(X.shape[1]),
    'feature_names': X.columns.tolist(),
    'target_names': target_names,
    'class_labels': sorted(train_df['target'].unique().tolist()),
}
meta = {**run_settings, **noise_settings, **data_summary}
meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
META_PATH.write_text(meta_json, encoding='utf-8')

for saved_path in (TRAIN_PATH, TEST_PATH, META_PATH):
    print(f'Saved: {saved_path}')

Saved: data/processed/classification_train.csv
Saved: data/processed/classification_test.csv
Saved: data/processed/classification_meta.json


In [76]:
# Show the first rows of the train split, then of the test split.
for preview_source in (train_df, test_df):
    display(preview_source.head())

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass,education,marital_status,occupation,relationship,race,sex,native_country,target
0,18.0,152641,6,0,0,40,8,0,4,14,3,4,1,38,0
1,84.121177,49715,9,0,0,40,3,11,0,13,4,4,1,38,0
2,46.0,125892,13,0,1977,60,4,9,2,9,0,4,1,38,1
3,39.0,223792,9,0,0,40,3,11,2,6,0,4,1,38,1
4,23.0,99399,10,0,0,25,8,15,4,14,4,0,0,38,0


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass,education,marital_status,occupation,relationship,race,sex,native_country,target
0,26.0,44308,13,0,0,40,3,9,4,12,1,4,1,38,0
1,59.0,24244,10,0,0,40,3,15,0,7,1,4,0,38,0
2,56.0,119254,6,0,0,40,8,0,0,14,1,4,0,38,0
3,37.0,103323,9,0,0,40,3,11,4,2,1,4,1,38,0
4,66.687579,91608,15,0,0,40,3,14,2,2,0,4,1,38,0
