# TODO

- understand Neal, KEEL, UCI
- create 10-fold train-test split

In [27]:
seed_list = list(range(10))
seed_list

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Neal 

In [32]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict

class NealDatasetGenerator:
    """
    A class to generate and manage synthetic datasets based on the papers 
    by Neal (1997) and Tang et al. (2017).
    """
    def __init__(self,
                 n_train_samples: int = 100,
                 n_test_samples: int = 50,
                 target_outlier_fraction: float = 0.05,
                 input_outlier_fraction: float = 0.05,
                 seed: int = None):
        """
        Initializes the generator.

        Args:
            n_train_samples (int): The number of samples for the training data.
            n_test_samples (int): The number of samples for the test data.
            target_outlier_fraction (float): The fraction of outliers in the target variable.
            input_outlier_fraction (float): The fraction of outliers in the input variable.
            seed (int, optional): A random seed for reproducibility.
        """
        self.n_train_samples = n_train_samples
        self.n_test_samples = n_test_samples
        self.target_outlier_fraction = target_outlier_fraction
        self.input_outlier_fraction = input_outlier_fraction
        self.seed = seed
        print("NealDatasetGenerator initialized.")

    def _generate_data(self, 
                       n_samples: int,
                       target_outlier_fraction: float,
                       input_outlier_fraction: float,
                       random_seed: int) -> Dict[str, np.ndarray]:
        """Internal method to generate a single dataset."""
        if random_seed is not None:
            np.random.seed(random_seed)

        # 1. Generate true inputs from the underlying distribution
        x_true = np.random.randn(n_samples)
        
        # This will become the observed input, potentially with outliers
        x_observed = x_true.copy()

        # 2. Determine target outliers first to ensure reproducibility
        target_outlier_mask = np.random.rand(n_samples) < target_outlier_fraction
        
        # 3. Add outliers to the input variable if specified
        input_outlier_mask = np.zeros(n_samples, dtype=bool)
        if input_outlier_fraction > 0.0:
            num_input_outliers = int(n_samples * input_outlier_fraction)
            outlier_indices = np.random.choice(n_samples, num_input_outliers, replace=False)
            outlier_magnitude = 3 * np.std(x_true)
            signs = np.random.choice([-1, 1], num_input_outliers)
            x_observed[outlier_indices] += signs * outlier_magnitude
            input_outlier_mask[outlier_indices] = True

        # 4. Calculate the true mean using the (potentially outlier) observed inputs
        true_mean = 0.3 + 0.4 * x_observed + 0.5 * np.sin(2.7 * x_observed) + 1.1 / (1 + x_observed**2)
        
        # 5. Generate observation noise for the target
        noise_std_normal, noise_std_outlier = 0.1, 1.0
        noise_std = np.where(target_outlier_mask, noise_std_outlier, noise_std_normal)
        noise = np.random.normal(0, noise_std)
        
        # 6. Calculate the final observed target
        t = true_mean + noise

        # 7. Sort the data based on the observed input for easier plotting
        sort_indices = np.argsort(x_observed)
        
        return {
            "x": x_observed[sort_indices],
            "t": t[sort_indices],
            "input_mask": input_outlier_mask[sort_indices],
            "target_mask": target_outlier_mask[sort_indices]
        }

    def make_train_dataset(self, with_input_outliers: bool = False) -> Dict[str, np.ndarray]:
        """
        Generates one type of training dataset.

        Args:
            with_input_outliers (bool): If True, the dataset will contain input outliers.

        Returns:
            Dict: A dictionary containing the generated training data with keys 
                  'x', 't', 'input_mask', and 'target_mask'.
        """
        input_fraction = self.input_outlier_fraction if with_input_outliers else 0.0
        print(f"Generating Training Data (with_input_outliers={with_input_outliers})...")
        
        return self._generate_data(
            n_samples=self.n_train_samples,
            target_outlier_fraction=self.target_outlier_fraction,
            input_outlier_fraction=input_fraction,
            random_seed=self.seed
        )

    def make_test_dataset(self) -> Dict[str, np.ndarray]:
        """
        Generates the test dataset. The test data contains no outliers.

        Returns:
            Dict: A dictionary containing the generated test data with keys 
                  'x', 't', 'input_mask', and 'target_mask'.
        """
        print("Generating Test Data...")
        # Use a different seed for the test set to ensure it's independent from the training set
        test_seed = None if self.seed is None else self.seed + 10

        return self._generate_data(
            n_samples=self.n_test_samples,
            target_outlier_fraction=0.0,
            input_outlier_fraction=0.0,
            random_seed=test_seed
        )

    @staticmethod
    def plot_dataset(data: Dict[str, np.ndarray], title: str):
        """
        Visualizes a single dataset.

        Args:
            data (Dict): A dictionary in the format returned by _generate_data.
            title (str): The title for the plot.
        """
        plt.style.use('seaborn-v0_8-whitegrid')
        fig, ax = plt.subplots(figsize=(12, 7))
        
        x_data, t_data = data["x"], data["t"]
        input_mask, target_mask = data["input_mask"], data["target_mask"]
        
        # Create masks for different data point types
        normal_mask = ~input_mask & ~target_mask
        target_only_mask = ~input_mask & target_mask
        input_only_mask = input_mask & ~target_mask
        both_mask = input_mask & target_mask

        # Plot each type of data point with a different style
        ax.scatter(x_data[normal_mask], t_data[normal_mask], c='black', s=30, label='Normal Data', alpha=0.7)
        ax.scatter(x_data[target_only_mask], t_data[target_only_mask], facecolors='none', edgecolors='blue', marker='o', s=100, linewidth=1.5, label='Target Outlier')
        ax.scatter(x_data[input_only_mask], t_data[input_only_mask], c='green', marker='P', s=70, label='Input Outlier', alpha=0.8)
        ax.scatter(x_data[both_mask], t_data[both_mask], c='red', marker='*', s=150, label='Input & Target Outlier', alpha=0.8)

        # Plot the true underlying function
        x_range = np.linspace(x_data.min() - 0.2, x_data.max() + 0.2, 400)
        true_y = 0.3 + 0.4 * x_range + 0.5 * np.sin(2.7 * x_range) + 1.1 / (1 + x_range**2)
        ax.plot(x_range, true_y, 'r--', label='True Function', linewidth=2)

        ax.set_title(title, fontsize=16)
        ax.set_xlabel('Input (x)', fontsize=12)
        ax.set_ylabel('Target (t)', fontsize=12)
        ax.legend(fontsize=10)
        plt.show()


if __name__ == '__main__':
    # --- Example Usage ---
    
    # 1. Create an instance of the generator
    generator = NealDatasetGenerator(seed=2024)

    # 2. Generate required datasets individually
    
    # Training data (target outliers only)
    train_set_1 = generator.make_train_dataset(with_input_outliers=False)
    
    # Training data (both input and target outliers)
    train_set_2 = generator.make_train_dataset(with_input_outliers=True)

    # Test data
    test_set = generator.make_test_dataset()

    # # 3. Check the information of the generated data
    # print("\n--- Data Generation Summary ---")
    # print(f"Train set 1 (target outliers only): x shape {train_set_1['x'].shape}, {np.sum(train_set_1['target_mask'])} target outliers.")
    # print(f"Train set 2 (all outliers): x shape {train_set_2['x'].shape}, {np.sum(train_set_2['input_mask'])} input outliers.")
    # print(f"Test set (no outliers): x shape {test_set['x'].shape}\n")
    
    # # 4. Plot specific datasets for verification
    # NealDatasetGenerator.plot_dataset(train_set_1, "Training Data (Target Outliers Only)")
    # NealDatasetGenerator.plot_dataset(train_set_2, "Training Data (Input & Target Outliers)")
    # NealDatasetGenerator.plot_dataset(test_set, "Test Data (No Outliers)")

NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=False)...
Generating Training Data (with_input_outliers=True)...
Generating Test Data...


In [34]:
import os

save_path_base = f"./../datasets/Neal"

for seed in seed_list:

    save_path = os.path.join(save_path_base, f"split_{seed}")

    generator = NealDatasetGenerator(seed=seed)
    
    # Generate datasets
    train_set = generator.make_train_dataset(with_input_outliers=False)
    test_set = generator.make_test_dataset()
    
    train_x = train_set['x'].reshape(-1, 1)
    train_t = train_set['t'].reshape(-1, 1)
    test_x = test_set['x'].reshape(-1, 1)
    test_t = test_set['t'].reshape(-1, 1)

    # Save datasets as csv
    os.makedirs(save_path, exist_ok=True)
    np.savetxt(os.path.join(save_path, "train_features.csv"), train_x, delimiter=",")
    np.savetxt(os.path.join(save_path, "train_target.csv"), train_t, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_features.csv"), test_x, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_target.csv"), test_t, delimiter=",")
    print(f"Data saved to {save_path}")

NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=False)...
Generating Test Data...
Data saved to ./../datasets/Neal/split_0
NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=False)...
Generating Test Data...
Data saved to ./../datasets/Neal/split_1
NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=False)...
Generating Test Data...
Data saved to ./../datasets/Neal/split_2
NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=False)...
Generating Test Data...
Data saved to ./../datasets/Neal/split_3
NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=False)...
Generating Test Data...
Data saved to ./../datasets/Neal/split_4
NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=False)...
Generating Test Data...
Data saved to ./../datasets/Neal/split_5
NealDatasetGenerator initialized.
Generating Training Data (with_input

In [35]:
import os

save_path_base = f"./../datasets/Neal_XOutlier"

for seed in seed_list:

    save_path = os.path.join(save_path_base, f"split_{seed}")

    generator = NealDatasetGenerator(seed=seed)
    
    # Generate datasets
    train_set = generator.make_train_dataset(with_input_outliers=True)
    test_set = generator.make_test_dataset()
    
    train_x = train_set['x'].reshape(-1, 1)
    train_t = train_set['t'].reshape(-1, 1)
    test_x = test_set['x'].reshape(-1, 1)
    test_t = test_set['t'].reshape(-1, 1)

    # Save datasets as csv
    os.makedirs(save_path, exist_ok=True)
    np.savetxt(os.path.join(save_path, "train_features.csv"), train_x, delimiter=",")
    np.savetxt(os.path.join(save_path, "train_target.csv"), train_t, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_features.csv"), test_x, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_target.csv"), test_t, delimiter=",")
    print(f"Data saved to {save_path}")

NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=True)...
Generating Test Data...
Data saved to ./../datasets/Neal_XOutlier/split_0
NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=True)...
Generating Test Data...
Data saved to ./../datasets/Neal_XOutlier/split_1
NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=True)...
Generating Test Data...
Data saved to ./../datasets/Neal_XOutlier/split_2
NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=True)...
Generating Test Data...
Data saved to ./../datasets/Neal_XOutlier/split_3
NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=True)...
Generating Test Data...
Data saved to ./../datasets/Neal_XOutlier/split_4
NealDatasetGenerator initialized.
Generating Training Data (with_input_outliers=True)...
Generating Test Data...
Data saved to ./../datasets/Neal_XOutlier/split_5
NealDatasetGenerator i

In [36]:
import pandas as pd

temp = os.path.join(save_path_base, f"split_{seed_list[0]}", "train_features.csv")
df = pd.read_csv(temp, header=None)
df

Unnamed: 0,0
0,-3.210831
1,-2.552990
2,-2.370028
3,-1.980796
4,-1.726283
...,...
95,1.895889
96,1.950775
97,2.240893
98,2.269755


In [39]:
df.dtypes

0    float64
dtype: object

# KEEL

In [45]:
import io
import os
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import KFold
import numpy as np

file_path = "./../data/diabetes.dat"
save_path_base = f"./../datasets/Diabetes"

# 1. ファイルから全ての行をリストとして読み込む
with open(file_path, 'r') as f:
    lines = f.readlines()

filtered_lines = [
    line for line in lines 
    if not line.strip().startswith(("@"))
]

# 1. リストを単一の文字列に結合
data_as_string = "".join(filtered_lines)

# 2. 文字列をファイルのように扱えるように変換
string_io = io.StringIO(data_as_string)

# 3. pandas.read_csvで読み込み、DataFrameを作成
# header=None は、ファイルにヘッダー行がないことを示す
df = pd.read_csv(string_io, header=None)

# 4. (推奨) DataFrameに列名を設定する
df.columns = ['Age', 'Deficit', 'C_peptide']


# 特徴量とターゲットを分離
X = df[['Age', 'Deficit']].values
y = df['C_peptide'].values

# 10分割交差検証の設定
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) # shuffle=True と random_state を指定して再現性を確保

# 各分割でのデータを格納するリスト (オプション)
# train_features_list = []
# train_target_list = []
# test_features_list = []
# test_target_list = []

print(f"Performing {n_splits}-fold cross-validation:\n")
for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    print(f"Fold {fold+1}:")

    train_features = X[train_index]
    train_target = y[train_index]
    test_features = X[test_index]
    test_target = y[test_index]

    # csv として保存
    save_path = os.path.join(save_path_base, f"split_{fold}")

    print(save_path)
    os.makedirs(save_path, exist_ok=True)
    np.savetxt(os.path.join(save_path, "train_features.csv"), train_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "train_target.csv"), train_target, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_features.csv"), test_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_target.csv"), test_target, delimiter=",")

Performing 10-fold cross-validation:

Fold 1:
./../datasets/Diabetes/split_0
Fold 2:
./../datasets/Diabetes/split_1
Fold 3:
./../datasets/Diabetes/split_2
Fold 4:
./../datasets/Diabetes/split_3
Fold 5:
./../datasets/Diabetes/split_4
Fold 6:
./../datasets/Diabetes/split_5
Fold 7:
./../datasets/Diabetes/split_6
Fold 8:
./../datasets/Diabetes/split_7
Fold 9:
./../datasets/Diabetes/split_8
Fold 10:
./../datasets/Diabetes/split_9


In [46]:
import io
import os
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import KFold
import numpy as np

file_path = "./../data/machineCPU.dat"
save_path_base = f"./../datasets/Machine CPU"

# 1. ファイルから全ての行をリストとして読み込む
with open(file_path, 'r') as f:
    lines = f.readlines()

filtered_lines = [
    line for line in lines 
    if not line.strip().startswith(("@"))
]

# 1. リストを単一の文字列に結合
data_as_string = "".join(filtered_lines)

# 2. 文字列をファイルのように扱えるように変換
string_io = io.StringIO(data_as_string)

# 3. pandas.read_csvで読み込み、DataFrameを作成
# header=None は、ファイルにヘッダー行がないことを示す
df = pd.read_csv(string_io, header=None)

# 4. (推奨) DataFrameに列名を設定する
df.columns = ["MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "PRP"]


# 特徴量とターゲットを分離
X = df[["MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX"]].values
y = df["PRP"].values

# 10分割交差検証の設定
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) # shuffle=True と random_state を指定して再現性を確保

# 各分割でのデータを格納するリスト (オプション)
# train_features_list = []
# train_target_list = []
# test_features_list = []
# test_target_list = []

print(f"Performing {n_splits}-fold cross-validation:\n")
for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    print(f"Fold {fold+1}:")

    train_features = X[train_index]
    train_target = y[train_index]
    test_features = X[test_index]
    test_target = y[test_index]

    # csv として保存
    save_path = os.path.join(save_path_base, f"split_{fold}")

    print(save_path)
    os.makedirs(save_path, exist_ok=True)
    np.savetxt(os.path.join(save_path, "train_features.csv"), train_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "train_target.csv"), train_target, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_features.csv"), test_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_target.csv"), test_target, delimiter=",")

Performing 10-fold cross-validation:

Fold 1:
./../datasets/Machine CPU/split_0
Fold 2:
./../datasets/Machine CPU/split_1
Fold 3:
./../datasets/Machine CPU/split_2
Fold 4:
./../datasets/Machine CPU/split_3
Fold 5:
./../datasets/Machine CPU/split_4
Fold 6:
./../datasets/Machine CPU/split_5
Fold 7:
./../datasets/Machine CPU/split_6
Fold 8:
./../datasets/Machine CPU/split_7
Fold 9:
./../datasets/Machine CPU/split_8
Fold 10:
./../datasets/Machine CPU/split_9


In [47]:
import io
import os
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import KFold
import numpy as np

file_path = "./../data/ele-1.dat"
save_path_base = f"./../datasets/ELE"

# 1. ファイルから全ての行をリストとして読み込む
with open(file_path, 'r') as f:
    lines = f.readlines()

filtered_lines = [
    line for line in lines 
    if not line.strip().startswith(("@"))
]

# 1. リストを単一の文字列に結合
data_as_string = "".join(filtered_lines)

# 2. 文字列をファイルのように扱えるように変換
string_io = io.StringIO(data_as_string)

# 3. pandas.read_csvで読み込み、DataFrameを作成
# header=None は、ファイルにヘッダー行がないことを示す
df = pd.read_csv(string_io, header=None)

# 4. (推奨) DataFrameに列名を設定する
df.columns = ["Inhabitants", "Distance", "Length"]


# 特徴量とターゲットを分離
X = df[["Inhabitants", "Distance"]].values
y = df["Length"].values

# 10分割交差検証の設定
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) # shuffle=True と random_state を指定して再現性を確保

# 各分割でのデータを格納するリスト (オプション)
# train_features_list = []
# train_target_list = []
# test_features_list = []
# test_target_list = []

print(f"Performing {n_splits}-fold cross-validation:\n")
for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    print(f"Fold {fold+1}:")

    train_features = X[train_index]
    train_target = y[train_index]
    test_features = X[test_index]
    test_target = y[test_index]

    # csv として保存
    save_path = os.path.join(save_path_base, f"split_{fold}")

    print(save_path)
    os.makedirs(save_path, exist_ok=True)
    np.savetxt(os.path.join(save_path, "train_features.csv"), train_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "train_target.csv"), train_target, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_features.csv"), test_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_target.csv"), test_target, delimiter=",")

Performing 10-fold cross-validation:

Fold 1:
./../datasets/ELE/split_0
Fold 2:
./../datasets/ELE/split_1
Fold 3:
./../datasets/ELE/split_2
Fold 4:
./../datasets/ELE/split_3
Fold 5:
./../datasets/ELE/split_4
Fold 6:
./../datasets/ELE/split_5
Fold 7:
./../datasets/ELE/split_6
Fold 8:
./../datasets/ELE/split_7
Fold 9:
./../datasets/ELE/split_8
Fold 10:
./../datasets/ELE/split_9


# UCI

In [50]:
import os
import numpy as np
from sklearn.model_selection import KFold
from ucimlrepo import fetch_ucirepo

# 1. データセットの取得
auto_mpg = fetch_ucirepo(id=9)
X_df = auto_mpg.data.features
y_df = auto_mpg.data.targets # これは 'mpg' 列を含むDataFrameです

# 2. 'horsepower' 列の欠損値処理
# 'horsepower' が欠損値でない行のマスクを作成
not_nan_mask = X_df['horsepower'].notna()

# マスクを特徴量とターゲットの両方のDataFrameに適用
X_clean_df = X_df[not_nan_mask]
y_clean_df = y_df[not_nan_mask]

# 3. NumPy配列への変換
X_np = X_clean_df.values
y_np = y_clean_df.values

# 4. 保存先の設定
save_path_base = f"./../datasets/MPG" # データセット名をMPGに変更

# 5. 10分割交差検証の設定
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) # 再現性のためにrandom_stateを設定

print(f"Performing {n_splits}-fold cross-validation for MPG dataset:\n")
for fold, (train_index, test_index) in enumerate(kf.split(X_np, y_np)):
    print(f"Fold {fold+1}:")

    train_features = X_np[train_index]
    train_target = y_np[train_index]
    test_features = X_np[test_index]
    test_target = y_np[test_index]

    print(train_features.shape, train_target.shape, test_features.shape, test_target.shape)

    # CSVとして保存
    save_path = os.path.join(save_path_base, f"split_{fold}")

    print(f"Saving to {save_path}")
    os.makedirs(save_path, exist_ok=True) # split_{fold} ディレクトリを作成
    np.savetxt(os.path.join(save_path, "train_features.csv"), train_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "train_target.csv"), train_target, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_features.csv"), test_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_target.csv"), test_target, delimiter=",")

print("\nMPG dataset processing complete.")

Performing 10-fold cross-validation for MPG dataset:

Fold 1:
(352, 7) (352, 1) (40, 7) (40, 1)
Saving to ./../datasets/MPG/split_0
Fold 2:
(352, 7) (352, 1) (40, 7) (40, 1)
Saving to ./../datasets/MPG/split_1
Fold 3:
(353, 7) (353, 1) (39, 7) (39, 1)
Saving to ./../datasets/MPG/split_2
Fold 4:
(353, 7) (353, 1) (39, 7) (39, 1)
Saving to ./../datasets/MPG/split_3
Fold 5:
(353, 7) (353, 1) (39, 7) (39, 1)
Saving to ./../datasets/MPG/split_4
Fold 6:
(353, 7) (353, 1) (39, 7) (39, 1)
Saving to ./../datasets/MPG/split_5
Fold 7:
(353, 7) (353, 1) (39, 7) (39, 1)
Saving to ./../datasets/MPG/split_6
Fold 8:
(353, 7) (353, 1) (39, 7) (39, 1)
Saving to ./../datasets/MPG/split_7
Fold 9:
(353, 7) (353, 1) (39, 7) (39, 1)
Saving to ./../datasets/MPG/split_8
Fold 10:
(353, 7) (353, 1) (39, 7) (39, 1)
Saving to ./../datasets/MPG/split_9

MPG dataset processing complete.


In [56]:
import os
import numpy as np
from sklearn.model_selection import KFold
from ucimlrepo import fetch_ucirepo

# 1. データセットの取得
concrete = fetch_ucirepo(id=165)
X_np = concrete.data.features.values
y_np = concrete.data.targets.values

# 4. 保存先の設定
save_path_base = f"./../datasets/Concrete" # データセット名をMPGに変更

# 5. 10分割交差検証の設定
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) # 再現性のためにrandom_stateを設定

print(f"Performing {n_splits}-fold cross-validation for MPG dataset:\n")
for fold, (train_index, test_index) in enumerate(kf.split(X_np, y_np)):
    print(f"Fold {fold+1}:")

    train_features = X_np[train_index]
    train_target = y_np[train_index]
    test_features = X_np[test_index]
    test_target = y_np[test_index]

    print(train_features.shape, train_target.shape, test_features.shape, test_target.shape)

    # 更にここから，訓練データは 320 インスタンス，テストデータは 80 インスタンスのみ抽出する
    train_features = train_features[:320]
    train_target = train_target[:320]
    test_features = test_features[:80]
    test_target = test_target[:80]

    print(train_features.shape, train_target.shape, test_features.shape, test_target.shape)

    # CSVとして保存
    save_path = os.path.join(save_path_base, f"split_{fold}")

    print(f"Saving to {save_path}")
    os.makedirs(save_path, exist_ok=True) # split_{fold} ディレクトリを作成
    np.savetxt(os.path.join(save_path, "train_features.csv"), train_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "train_target.csv"), train_target, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_features.csv"), test_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_target.csv"), test_target, delimiter=",")

print("\nMPG dataset processing complete.")

Performing 10-fold cross-validation for MPG dataset:

Fold 1:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_0
Fold 2:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_1
Fold 3:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_2
Fold 4:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_3
Fold 5:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_4
Fold 6:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_5
Fold 7:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_6
Fold 8:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/sp

In [73]:
import os
import numpy as np
from sklearn.model_selection import KFold
from ucimlrepo import fetch_ucirepo

# 1. データセットの取得
concrete = fetch_ucirepo(id=165)
X_np = concrete.data.features.values
y_np = concrete.data.targets.values

# 4. 保存先の設定
save_path_base = f"./../datasets/Concrete" # データセット名をMPGに変更

# 5. 10分割交差検証の設定
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) # 再現性のためにrandom_stateを設定

print(f"Performing {n_splits}-fold cross-validation for Concrete dataset:\n")
for fold, (train_index, test_index) in enumerate(kf.split(X_np, y_np)):
    print(f"Fold {fold+1}:")

    train_features = X_np[train_index]
    train_target = y_np[train_index]
    test_features = X_np[test_index]
    test_target = y_np[test_index]

    print(train_features.shape, train_target.shape, test_features.shape, test_target.shape)

    # 更にここから，訓練データは 320 インスタンス，テストデータは 80 インスタンスのみ抽出する
    train_features = train_features[:320]
    train_target = train_target[:320]
    test_features = test_features[:80]
    test_target = test_target[:80]

    print(train_features.shape, train_target.shape, test_features.shape, test_target.shape)

    # CSVとして保存
    save_path = os.path.join(save_path_base, f"split_{fold}")

    print(f"Saving to {save_path}")
    os.makedirs(save_path, exist_ok=True) # split_{fold} ディレクトリを作成
    np.savetxt(os.path.join(save_path, "train_features.csv"), train_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "train_target.csv"), train_target, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_features.csv"), test_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_target.csv"), test_target, delimiter=",")

print("\nConcrete dataset processing complete.")

Performing 10-fold cross-validation for Concrete dataset:

Fold 1:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_0
Fold 2:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_1
Fold 3:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_2
Fold 4:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_3
Fold 5:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_4
Fold 6:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_5
Fold 7:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concrete/split_6
Fold 8:
(927, 8) (927, 1) (103, 8) (103, 1)
(320, 8) (320, 1) (80, 8) (80, 1)
Saving to ./../datasets/Concre

In [83]:
import os
import numpy as np
from sklearn.model_selection import KFold
from ucimlrepo import fetch_ucirepo

# 1. データセットの取得
# Capital Bike Sharing (hourly＋daily rental counts)
bike_sharing = fetch_ucirepo(id=275)

features_selected = bike_sharing.data.original.drop(columns=["dteday"])
target = bike_sharing.data.original["cnt"]

X_np = features_selected.values
y_np = target.values.reshape(-1, 1)  # ターゲットを2次元配列に変換

# 4. 保存先の設定
save_path_base = f"./../datasets/Bike" # データセット名をMPGに変更

# 5. 10分割交差検証の設定
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) # 再現性のためにrandom_stateを設定

print(f"Performing {n_splits}-fold cross-validation for Bike dataset:\n")
for fold, (train_index, test_index) in enumerate(kf.split(X_np, y_np)):
    print(f"Fold {fold+1}:")

    train_features = X_np[train_index]
    train_target = y_np[train_index]
    test_features = X_np[test_index]
    test_target = y_np[test_index]

    print(train_features.shape, train_target.shape, test_features.shape, test_target.shape)

    # 更にここから，訓練データは 320 インスタンス，テストデータは 80 インスタンスのみ抽出する
    train_features = train_features[:320]
    train_target = train_target[:320]
    test_features = test_features[:80]
    test_target = test_target[:80]

    print(train_features.shape, train_target.shape, test_features.shape, test_target.shape)

    # CSVとして保存
    save_path = os.path.join(save_path_base, f"split_{fold}")

    print(f"Saving to {save_path}")
    os.makedirs(save_path, exist_ok=True) # split_{fold} ディレクトリを作成
    np.savetxt(os.path.join(save_path, "train_features.csv"), train_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "train_target.csv"), train_target, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_features.csv"), test_features, delimiter=",")
    np.savetxt(os.path.join(save_path, "test_target.csv"), test_target, delimiter=",")

print("\nBike dataset processing complete.")

Performing 10-fold cross-validation for Bike dataset:

Fold 1:
(15641, 16) (15641, 1) (1738, 16) (1738, 1)
(320, 16) (320, 1) (80, 16) (80, 1)
Saving to ./../datasets/Bike/split_0
Fold 2:
(15641, 16) (15641, 1) (1738, 16) (1738, 1)
(320, 16) (320, 1) (80, 16) (80, 1)
Saving to ./../datasets/Bike/split_1
Fold 3:
(15641, 16) (15641, 1) (1738, 16) (1738, 1)
(320, 16) (320, 1) (80, 16) (80, 1)
Saving to ./../datasets/Bike/split_2
Fold 4:
(15641, 16) (15641, 1) (1738, 16) (1738, 1)
(320, 16) (320, 1) (80, 16) (80, 1)
Saving to ./../datasets/Bike/split_3
Fold 5:
(15641, 16) (15641, 1) (1738, 16) (1738, 1)
(320, 16) (320, 1) (80, 16) (80, 1)
Saving to ./../datasets/Bike/split_4
Fold 6:
(15641, 16) (15641, 1) (1738, 16) (1738, 1)
(320, 16) (320, 1) (80, 16) (80, 1)
Saving to ./../datasets/Bike/split_5
Fold 7:
(15641, 16) (15641, 1) (1738, 16) (1738, 1)
(320, 16) (320, 1) (80, 16) (80, 1)
Saving to ./../datasets/Bike/split_6
Fold 8:
(15641, 16) (15641, 1) (1738, 16) (1738, 1)
(320, 16) (320, 1)

# Data Loader

In [3]:
# Dataset loader module for CSV splits
import os
import numpy as np


class DatasetLoader:

    def __init__(self, base_path):
        """
        Initializes the DatasetLoader with the base path where datasets are stored.
        
        Parameters:
            base_path (str): Root path where datasets/ lives.
        """
        self.base_path = base_path

    def list_datasets(self):
        """
        Returns a list of dataset names under the base_path directory.
        """
        return [name for name in os.listdir(self.base_path)
                if os.path.isdir(os.path.join(self.base_path, name))]

    def list_splits(self, dataset_name):
        """
        Returns a sorted list of split indices available for the given dataset.
        """
        ds_path = os.path.join(self.base_path, dataset_name)
        splits = []
        for name in os.listdir(ds_path):
            if name.startswith("split_"):
                try:
                    idx = int(name.split("_", 1)[1])
                    splits.append(idx)
                except ValueError:
                    pass
        return sorted(splits)

    def load_split(self, dataset_name, split_index):
        """
        Loads train/test features and targets for a specific split.
        
        Parameters:
            base_path (str): Root path where datasets/ lives.
            dataset_name (str): Name of the dataset folder.
            split_index (int): Split number (e.g., 0, 1, ...).
        
        Returns:
            X_train (ndarray), y_train (ndarray), X_test (ndarray), y_test (ndarray)
        """
        split_dir = os.path.join(self.base_path, dataset_name, f"split_{split_index}")
        X_train = np.loadtxt(os.path.join(split_dir, "train_features.csv"), delimiter=",")
        y_train = np.loadtxt(os.path.join(split_dir, "train_target.csv"),  delimiter=",")
        X_test  = np.loadtxt(os.path.join(split_dir, "test_features.csv"),  delimiter=",")
        y_test  = np.loadtxt(os.path.join(split_dir, "test_target.csv"),   delimiter=",")
        return X_train, y_train, X_test, y_test



base = "./../datasets"

loader = DatasetLoader(base)

# 利用可能なデータセット
print(loader.list_datasets())
# -> ['Bike', 'Concrete', 'Diabetes', ...]

# 'Bike' データセットの利用可能な分割
print(loader.list_splits("Bike"))
# -> [0, 1, 2, ..., 9]

# split 0 をロード
X_tr, y_tr, X_te, y_te = loader.load_split("Bike", 0)
print(X_tr.shape, y_tr.shape, X_te.shape, y_te.shape)

['Neal', 'Bike', 'Concrete', 'Machine CPU', 'ELE', 'Diabetes', 'MPG', 'Neal_XOutlier']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
(320, 16) (320,) (80, 16) (80,)
