In [None]:
# ! kaggle competitions download -c new-york-city-taxi-fare-prediction

In [5]:
import os
import pandas as pd
from sklearn.datasets import fetch_openml
import zipfile

def download_and_save_datasets(output_dir='temp'):
    """
    Downloads, processes, and saves the datasets used in the
    "Sparse Variational Student-t Processes" paper.

    Seven tabular datasets are fetched from the UCI archive or OpenML and
    written to ``output_dir`` as CSV files; the eighth (NYC taxi fares) is
    downloaded through the Kaggle API and unzipped into the same directory.

    Every step runs inside its own ``try``/``except``: a failure is printed
    and the remaining steps still run, so the function itself does not
    propagate download errors.

    Args:
        output_dir (str): Directory that receives the files. It is created
            if it does not exist yet.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # --- Configuration ---
    # Create a directory to store the datasets
    os.makedirs(output_dir, exist_ok=True)
    print(f"Datasets will be saved in the '{output_dir}/' directory.")

    # --- 1. Concrete Slump Test Data ---
    try:
        print("\n[1/8] Downloading Concrete Data...")
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data"
        # The file is comma-separated and has a header on the first line
        df = pd.read_csv(url)
        # Clean up column names by removing extra spaces
        df.columns = df.columns.str.strip()
        df.to_csv(os.path.join(output_dir, 'concrete_slump.csv'), index=False)
        print(" -> Success: Saved concrete_slump.csv")
    except Exception as e:
        print(f" -> Failed to download Concrete data. Error: {e}")

    # --- 2. Boston Housing Data ---
    try:
        print("\n[2/8] Downloading Boston Housing Data...")
        # The original UCI dataset is deprecated. We fetch a processed version from OpenML.
        boston = fetch_openml(name='boston', version=1, as_frame=True, parser='liac-arff')
        df_boston = boston.frame
        df_boston.to_csv(os.path.join(output_dir, 'boston_housing.csv'), index=False)
        print(" -> Success: Saved boston_housing.csv")
        print(" -> Note: The Boston Housing dataset has known ethical concerns.")
    except Exception as e:
        print(f" -> Failed to download Boston Housing data. Error: {e}")

    # --- 3. Kin8nm Data ---
    try:
        print("\n[3/8] Downloading Kin8nm Data...")
        # This dataset is available on OpenML
        kin8nm = fetch_openml(name='kin8nm', version=1, as_frame=True, parser='liac-arff')
        df_kin8nm = kin8nm.frame
        df_kin8nm.to_csv(os.path.join(output_dir, 'kin8nm.csv'), index=False)
        print(" -> Success: Saved kin8nm.csv")
    except Exception as e:
        print(f" -> Failed to download Kin8nm data. Error: {e}")

    # --- 4. Yacht Hydrodynamics Data ---
    try:
        print("\n[4/8] Downloading Yacht Hydrodynamics Data...")
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data"
        # The data is space-separated and has no header
        column_names = [
            'longitudinal_pos', 'prismatic_coeff', 'length_displacement_ratio',
            'beam_draught_ratio', 'length_beam_ratio', 'froude_number',
            'residuary_resistance'
        ]
        # sep=r'\s+' is the documented replacement for the deprecated
        # delim_whitespace=True and parses the file the same way without
        # triggering a FutureWarning on recent pandas versions.
        df = pd.read_csv(url, sep=r'\s+', header=None, names=column_names)
        df.to_csv(os.path.join(output_dir, 'yacht_hydrodynamics.csv'), index=False)
        print(" -> Success: Saved yacht_hydrodynamics.csv")
    except Exception as e:
        print(f" -> Failed to download Yacht data. Error: {e}")

    # --- 5. Energy Efficiency Data ---
    try:
        print("\n[5/8] Downloading Energy Efficiency Data...")
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"
        # The data is in an Excel file
        df = pd.read_excel(url, engine='openpyxl')
        df.to_csv(os.path.join(output_dir, 'energy_efficiency.csv'), index=False)
        print(" -> Success: Saved energy_efficiency.csv")
    except Exception as e:
        print(f" -> Failed to download Energy Efficiency data. Error: {e}")

    # --- 6. Elevators Data ---
    try:
        print("\n[6/8] Downloading Elevators Data...")
        # This dataset is available on OpenML
        elevators = fetch_openml(name='elevators', version=1, as_frame=True, parser='liac-arff')
        df_elevators = elevators.frame
        df_elevators.to_csv(os.path.join(output_dir, 'elevators.csv'), index=False)
        print(" -> Success: Saved elevators.csv")
    except Exception as e:
        print(f" -> Failed to download Elevators data. Error: {e}")

    # --- 7. Protein Tertiary Structure Data ---
    try:
        print("\n[7/8] Downloading Protein Structure Data...")
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv"
        # The data is in a CSV file with a header
        df = pd.read_csv(url)
        df.to_csv(os.path.join(output_dir, 'protein_structure.csv'), index=False)
        print(" -> Success: Saved protein_structure.csv")
    except Exception as e:
        print(f" -> Failed to download Protein Structure data. Error: {e}")

    # --- 8. Taxi Trip Fare Data (Kaggle) ---
    # NOTE(review): step 4 of the printed instructions still talks about
    # uncommenting a code block, but the download code below is active and
    # runs as part of this function.
    print("\n[8/8] Instructions for Taxi Trip Fare Data:")
    print(" -> This dataset is from the 'New York City Taxi Fare Prediction' Kaggle competition.")
    print(" -> Due to its size, it must be downloaded using the Kaggle API.")
    print(" -> Instructions:")
    print("    1. Install the Kaggle library: pip install kaggle")
    print("    2. Go to your Kaggle account, 'Settings' page, and click 'Create New Token'.")
    print("    3. Place the downloaded 'kaggle.json' file in the required location (e.g., '~/.kaggle/').")
    print("    4. Uncomment and run the code block below in a separate script or cell.")

    # --- Taxi data download via the Kaggle API ---
    try:
        # Imported here, inside the try, so that a missing 'kaggle' package is
        # reported like any other failure instead of breaking the whole cell.
        import kaggle
        print("\nAttempting to download Taxi Fare data via Kaggle API...")
        # Authenticate with kaggle.json
        kaggle.api.authenticate()
        # Download the dataset files
        kaggle.api.competition_download_files(
            'new-york-city-taxi-fare-prediction',
            path=output_dir,
            quiet=False
        )
        # Unzip the downloaded file
        zip_path = os.path.join(output_dir, 'new-york-city-taxi-fare-prediction.zip')
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
        os.remove(zip_path) # Clean up the zip file
        print(f" -> Success: Taxi data downloaded and extracted to '{output_dir}/'")
    except Exception as e:
        print(f" -> Kaggle API download failed. Please ensure 'kaggle.json' is set up correctly. Error: {e}")

    print("\n\nAll tasks complete.")

# Entry point for this cell: fetch every dataset into the local 'temp' folder.
if __name__ == '__main__':
    dataset_dir = "temp"
    download_and_save_datasets(dataset_dir)

Datasets will be saved in the 'svtp_datasets/' directory.

[1/8] Downloading Concrete Data...
 -> Success: Saved concrete_slump.csv

[2/8] Downloading Boston Housing Data...
 -> Success: Saved boston_housing.csv
 -> Note: The Boston Housing dataset has known ethical concerns.

[3/8] Downloading Kin8nm Data...
 -> Success: Saved kin8nm.csv

[4/8] Downloading Yacht Hydrodynamics Data...


  df = pd.read_csv(url, delim_whitespace=True, header=None, names=column_names)


 -> Success: Saved yacht_hydrodynamics.csv

[5/8] Downloading Energy Efficiency Data...
 -> Success: Saved energy_efficiency.csv

[6/8] Downloading Elevators Data...
 -> Success: Saved elevators.csv

[7/8] Downloading Protein Structure Data...
 -> Success: Saved protein_structure.csv

[8/8] Instructions for Taxi Trip Fare Data:
 -> This dataset is from the 'New York City Taxi Fare Prediction' Kaggle competition.
 -> Due to its size, it must be downloaded using the Kaggle API.
 -> Instructions:
    1. Install the Kaggle library: pip install kaggle
    2. Go to your Kaggle account, 'Settings' page, and click 'Create New Token'.
    3. Place the downloaded 'kaggle.json' file in the required location (e.g., '~/.kaggle/').
    4. Uncomment and run the code block below in a separate script or cell.

Attempting to download Taxi Fare data via Kaggle API...
Downloading new-york-city-taxi-fare-prediction.zip to svtp_datasets


100%|██████████| 1.56G/1.56G [00:00<00:00, 3.68GB/s]



 -> Success: Taxi data downloaded and extracted to 'svtp_datasets/'


All tasks complete.


In [None]:
import os
import pandas as pd

dataset_dir = 'temp'

# Display name -> CSV file name expected inside `dataset_dir`.
dataset_dict = {
    "Boston": "boston_housing.csv",
    "Concrete": "concrete_slump.csv",
    "Kin8nm": "kin8nm.csv",
    "Yacht": "yacht_hydrodynamics.csv",
    "Energy": "energy_efficiency.csv",
    "Elevators": "elevators.csv",
    "Protein": "protein_structure.csv",
    "Taxi": "train.csv"  # Assuming you have downloaded the taxi data manually
}

# For every expected file: report whether it exists, count its lines, and
# preview the first rows.
for dataset_name, filename in dataset_dict.items():
    file_path = os.path.join(dataset_dir, filename)

    # Nothing to inspect if the file is missing: report it and go to the next one.
    if not os.path.exists(file_path):
        print(f"{dataset_name} dataset not found. Please ensure it has been downloaded and saved correctly.")
        continue

    print(f"{dataset_name} dataset found at: {file_path}")

    # Count the lines of the file (header included); streamed so that large
    # files are never loaded into memory at once.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            line_count = sum(1 for _ in f)
    except Exception as e:
        print(f"Error counting lines in {dataset_name} dataset: {e}")
    else:
        print(f"{dataset_name} total lines (including header): {line_count}")

    # Load only the first 10 rows for a quick look at shape and columns.
    try:
        df = pd.read_csv(file_path, nrows=10)
        print(f"name: {dataset_name}, shape: {df.shape}, columns: {list(df.columns)}")
        print(df.head())
    except Exception as e:
        print(f"Error reading {dataset_name} dataset: {e}")

    # Two blank lines between datasets.
    print("\n")


In [7]:
import pandas as pd
from sklearn.model_selection import KFold
import os

def _standardize_column_name(name):
    """Normalize a column label: trim, lower-case, spaces -> '_', drop '(', ')' and '.'."""
    cleaned = str(name).strip().lower().replace(' ', '_')
    for unwanted in '().':
        cleaned = cleaned.replace(unwanted, '')
    return cleaned


def split_and_save_dataset(source_file_path, output_dir, dataset_name, target_column, n_splits=10):
    """
    Loads a single dataset, performs k-fold cross-validation splitting, and saves
    the train/test sets for each fold into a structured directory.

    Column names are normalized (see ``_standardize_column_name``) before the
    target is looked up, rows containing any missing value are dropped, and
    every column other than the target is written out as a feature.

    Problems with the input (missing/unreadable file, unknown target column,
    fewer complete rows than folds) are reported on stdout and the dataset is
    skipped; the function then returns without writing any split.

    Output layout::

        <output_dir>/<dataset_name>/split_<k>/train_features.csv
        <output_dir>/<dataset_name>/split_<k>/train_target.csv
        <output_dir>/<dataset_name>/split_<k>/test_features.csv
        <output_dir>/<dataset_name>/split_<k>/test_target.csv

    Args:
        source_file_path (str): The full path to the source dataset CSV file.
        output_dir (str): The root directory where the split data will be saved.
        dataset_name (str): The name for the dataset's subdirectory (e.g., 'Concrete').
        target_column (str): The name of the column to be used as the target/label (y).
        n_splits (int): The number of folds to create.

    Returns:
        None.
    """
    # --- 1. Create Output Directory for the specific dataset ---
    dataset_dir = os.path.join(output_dir, dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)
    print(f"Directory for '{dataset_name}' is ready at '{dataset_dir}'")

    # --- 2. Load the Dataset ---
    print(f"--> Attempting to load data from: '{source_file_path}'")
    try:
        df = pd.read_csv(source_file_path)
        print(f"--> Successfully loaded with {len(df)} rows.")
    except FileNotFoundError:
        print(f"--> [ERROR] File Not Found: The file '{source_file_path}' does not exist.")
        print(f"--> SKIPPING '{dataset_name}'.\n")
        return
    except Exception as e:
        print(f"--> [ERROR] Could not read the file. Reason: {e}")
        print(f"--> SKIPPING '{dataset_name}'.\n")
        return

    # --- 3. Clean and Separate Data ---
    original_columns = list(df.columns)
    df.columns = [_standardize_column_name(col) for col in df.columns]
    standardized_target = _standardize_column_name(target_column)

    if standardized_target not in df.columns:
        print(f"--> [ERROR] Target column '{target_column}' (standardized to '{standardized_target}') was not found.")
        print(f"--> Original columns were: {original_columns}")
        print(f"--> Standardized columns are: {list(df.columns)}")
        print(f"--> SKIPPING '{dataset_name}'.\n")
        return

    # Keep only complete rows and renumber them so positional indices are contiguous.
    df = df.dropna().reset_index(drop=True)

    # KFold cannot produce more folds than there are samples; report and skip
    # here (like the other failure paths) instead of letting its ValueError
    # abort the processing of all remaining datasets.
    if len(df) < n_splits:
        print(f"--> [ERROR] Only {len(df)} complete rows remain after dropping missing values; "
              f"at least {n_splits} are needed for {n_splits} splits.")
        print(f"--> SKIPPING '{dataset_name}'.\n")
        return

    # NOTE(review): every non-target column becomes a feature; for files that
    # hold several response columns this includes the other responses - confirm
    # that this is intended.
    X = df.drop(columns=[standardized_target])
    y = df[standardized_target]

    # --- 4. Set up and Perform K-Fold Cross-Validation ---
    # Fixed seed so the folds are reproducible across runs.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    print(f"--> Creating {n_splits} splits for '{dataset_name}'...")
    for fold_number, (train_index, test_index) in enumerate(kf.split(X)):
        split_dir = os.path.join(dataset_dir, f'split_{fold_number}')
        os.makedirs(split_dir, exist_ok=True)

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        X_train.to_csv(os.path.join(split_dir, 'train_features.csv'), index=False)
        y_train.to_csv(os.path.join(split_dir, 'train_target.csv'), index=False, header=True)
        X_test.to_csv(os.path.join(split_dir, 'test_features.csv'), index=False)
        y_test.to_csv(os.path.join(split_dir, 'test_target.csv'), index=False, header=True)

    print(f"--> Finished saving {n_splits} splits for '{dataset_name}'.\n")


if __name__ == '__main__':
    # --- Configuration ---
    # Number of cross-validation folds written per dataset. Five folds are
    # generated (the original note here states that the paper used 5-fold
    # cross-validation).
    num_splits = 5

    # Directory holding the source CSV files.
    source_data_dir = 'temp'

    # Root directory that receives the per-dataset split folders.
    output_data_dir = 'dataset_xu_2024'

    # Dataset name -> (source file name, target column name).
    datasets_to_process = dict(
        Concrete=('concrete_slump.csv', 'SLUMP(cm)'),
        Boston=('boston_housing.csv', 'MEDV'),
        Kin8nm=('kin8nm.csv', 'y'),
        Yacht=('yacht_hydrodynamics.csv', 'residuary_resistance'),
        Energy=('energy_efficiency.csv', 'Y1'),
        Elevators=('elevators.csv', 'Goal'),
        Protein=('protein_structure.csv', 'RMSD'),
        Taxi=('train.csv', 'fare_amount'),
    )

    # --- Main Loop ---
    banner = "=" * 50
    for name, (filename, target) in datasets_to_process.items():
        print(banner)
        print(f"PROCESSING DATASET: {name}")
        print(banner)

        # Full path of this dataset's source file.
        full_source_path = os.path.join(source_data_dir, filename)

        split_and_save_dataset(
            full_source_path,
            output_data_dir,
            name,
            target,
            n_splits=num_splits,
        )


PROCESSING DATASET: Concrete
Directory for 'Concrete' is ready at 'dataset_xu_2024/Concrete'
--> Attempting to load data from: 'temp/concrete_slump.csv'
--> Successfully loaded with 103 rows.
--> Creating 5 splits for 'Concrete'...
--> Finished saving 5 splits for 'Concrete'.

PROCESSING DATASET: Boston
Directory for 'Boston' is ready at 'dataset_xu_2024/Boston'
--> Attempting to load data from: 'temp/boston_housing.csv'
--> Successfully loaded with 506 rows.
--> Creating 5 splits for 'Boston'...
--> Finished saving 5 splits for 'Boston'.

PROCESSING DATASET: Kin8nm
Directory for 'Kin8nm' is ready at 'dataset_xu_2024/Kin8nm'
--> Attempting to load data from: 'temp/kin8nm.csv'
--> Successfully loaded with 8192 rows.
--> Creating 5 splits for 'Kin8nm'...
--> Finished saving 5 splits for 'Kin8nm'.

PROCESSING DATASET: Yacht
Directory for 'Yacht' is ready at 'dataset_xu_2024/Yacht'
--> Attempting to load data from: 'temp/yacht_hydrodynamics.csv'
--> Successfully loaded with 308 rows.
--> C

Boston dataset found at: svtp_datasets/boston_housing.csv
Boston total lines (including header): 507
name: Boston, shape: (10, 14), columns: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


Concrete dataset found at: svtp_datasets/concrete_slump.csv
Concrete total lines (including header): 104
nam