# IOT Temperature Dataset - Preprocessing

This notebook preprocesses the IOT Temperature Readings dataset from OpenML.

**Dataset:** Temperature Readings from IOT Devices (OpenML ID: 43351)

**Preprocessing Steps:**
1. Download data from OpenML
2. Feature engineering (temporal features from timestamps)
3. Train/test split BEFORE preprocessing (NO DATA LEAKAGE)
4. Handle missing values with TRAIN statistics only
5. One-hot encoding for categorical variables
6. Save processed data to `../data/processed/` (the `data/processed` folder one level above the notebook's working directory)

**Target:** Temperature in °C (column `temp`)

## 1. Setup and Imports

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from datetime import datetime
import warnings
# Suppress all warnings from this point on.
# NOTE(review): a blanket 'ignore' also hides library warnings (deprecations,
# parsing notices) — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

print("✓ Imports successful!")

## 2. Configuration

In [None]:
# --- Filesystem layout ---
# The data folder lives one level above the current working directory;
# processed artifacts go into its 'processed' sub-folder.
DATA_DIR = Path.cwd().parent.joinpath('data')
PROCESSED_DIR = DATA_DIR.joinpath('processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)  # no-op when it already exists

# --- Experiment parameters ---
TARGET = 'temp'      # column to predict
TEST_SIZE = 0.2      # fraction of rows held out for the test split
RANDOM_STATE = 42    # seed passed to the train/test split

for label, folder in (("Data directory", DATA_DIR), ("Output directory", PROCESSED_DIR)):
    print(f"{label}: {folder.resolve()}")

## 3. Download Data from OpenML

Note: This requires the `openml` package. Install with: `pip install openml`

In [None]:
# Fail early, with an actionable message, when the OpenML client is missing.
try:
    import openml
    print("✓ OpenML package found")
except ImportError:
    print("ERROR: openml not installed.")
    print("Please run: pip install openml")
    raise

# Retrieve the dataset object by its OpenML ID.
print("\nDownloading IOT Temperature data from OpenML (ID: 43351)...")
dataset = openml.datasets.get_dataset(43351)

# The description is optional metadata; fall back to an empty string so the
# preview slice cannot raise TypeError on None.
description_preview = (dataset.description or "")[:200]

print(f"Dataset: {dataset.name}")
print(f"Description: {description_preview}...")
print(f"Target: {dataset.default_target_attribute}")

## 4. Load and Explore Data

In [None]:
# Split the OpenML dataset into features and target, both as pandas objects.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute,
    dataset_format='dataframe'
)

# Combine into single dataframe.
# When the dataset declares no default target, get_data returns y=None and the
# target values stay inside X. Assigning None would wipe that column, so only
# write the target column when a separate target was actually returned.
df = X.copy()
if y is not None:
    df[TARGET] = y
elif TARGET not in df.columns:
    raise KeyError(
        f"Dataset declares no default target and has no '{TARGET}' column; "
        f"available columns: {list(df.columns)}"
    )

print(f"\nData shape: {df.shape}")
print(f"\nFirst few rows:")
print(df.head())
print(f"\nData types:")
print(df.dtypes)
print(f"\nMissing values:")
print(df.isnull().sum())
print(f"\nTarget statistics:")
print(df[TARGET].describe())

## 5. Data Cleaning

In [None]:
# Drop rows that are exact copies of another row and report how many were removed.
n_rows_initial = df.shape[0]
df = df.drop_duplicates()
n_removed = n_rows_initial - df.shape[0]

print(f"Removed {n_removed} duplicate rows" if n_removed > 0 else "No duplicates found")

n_samples, n_columns = df.shape
print(f"\nFinal dataset: {n_samples} samples, {n_columns} columns")

## 6. Feature Engineering

Extract temporal features (hour, day of week, day of month, month) from the `noted_date` timestamp column, then drop the raw timestamp.

In [None]:
# Derive calendar features from the raw timestamp column, if the dataset has one.
if 'noted_date' in df.columns:
    print("Extracting temporal features from 'noted_date'...")

    # Convert to datetime. errors='coerce' turns unparseable values into NaT
    # instead of raising, so count them explicitly rather than losing them silently.
    # NOTE(review): no format/dayfirst is passed, so pandas decides how to read
    # ambiguous day/month strings — confirm this matches the source data.
    missing_before = df['noted_date'].isna().sum()
    df['noted_date'] = pd.to_datetime(df['noted_date'], errors='coerce')
    n_unparsed = int(df['noted_date'].isna().sum() - missing_before)
    if n_unparsed > 0:
        print(f"  WARNING: {n_unparsed} timestamps could not be parsed (set to NaT)")

    # Extract features (rows whose timestamp is NaT get NaN in every derived column)
    timestamps = df['noted_date'].dt
    df['hour'] = timestamps.hour
    df['day_of_week'] = timestamps.dayofweek
    df['day_of_month'] = timestamps.day
    df['month'] = timestamps.month

    print("  ✓ Extracted: hour, day_of_week, day_of_month, month")

    # Drop original timestamp
    df = df.drop('noted_date', axis=1)
    print("  ✓ Dropped original 'noted_date' column")

print(f"\nFeatures after engineering: {df.shape[1]} columns")
print(df.head())

## 7. Separate Features and Target

In [None]:
banner = "=" * 60
print(banner)
print("STEP 1: Separate features and target")
print(banner)

# Columns excluded from the model inputs: the target itself and, when the
# frame has one, the 'id' column.
has_id = 'id' in df.columns
cols_to_drop = [TARGET, 'id'] if has_id else [TARGET]
if has_id:
    print("Dropped 'id' column (unique identifier)")

X = df.drop(columns=cols_to_drop, errors='ignore')
y = df[TARGET].values.astype(np.float64)

y_low, y_high = y.min(), y.max()
print(f"Features: {X.shape[1]} columns")
print(f"Target: {TARGET} (range: {y_low:.1f}°C - {y_high:.1f}°C)")
print(f"\nFeature columns: {list(X.columns)}")

## 8. Train/Test Split

In [None]:
banner = "=" * 60
print("\n" + banner)
print("STEP 2: Train/test split")
print(banner)

# Random hold-out split; the fixed seed makes the partition reproducible.
split_parts = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = split_parts

print(f"Train: {X_train.shape[0]} samples")
print(f"Test: {X_test.shape[0]} samples")

train_pct = 100*(1-TEST_SIZE)
test_pct = 100*TEST_SIZE
print(f"\nSplit ratio: {train_pct:.0f}/{test_pct:.0f}")

## 9. Identify Column Types and Store Train Statistics

In [None]:
banner = "=" * 60
print("\n" + banner)
print("STEP 3: Store preprocessing statistics (TRAIN only)")
print(banner)

# Partition the feature columns by dtype, looking at the TRAIN split only.
numeric_cols = list(X_train.select_dtypes(include=[np.number]).columns)
categorical_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)

print(f"Numeric: {len(numeric_cols)}, Categorical: {len(categorical_cols)}")
print(f"\nNumeric columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")

# Per-column imputation statistics, computed from TRAIN only.
# Numeric columns store their median.
train_stats = {col: {'median': X_train[col].median()} for col in numeric_cols}

# Categorical columns store their most frequent value ('Unknown' when the column
# has no mode) and the list of levels observed in TRAIN.
for col in categorical_cols:
    train_column = X_train[col]
    modes = train_column.mode()
    most_frequent = 'Unknown' if modes.empty else modes.iloc[0]
    train_stats[col] = {
        'mode': most_frequent,
        'categories': train_column.dropna().unique().tolist(),
    }

print(f"\nStored stats for {len(train_stats)} columns (from TRAIN only)")

## 10. Apply Preprocessing

Use the statistics computed on the TRAIN split to transform both the train and test sets.

In [None]:
# Section banner for the preprocessing step.
banner = "=" * 60
print("\n" + banner)
print("STEP 4: Apply preprocessing")
print(banner)

def preprocess(X_df, stats, num_cols, cat_cols):
    """Impute missing values and restrict categories using pre-computed statistics.

    Args:
        X_df: DataFrame to clean. It is copied; the caller's frame is not modified.
        stats: Mapping of column name to a dict of statistics. Entries for
            numeric columns provide 'median'; entries for categorical columns
            provide 'mode' and 'categories' (the allowed levels).
        num_cols: Names of the numeric columns to impute.
        cat_cols: Names of the categorical columns to impute and restrict.

    Returns:
        A new DataFrame in which missing numeric values are replaced by the
        stored median, and missing or unlisted categorical values are replaced
        by the stored mode. Columns without an entry in ``stats`` are left
        unchanged.
    """
    X_out = X_df.copy()

    # Fill missing numeric values with the stored median
    for col in num_cols:
        if col in stats:
            X_out[col] = X_out[col].fillna(stats[col]['median'])

    # Fill missing categorical values with the stored mode, then map every
    # level that is not in the stored category list to that same mode
    for col in cat_cols:
        if col in stats:
            mode_val = stats[col]['mode']
            # Build the lookup once per column: set membership is O(1) per
            # value instead of a linear scan of the category list.
            known = set(stats[col]['categories'])
            X_out[col] = X_out[col].fillna(mode_val)
            X_out[col] = X_out[col].apply(
                lambda x, known=known, mode_val=mode_val: x if x in known else mode_val
            )

    return X_out

# Run both splits through the same cleaning routine, always with the TRAIN statistics.
X_train_clean, X_test_clean = (
    preprocess(split, train_stats, numeric_cols, categorical_cols)
    for split in (X_train, X_test)
)

# Report how many missing cells remain in each cleaned split.
remaining_train = X_train_clean.isna().sum().sum()
remaining_test = X_test_clean.isna().sum().sum()
print(f"Missing after preprocessing:")
print(f"  Train: {remaining_train}")
print(f"  Test: {remaining_test}")

## 11. One-Hot Encoding

In [None]:
print("\n" + "=" * 60)
print("STEP 5: One-Hot Encoding")
print("=" * 60)

# TRAIN defines the feature space: for each categorical column its first level
# is dropped and acts as the all-zeros baseline.
X_train_encoded = pd.get_dummies(
    X_train_clean,
    columns=categorical_cols,
    drop_first=True,
    dtype=int
)

# Encode TEST with every level kept. Dropping the first level independently
# here would remove whichever level comes first in TEST, which is not
# necessarily TRAIN's baseline; that level would then be zero-filled during
# alignment and its rows would be indistinguishable from baseline rows.
X_test_encoded = pd.get_dummies(
    X_test_clean,
    columns=categorical_cols,
    drop_first=False,
    dtype=int
)

# Align TEST to TRAIN's columns: dummies absent from TEST are added as 0, and
# columns TRAIN does not have (including TRAIN's dropped baseline) are discarded.
train_cols = X_train_encoded.columns.tolist()
X_test_encoded = X_test_encoded.reindex(columns=train_cols, fill_value=0)

print(f"Final shapes:")
print(f"  Train: {X_train_encoded.shape}")
print(f"  Test: {X_test_encoded.shape}")
print(f"\nFinal features: {list(X_train_encoded.columns)}")

## 12. Convert to NumPy Arrays

In [None]:
# Materialise the encoded frames as float64 matrices and keep the column order.
X_train_arr = X_train_encoded.to_numpy().astype(np.float64)
X_test_arr = X_test_encoded.to_numpy().astype(np.float64)
feature_names = list(X_train_encoded.columns)

# Combined (for analysis notebooks): train rows first, test rows after.
X_combined = np.concatenate([X_train_arr, X_test_arr], axis=0)
y_combined = np.concatenate((y_train, y_test))

print(f"Array shapes:")
for label, arr in (
    ("X_train", X_train_arr),
    ("X_test", X_test_arr),
    ("X_combined", X_combined),
    ("y_train", y_train),
    ("y_test", y_test),
    ("y_combined", y_combined),
):
    print(f"  {label}: {arr.shape}")

## 13. Save Processed Data

In [None]:
banner = "=" * 60
print("\n" + banner)
print("SAVING PROCESSED DATA")
print(banner)

# File name -> array, written in this order: combined set first, then the
# separate train/test splits.
artifacts = {
    'iot_temp_X.npy': X_combined,
    'iot_temp_y.npy': y_combined,
    'iot_temp_feature_names.npy': np.array(feature_names),
    'iot_temp_X_train.npy': X_train_arr,
    'iot_temp_X_test.npy': X_test_arr,
    'iot_temp_y_train.npy': y_train,
    'iot_temp_y_test.npy': y_test,
}
for filename, array in artifacts.items():
    np.save(PROCESSED_DIR / filename, array)

print(f"Saved to {PROCESSED_DIR}:")
print(f"  Combined: X={X_combined.shape}, y={y_combined.shape}")
print(f"  Train: X={X_train_arr.shape}")
print(f"  Test: X={X_test_arr.shape}")
print(f"  Features: {len(feature_names)}")

# Also save a CSV version for easy inspection: features plus the target as last column.
csv_path = PROCESSED_DIR.joinpath('iot_temp_processed.csv')
df_combined = pd.DataFrame(X_combined, columns=feature_names)
df_combined[TARGET] = y_combined
df_combined.to_csv(csv_path, index=False)

print(f"  CSV: {csv_path.name}")

## 14. Summary

In [None]:
# Assemble the closing report line by line and emit it in one call.
n_total, n_features = X_combined.shape
summary_lines = [
    "\n" + "=" * 60,
    "PREPROCESSING COMPLETE!",
    "=" * 60,
    "\nDataset: IOT Temperature Readings",
    f"Total: {n_total} samples, {n_features} features",
    f"Target: {TARGET} (range: {y_combined.min():.1f}°C - {y_combined.max():.1f}°C)",
    "\n✓ NO DATA LEAKAGE:",
    "  - Train/test split done BEFORE preprocessing",
    "  - Missing values filled with TRAIN statistics only",
    "  - Categorical encoding based on TRAIN categories",
    "  - Temporal features extracted from timestamps",
    f"\n✓ Files saved to: {PROCESSED_DIR}",
    "\n✓ Ready for analysis!",
]
print("\n".join(summary_lines))

## 15. Verification - Load and Check Saved Data

In [None]:
# Reload the saved train/test artifacts and check that they agree with each other.
print("Verifying saved files...\n")

X_train_loaded = np.load(PROCESSED_DIR / 'iot_temp_X_train.npy')
X_test_loaded = np.load(PROCESSED_DIR / 'iot_temp_X_test.npy')
y_train_loaded = np.load(PROCESSED_DIR / 'iot_temp_y_train.npy')
y_test_loaded = np.load(PROCESSED_DIR / 'iot_temp_y_test.npy')
features_loaded = np.load(PROCESSED_DIR / 'iot_temp_feature_names.npy', allow_pickle=True)

print("✓ All files loaded successfully!")
print(f"\nLoaded shapes:")
print(f"  X_train: {X_train_loaded.shape}")
print(f"  X_test: {X_test_loaded.shape}")
print(f"  y_train: {y_train_loaded.shape}")
print(f"  y_test: {y_test_loaded.shape}")
print(f"  features: {len(features_loaded)}")

# Printing shapes alone verifies nothing; assert that the files are mutually
# consistent so a stale or mismatched artifact is caught here.
assert X_train_loaded.shape[0] == y_train_loaded.shape[0], \
    "X_train and y_train have different numbers of rows"
assert X_test_loaded.shape[0] == y_test_loaded.shape[0], \
    "X_test and y_test have different numbers of rows"
assert X_train_loaded.shape[1] == X_test_loaded.shape[1] == len(features_loaded), \
    "Feature count differs between X_train, X_test and the saved feature names"