In [1]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and numerical warnings from
# pandas/sklearn — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Simple confirmation that the import cell ran to completion.
print("Libraries imported successfully!")

Libraries imported successfully!


## 1. Load Dataset

In [2]:
# Column layout: the target `year` first, then timbre_avg_1..12 and timbre_cov_1..78.
columns = [
    'year',
    *(f'timbre_avg_{i}' for i in range(1, 13)),
    *(f'timbre_cov_{i}' for i in range(1, 79)),
]

# The file is read as header-less, so the names above are supplied explicitly.
data_path = 'data/raw/dataset.csv'
df = pd.read_csv(data_path, names=columns, header=None)

print(f"Original dataset shape: {df.shape}")

Original dataset shape: (515345, 91)


## 2. Handle Duplicates

In [3]:
# Flag every row that exactly repeats an earlier row (the first occurrence is not flagged).
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Duplicate rows before removal: {duplicates}")

# Keep only the unflagged rows; this matches drop_duplicates() with its defaults.
df = df[~duplicate_mask]
print(f"Dataset shape after removing duplicates: {df.shape}")

Duplicate rows before removal: 214
Dataset shape after removing duplicates: (515131, 91)


## 3. Separate Features and Target

In [4]:
# The `year` column is the regression target; every other column is a feature.
y = df['year']
X = df.drop(columns=['year'])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Quick look at the target's range and spread before splitting.
print("\nTarget distribution:")
print(f"  Min year: {y.min()}, Max year: {y.max()}")
print(f"  Mean: {y.mean():.2f}, Std: {y.std():.2f}")

Features shape: (515131, 90)
Target shape: (515131,)

Target distribution:
  Min year: 1922, Max year: 2011
  Mean: 1998.40, Std: 10.93


## 4. Create Train/Validation/Test Splits FIRST

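**CRITICAL**: We split BEFORE fitting any data-dependent transformations (outlier bounds, scaling) to prevent data leakage! Exact-duplicate removal is the only step applied to the full dataset beforehand.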

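Using a **random split** so that all three sets follow a similar year distribution (about 76.5% train / 13.5% validation / 10% test). Caveat: a purely random split does not keep related rows together, so if the dataset's documentation prescribes a fixed train/test partition (for example, to keep songs by the same artist out of both sets), prefer that partition for the final evaluation.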

In [5]:

# Stage 1: hold out 10% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

# Stage 2: carve 15% of the remaining rows out as the validation set.
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, random_state=42, test_size=0.15)

# Report each split's size as a share of the de-duplicated dataset.
print("Data split completed (RANDOM SPLIT):")
for label, part in (('Training set', X_train), ('Validation set', X_val), ('Test set', X_test)):
    print(f"  {label}: {part.shape[0]} samples ({part.shape[0]/len(df)*100:.1f}%)")

# Compare target range/mean/std across splits; labels are padded so the columns line up.
print("\nTarget distribution (should be SIMILAR across all sets):")
for label, target in (('Train:', y_train), ('Val:', y_val), ('Test:', y_test)):
    print(f"  {label:<6} {target.min()}-{target.max()}, mean={target.mean():.1f}, std={target.std():.1f}")

Data split completed (RANDOM SPLIT):
  Training set: 394074 samples (76.5%)
  Validation set: 69543 samples (13.5%)
  Test set: 51514 samples (10.0%)

Target distribution (should be SIMILAR across all sets):
  Train: 1922-2011, mean=1998.4, std=10.9
  Val:   1925-2010, mean=1998.4, std=11.0
  Test:  1924-2010, mean=1998.5, std=10.8


In [6]:
# Three side-by-side panels, one per split, to compare the year distributions visually.
fig = make_subplots(rows=1, cols=3, subplot_titles=['Training Set', 'Validation Set', 'Test Set'])

fig.add_trace(go.Histogram(x=y_train, name='Train', nbinsx=50), row=1, col=1)
fig.add_trace(go.Histogram(x=y_val, name='Validation', nbinsx=50), row=1, col=2)
fig.add_trace(go.Histogram(x=y_test, name='Test', nbinsx=50), row=1, col=3)

fig.update_layout(
    title='Year Distribution Across Splits (Should be Similar!)', 
    template='plotly_white', 
    showlegend=False
)

# Create the output directory first so the export cannot fail on a fresh
# checkout where reports/figures/ does not exist yet.
os.makedirs('reports/figures', exist_ok=True)
fig.write_html('reports/figures/12_split_distribution.html')
fig.show()

## 5. Handle Outliers (Using Training Data Bounds ONLY)

**CRITICAL**: Calculate outlier bounds from TRAINING data only, then apply to all sets.

In [7]:
def get_outlier_bounds(train_data, multiplier=3.0):
    """Derive per-column IQR fences from the training data.

    Args:
        train_data: DataFrame whose columns are the features to bound.
        multiplier: How many IQRs beyond the quartiles the fences sit.

    Returns:
        Dict mapping each column name to ``{'lower': ..., 'upper': ...}``,
        where lower = Q1 - multiplier * IQR and upper = Q3 + multiplier * IQR.
    """
    def _fences(series):
        # First and third quartiles of this single column.
        q_low = series.quantile(0.25)
        q_high = series.quantile(0.75)
        spread = q_high - q_low
        return {
            'lower': q_low - multiplier * spread,
            'upper': q_high + multiplier * spread,
        }

    return {name: _fences(train_data[name]) for name in train_data.columns}

def apply_outlier_clipping(data, bounds):
    """Clip every column of ``data`` to previously computed bounds.

    Args:
        data: DataFrame to clip; it is not modified.
        bounds: Dict mapping each column name to a dict with 'lower' and
            'upper' entries (the structure returned by get_outlier_bounds).

    Returns:
        A copy of ``data`` with each column limited to its [lower, upper] range.
    """
    clipped = data.copy()
    for name in data.columns:
        limits = bounds[name]
        # Read from the untouched input and write into the copy.
        clipped[name] = data[name].clip(lower=limits['lower'], upper=limits['upper'])
    return clipped

print("Calculating outlier bounds from TRAINING data only...")
outlier_bounds = get_outlier_bounds(X_train, multiplier=3.0)

# The same training-derived bounds are applied to all three splits.
X_train_clipped, X_val_clipped, X_test_clipped = (
    apply_outlier_clipping(split, outlier_bounds)
    for split in (X_train, X_val, X_test)
)

# Show the overall min/max of the training features before and after clipping.
print("\nOutlier clipping applied (bounds from training data only)")
for stage, frame in (('Before:', X_train), ('After: ', X_train_clipped)):
    print(f"Train - {stage} [{frame.min().min():.2f}, {frame.max().max():.2f}]")

Calculating outlier bounds from TRAINING data only...

Outlier clipping applied (bounds from training data only)
Train - Before: [-14861.70, 65735.78]
Train - After:  [-2006.04, 8237.72]


## 6. Feature Scaling (Fit on Training Data ONLY)

In [8]:
# Learn the scaling statistics from the clipped training features only.
scaler = StandardScaler().fit(X_train_clipped)

# Apply the fitted scaler unchanged to each split.
X_train_scaled = scaler.transform(X_train_clipped)
X_val_scaled = scaler.transform(X_val_clipped)
X_test_scaled = scaler.transform(X_test_clipped)

print("StandardScaler applied (fit on training data only)")
print()
# Overall mean/std across all features of each scaled split.
for label, scaled in (
    ('Training set', X_train_scaled),
    ('Validation set', X_val_scaled),
    ('Test set', X_test_scaled),
):
    print(f"{label} - Mean: {scaled.mean():.6f}, Std: {scaled.std():.4f}")

StandardScaler applied (fit on training data only)

Training set - Mean: -0.000000, Std: 1.0000
Validation set - Mean: -0.000644, Std: 0.9961
Test set - Mean: -0.000932, Std: 0.9972


## 7. Save Processed Data

In [9]:
# Create the output directory first: np.save raises if data/splits/ is
# missing, which happens on a fresh checkout.
os.makedirs('data/splits', exist_ok=True)

# Scaled feature matrices for each split.
np.save('data/splits/X_train.npy', X_train_scaled)
np.save('data/splits/X_val.npy', X_val_scaled)
np.save('data/splits/X_test.npy', X_test_scaled)

# Targets are stored as plain arrays (the pandas index is dropped via .values).
np.save('data/splits/y_train.npy', y_train.values)
np.save('data/splits/y_val.npy', y_val.values)
np.save('data/splits/y_test.npy', y_test.values)

print("Scaled data saved to data/splits/")

Scaled data saved to data/splits/


In [10]:
# Both artifact directories may be missing on a fresh checkout; create them
# up front so none of the writes below fail part-way through the cell.
os.makedirs('models/ml', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)

# Persist the fitted scaler so later stages can apply the same transformation.
joblib.dump(scaler, 'models/ml/standard_scaler.joblib')
print("Scaler saved to models/ml/standard_scaler.joblib")

# Persist the training-derived clipping bounds alongside the scaler.
joblib.dump(outlier_bounds, 'models/ml/outlier_bounds.joblib')
print("Outlier bounds saved to models/ml/outlier_bounds.joblib")

# One feature name per line, in the column order of X.
feature_names = list(X.columns)
with open('data/processed/feature_names.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{name}\n" for name in feature_names)
print(f"Feature names saved ({len(feature_names)} features)")

Scaler saved to models/ml/standard_scaler.joblib
Outlier bounds saved to models/ml/outlier_bounds.joblib
Feature names saved (90 features)


In [11]:
# `df` was overwritten with the de-duplicated frame earlier in the notebook,
# so the raw row count is recovered by adding the removed duplicates back.
preprocessing_summary = {
    'Original samples': len(df) + duplicates,
    'After duplicate removal': len(df),
    'Duplicates removed': duplicates,
    'Train samples': len(X_train),
    'Validation samples': len(X_val),
    'Test samples': len(X_test),
    'Features': X.shape[1],
    'Outlier method': 'IQR clipping (bounds from TRAIN only)',
    'Scaling method': 'StandardScaler (fit on TRAIN only)',
    'Split method': 'Random (similar distribution across sets)',
    'Train year range': f"{y_train.min()}-{y_train.max()}",
    'Val year range': f"{y_val.min()}-{y_val.max()}",
    'Test year range': f"{y_test.min()}-{y_test.max()}"
}

# Two-column (Metric, Value) table, one row per summary entry.
summary_df = pd.DataFrame(list(preprocessing_summary.items()), columns=['Metric', 'Value'])

# Create the report directory first so to_csv cannot fail on a fresh checkout.
os.makedirs('reports/metrics', exist_ok=True)
summary_df.to_csv('reports/metrics/02_preprocessing_summary.csv', index=False)

print("\nPreprocessing Summary:")
for key, value in preprocessing_summary.items():
    print(f"  {key}: {value}")


Preprocessing Summary:
  Original samples: 515345
  After duplicate removal: 515131
  Duplicates removed: 214
  Train samples: 394074
  Validation samples: 69543
  Test samples: 51514
  Features: 90
  Outlier method: IQR clipping (bounds from TRAIN only)
  Scaling method: StandardScaler (fit on TRAIN only)
  Split method: Random (similar distribution across sets)
  Train year range: 1922-2011
  Val year range: 1925-2010
  Test year range: 1924-2010
