In [None]:
import dask.dataframe as dd
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.externals import joblib
from dask_ml.model_selection import RandomizedSearchCV, train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
import heapq
import os

# Load data using Dask
# Setup for reading from S3
import s3fs

# Credentials come from the environment so that no secret is stored in the
# notebook. Variables that are unset are left out of the options entirely.
# NOTE(review): with no explicit key/secret, s3fs is expected to fall back to
# its default credential lookup — confirm this suits your deployment.
storage_options = {
    option: os.environ[env_var]
    for option, env_var in (("key", "AWS_ACCESS_KEY_ID"), ("secret", "AWS_SECRET_ACCESS_KEY"))
    if os.environ.get(env_var)
}

# The same options feed both the filesystem handle and the Dask reader, so the
# two can never drift apart.
s3 = s3fs.S3FileSystem(anon=False, **storage_options)
s3_path = "s3://your-bucket-name/prefix"
df = dd.read_csv(s3_path + "/*.csv", storage_options=storage_options)

# Separate features and labels
X = df.drop('filter_age', axis=1)
y = df['filter_age']

# One seed shared by the stochastic steps below so that a fresh
# "Restart & Run All" reproduces the same split and the same tuning sample.
RANDOM_SEED = 1
SAMPLE_FRACTION = 0.05  # share of the training data used for the initial search

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=RANDOM_SEED)

# RBF Kernel approximation
rbf_feature = RBFSampler(gamma=1, random_state=RANDOM_SEED)

# Initialize the SGDClassifier for linear SVM
clf = SGDClassifier(max_iter=1000, tol=1e-3)

# Hyperparameter optimization using RandomizedSearchCV
param_dist = {'alpha': np.logspace(-4, 0, 5),
              'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge'],
              'penalty': ['l2', 'l1', 'elasticnet']}

random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=10, scoring='accuracy', n_jobs=-1, cv=3)

# Use a subset of data for initial hyperparameter tuning.
# Features and labels must be drawn with the SAME random_state: two independent
# draws would pair each feature row with the label of an unrelated row.
# NOTE(review): this relies on X_train and y_train sharing the same
# partitioning; the assertion below guards the result at the index level.
X_sample = X_train.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED).compute()
y_sample = y_train.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED).compute()
assert X_sample.index.equals(y_sample.index), "feature/label samples are misaligned"

# Fit the sampler once here; its projection is what later stages transform with.
X_sample_features = rbf_feature.fit_transform(X_sample)

# Initial hyperparameter search on the sample
random_search.fit(X_sample_features, y_sample)
best_hyperparameters = random_search.best_params_

# Initialize the SGDClassifier with the best found hyperparameters
clf = SGDClassifier(max_iter=1000, tol=1e-3, **best_hyperparameters)

# Min-heap of (accuracy, model_path) holding the best snapshots seen so far;
# the weakest kept snapshot is always at index 0.
MAX_KEPT_MODELS = 3
top_models = []

# partial_fit needs the complete label set, identical on every call. Deriving
# it from a single chunk breaks as soon as a chunk lacks one of the classes.
all_classes = np.unique(y_train.unique().compute())

# Train incrementally using Dask chunks
train_blocks = zip(X_train.to_dask_array(lengths=True).blocks, y_train.to_dask_array(lengths=True).blocks)
for chunk_idx, (X_chunk, y_chunk) in enumerate(train_blocks):
    X_chunk_np = X_chunk.compute()
    y_chunk_np = y_chunk.compute()

    # An empty partition carries no information and would make the estimator raise.
    if len(y_chunk_np) == 0:
        continue

    # Reuse the already-fitted RBF sampler: every chunk (and later the test
    # set) must be mapped through one and the same feature projection.
    X_features_chunk = rbf_feature.transform(X_chunk_np)

    # Incremental training using partial_fit
    clf.partial_fit(X_features_chunk, y_chunk_np, classes=all_classes)

    # Evaluate the model's performance on the current chunk.
    # NOTE(review): this scores the chunk the model was just trained on, so it
    # is an optimistic estimate — consider a held-out validation chunk.
    current_accuracy = accuracy_score(y_chunk_np, clf.predict(X_features_chunk))

    # Save the model if it's one of the top snapshots
    if len(top_models) < MAX_KEPT_MODELS or current_accuracy > top_models[0][0]:
        # Each snapshot gets its own file; a shared filename would let later
        # snapshots overwrite earlier ones. joblib.dump returns a list of
        # filenames, so the path is tracked explicitly instead.
        model_path = f"model_chunk_{chunk_idx}.pkl"
        joblib.dump(clf, model_path)
        heapq.heappush(top_models, (current_accuracy, model_path))
        if len(top_models) > MAX_KEPT_MODELS:
            _, evicted_path = heapq.heappop(top_models)
            os.remove(evicted_path)

if not top_models:
    raise RuntimeError("No model snapshot was saved: the training set produced no non-empty chunks.")

# Evaluate the best model on the reserved test set using chunks.
# The snapshot is loaded back from disk; only files written by the loop above
# are read here (joblib.load must never be pointed at untrusted files).
best_accuracy, best_model_path = heapq.nlargest(1, top_models, key=lambda entry: entry[0])[0]
final_model = joblib.load(best_model_path)
accuracies = []
chunk_sizes = []

for X_chunk, y_chunk in zip(X_test.to_dask_array(lengths=True).blocks, y_test.to_dask_array(lengths=True).blocks):
    X_chunk_np = X_chunk.compute()
    y_chunk_np = y_chunk.compute()

    if len(y_chunk_np) == 0:
        continue

    # Apply the RBF sampler transform
    X_features_chunk = rbf_feature.transform(X_chunk_np)

    y_pred_chunk = final_model.predict(X_features_chunk)
    accuracies.append(accuracy_score(y_chunk_np, y_pred_chunk))
    chunk_sizes.append(len(y_chunk_np))

# Weight each chunk by its row count so the result equals the accuracy over
# the whole test set; a plain mean would over-weight small chunks.
final_accuracy = np.average(accuracies, weights=chunk_sizes) if accuracies else float("nan")
print(f"Final Model Accuracy: {final_accuracy:.4f}")


In [None]:
Handling time-series data in machine learning, especially with models like SVMs that do not inherently model sequence order, requires careful feature engineering to capture the temporal nature of the data. Here is a series of steps and strategies you can consider:

1. **Segmentation**: If each "cycle of driving" for a car is considered as one sequence, then you need to ensure that each sequence (or segment) is treated as one data point. 

2. **Statistical Features**:
   - For each sequence (driving cycle), you can compute statistical features such as:
     - **Mean, Median**: Average value of features over the cycle.
     - **Standard Deviation, Variance**: Measure of the spread of the feature values.
     - **Min, Max**: Extreme values of the feature in the cycle.
     - **Skewness, Kurtosis**: Measure of the shape of the feature distribution.
     - **Percentiles**: e.g., 25th, 75th percentile can provide insights into the distribution tails.
   
3. **Trend Features**:
   - **Slope**: You can fit a linear regression model to each feature over the time sequence and use the slope as a feature.
   - **Rolling Features**: Compute rolling averages, rolling standard deviations, etc., with different windows (e.g., rolling mean over 10, 50, 100 time steps).

4. **Lagged Features**:
   - For each feature, you can create lagged features. For instance, you could use the value of 'Temperature' from one, two, or more time steps ago as new features. This helps encapsulate the temporal dependence.

5. **Frequency Domain Features**:
   - **FFT (Fast Fourier Transform)**: Convert the time-series data to its frequency components.
   - **Power Spectral Density**: Represents the distribution of power over frequency.

6. **Embeddings from Time-Series Models**:
   - Train models like autoencoders on your time-series data and use the embeddings (encoded features) as input features to your SVM.

7. **Window-based Features**:
   - Instead of treating the entire sequence as one data point, you can create overlapping windows (e.g., of length 50 or 100 time steps) and compute features for each window. This will significantly increase your data points but might give better insights into short-term patterns.

8. **Domain-Specific Features**: Depending on domain knowledge about cars and driving patterns, you might derive features that capture specific behaviors or patterns.

9. **Feature Scaling**: Time-series data, especially when transformed into multiple statistical features, can span different scales. Use Min-Max scaling or Z-score normalization to ensure features are on a similar scale, fitting the scaler on the training data only and reusing it unchanged on the test data.

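10. **Feature Selection**: With many generated features, there's a risk of overfitting. Use feature importance from tree-based models to rank and select the most relevant features, or use PCA to compress them into fewer components (note that PCA builds new combined features rather than selecting existing ones).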

11. **Temporal Train-Test Split**: Ensure that your training set precedes your test set in time to avoid lookahead bias.

Remember that SVMs, especially with non-linear kernels, can be computationally intensive. With the addition of many new features, the computational cost can increase significantly. Regularization, feature selection, and using approximations (like the one provided by `RBFSampler`) become even more crucial.

Incorporating these feature engineering techniques can help capture the temporal nature of the data and make it more amenable to models like SVMs.

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense

# Parameters — the single source of truth for every layer shape below.
input_sequence_length = 100  # e.g., 100 time steps in each driving cycle
n_features = 8  # e.g., 'Temperature', 'Speed', etc.
embedding_dim = 32  # Dimension of the embedding, can be tuned

# Define the sequence-to-sequence autoencoder.
# All shapes are derived from the parameters above, so changing a parameter
# updates encoder and decoder together instead of leaving stale literals.

# Encoder: compress each (input_sequence_length, n_features) sequence into one
# embedding_dim-sized vector (the last LSTM returns only its final state).
inputs = Input(shape=(input_sequence_length, n_features))
encoded = LSTM(128, return_sequences=True)(inputs)
encoded = LSTM(64, return_sequences=True)(encoded)
encoded = LSTM(embedding_dim)(encoded)

# Decoder: repeat the embedding once per time step and reconstruct the
# original features at every step.
decoded = RepeatVector(input_sequence_length)(encoded)
decoded = LSTM(64, return_sequences=True)(decoded)
decoded = LSTM(128, return_sequences=True)(decoded)
decoded = TimeDistributed(Dense(n_features, activation='linear'))(decoded)

# `autoencoder` is trained end to end; `encoder` shares its layers and exposes
# the embedding only.
autoencoder = Model(inputs, decoded)
encoder = Model(inputs, encoded)

from keras.callbacks import EarlyStopping, ModelCheckpoint

autoencoder.compile(optimizer='adam', loss='mse')

# Callbacks
# restore_best_weights=True rolls the in-memory model back to the epoch with
# the lowest val_loss. Without it, the encoder used below would keep the
# weights from the last epoch run, i.e. `patience` epochs past the best one.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# The best model is additionally persisted to disk for later reuse.
model_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss')

# X_train_timeseries must be defined before this point: training time-series
# data of shape (n_samples, input_sequence_length, n_features).
# Fail early with a readable message if it does not match the model input.
assert tuple(X_train_timeseries.shape[1:]) == tuple(autoencoder.input_shape[1:]), (
    f"X_train_timeseries has per-sample shape {tuple(X_train_timeseries.shape[1:])}, "
    f"but the autoencoder expects {tuple(autoencoder.input_shape[1:])}"
)

# The autoencoder learns to reproduce its own input, so the data serves as
# both input and target.
autoencoder.fit(X_train_timeseries, X_train_timeseries,
                epochs=100,
                batch_size=16,
                validation_split=0.2,
                callbacks=[early_stopping, model_checkpoint])

# Once trained, use the encoder to get embeddings for your SVM
X_train_embeddings = encoder.predict(X_train_timeseries)
